From: Amir E. Aharoni Date: Sat, 26 Sep 2015 20:28:03 +0000 (+0300) Subject: Make grammar data loadable as an RL module and usable in JS X-Git-Tag: 1.31.0-rc.0~5052^2 X-Git-Url: https://git.cyclocoop.org/%7B%24www_url%7Dadmin/compta/exercices/journal.php?a=commitdiff_plain;h=df5a848de8fcc4a5bf4046ec83a8b0f10b6f8a2d;p=lhc%2Fweb%2Fwiklou.git Make grammar data loadable as an RL module and usable in JS * Load the data of this variable from a JSON file to the same data structure that ResourceLoader uses for digitTransformTable, pluralRules, etc. * Change the JSON structure to ensure the order of the rules. Otherwise JavaScript processes the keys in a random order. * Delete the grammar code from JS and replace it with the same logic that is used in PHP for processing the data. For now this is done only for Russian. The next step will be to make the PHP and JS data processing logic reusable. Bug: T115217 Change-Id: I6b9b29b7017f958d62611671be017f97cee73415 --- diff --git a/includes/resourceloader/ResourceLoaderLanguageDataModule.php b/includes/resourceloader/ResourceLoaderLanguageDataModule.php index 1630269101..ef942faf20 100644 --- a/includes/resourceloader/ResourceLoaderLanguageDataModule.php +++ b/includes/resourceloader/ResourceLoaderLanguageDataModule.php @@ -41,6 +41,7 @@ class ResourceLoaderLanguageDataModule extends ResourceLoaderModule { 'digitTransformTable' => $language->digitTransformTable(), 'separatorTransformTable' => $language->separatorTransformTable(), 'grammarForms' => $language->getGrammarForms(), + 'grammarTransformations' => $language->getGrammarTransformations(), 'pluralRules' => $language->getPluralRules(), 'digitGroupingPattern' => $language->digitGroupingPattern(), 'fallbackLanguages' => $language->getFallbackLanguages(), diff --git a/languages/Language.php b/languages/Language.php index 7ef2effb98..4628812bab 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -137,6 +137,12 @@ class Language { */ static private $fallbackLanguageCache = []; + /** + * Cache for grammar rules data + * @var MapCacheLRU|null + */ + static private $grammarTransformations; + /** * Cache for language names * @var HashBagOStuff|null @@ -3730,6 +3736,7 @@ class Language { return $word; } + /** * Get the grammar forms for the content language * @return array Array of grammar forms @@ -3745,6 +3752,45 @@ class Language { return []; } + + /** + * Get the grammar transformations data for the language. + * Used like grammar forms, with {{GRAMMAR}} and cases, + * but uses pairs of regexes and replacements instead of code. + * + * @return array[] Array of grammar transformations. + * @since 1.28 + */ + public function getGrammarTransformations() { + $languageCode = $this->getCode(); + + if ( self::$grammarTransformations === null ) { + self::$grammarTransformations = new MapCacheLRU( 10 ); + } + + if ( self::$grammarTransformations->has( $languageCode ) ) { + return self::$grammarTransformations->get( $languageCode ); + } + + $data = []; + + $grammarDataFile = __DIR__ . "/data/grammarTransformations/$languageCode.json"; + if ( is_readable( $grammarDataFile ) ) { + $data = FormatJson::decode( + file_get_contents( $grammarDataFile ), + true + ); + if ( $data === null ) { + throw new MWException( "Invalid grammar data for \"$languageCode\"." ); + $data = []; + } + + self::$grammarTransformations->set( $languageCode, $data ); + } + + return $data; + } + /** * Provides an alternative text depending on specified gender. * Usage {{gender:username|masculine|feminine|unknown}}. diff --git a/languages/classes/LanguageRu.php b/languages/classes/LanguageRu.php index c2560a468b..62de39051d 100644 --- a/languages/classes/LanguageRu.php +++ b/languages/classes/LanguageRu.php @@ -31,7 +31,6 @@ * @ingroup Language */ class LanguageRu extends Language { - /** * Convert from the nominative form of a noun to some other case * Invoked with {{grammar:case|word}} @@ -46,19 +45,22 @@ class LanguageRu extends Language { return $wgGrammarForms['ru'][$case][$word]; } - $grammarDataFile = __DIR__ . '/data/grammar.ru.json'; - $grammarData = FormatJson::decode( file_get_contents( $grammarDataFile ), true ); + $grammarTransformations = $this->getGrammarTransformations(); + + if ( isset( $grammarTransformations[$case] ) ) { + foreach ( array_values( $grammarTransformations[$case] ) as $rule ) { + $form = $rule[0]; - if ( array_key_exists( $case, $grammarData ) ) { - foreach ( array_keys( $grammarData[$case] ) as $form ) { if ( $form === '@metadata' ) { continue; } + $replacement = $rule[1]; + $regex = "/$form/"; if ( preg_match( $regex, $word ) ) { - $word = preg_replace( $regex, $grammarData[$case][$form], $word ); + $word = preg_replace( $regex, $replacement, $word ); break; } diff --git a/languages/classes/data/grammar.ru.json b/languages/classes/data/grammar.ru.json deleted file mode 100644 index 446163b7a6..0000000000 --- a/languages/classes/data/grammar.ru.json +++ /dev/null @@ -1,51 +0,0 @@ -{ - "@metadata": { - "authors": [ - "Alexander Sigachov (alexander.sigachov at Googgle Mail)", - "Amir E. Aharoni (amir.aharoni@mail.huji.ac.il)" - ], - "comment": "These rules don't cover the whole grammar of the language, and are intended only for names of languages and Wikimedia projects." - }, - "genitive": { - "(.+)ь$": "$1я", - "(.+)ия$": "$1ии", - "(.+)ка$": "$1ки", - "(.+)ти$": "$1тей", - "(.+)ды$": "$1дов", - "(.+)д$": "$1да", - "(.+)ник$": "$1ника", - "(.+)ные$": "$1ных" - }, - "prepositional": { - "(.+)ь$": "$1е", - "(.+)ия$": "$1ии", - "(.+)ка$": "$1ке", - "(.+)ти$": "$1тях", - "(.+)ды$": "$1дах", - "(.+)д$": "$1де", - "(.+)ник$": "$1нике", - "(.+)ные$": "$1ных" - }, - "languagegen": { - "@metadata": "язык в родительном падеже: '(с) русского'", - "(.+)кий$": "$1кого", - "иврит$": "иврита", - "идиш$": "идиша", - "(.+)$": "$1" - }, - "languageprep": { - "@metadata": "язык в предложном падеже: '(на) русском'", - "(.+)кий$": "$1ком", - "иврит$": "иврите", - "идиш$": "идише", - "(.+)$": "$1" - }, - "languageadverb": { - "@metadata": "наречие с названием языка: 'по-русски'", - "(.+)кий$": "по-$1ки", - "иврит$": "на иврите", - "идиш$": "на идише", - "(идо|урду|хинди|эсперанто)$": "на $1", - "(.+)$": "на языке $1" - } -} diff --git a/languages/data/grammarTransformations/ru.json b/languages/data/grammarTransformations/ru.json new file mode 100644 index 0000000000..deb58b7ef8 --- /dev/null +++ b/languages/data/grammarTransformations/ru.json @@ -0,0 +1,57 @@ +{ + "@metadata": { + "authors": [ + "Alexander Sigachov (alexander.sigachov at Googgle Mail)", + "Amir E. Aharoni (amir.aharoni@mail.huji.ac.il)" + ], + "comment": "These rules don't cover the whole grammar of the language, and are intended only for names of languages and Wikimedia projects." + }, + "genitive": [ + [ "(.+)ь$", "$1я" ], + [ "(.+)ия$", "$1ии" ], + [ "(.+)ка$", "$1ки" ], + [ "(.+)ти$", "$1тей" ], + [ "(.+)ды$", "$1дов" ], + [ "(.+)д$", "$1да" ], + [ "(.+)ник$", "$1ника" ], + [ "(.+)ные$", "$1ных" ] + ], + "prepositional": [ + [ "(.+)ь$", "$1е" ], + [ "(.+)ия$", "$1ии" ], + [ "(.+)ка$", "$1ке" ], + [ "(.+)ти$", "$1тях" ], + [ "(.+)ды$", "$1дах" ], + [ "(.+)д$", "$1де" ], + [ "(.+)ник$", "$1нике" ], + [ "(.+)ные$", "$1ных" ] + ], + "languagegen": [ + [ "@metadata", [ + "comment", "язык в родительном падеже: '(с) русского'" + ] ], + [ "(.+)кий$", "$1кого" ], + [ "иврит$", "иврита" ], + [ "идиш$", "идиша" ], + [ "(.+)$", "$1" ] + ], + "languageprep": [ + [ "@metadata", [ + "comment", "язык в предложном падеже: '(на) русском'" + ] ], + [ "(.+)кий$", "$1ком" ], + [ "иврит$", "иврите" ], + [ "идиш$", "идише" ], + [ "(.+)$", "$1" ] + ], + "languageadverb": [ + [ "@metadata", [ + "comment", "наречие с названием языка: 'по-русски'" + ] ], + [ "(.+)кий$", "по-$1ки" ], + [ "иврит$", "на иврите" ], + [ "идиш$", "на идише" ], + [ "(идо|урду|хинди|эсперанто)$", "на $1" ], + [ "(.+)$", "на языке $1" ] + ] +} diff --git a/resources/src/mediawiki.language/languages/ru.js b/resources/src/mediawiki.language/languages/ru.js index ccc68f1e47..09d7c0b518 100644 --- a/resources/src/mediawiki.language/languages/ru.js +++ b/resources/src/mediawiki.language/languages/ru.js @@ -2,82 +2,37 @@ * Russian (Русский) language functions */ -// These tests were originally made for names of Wikimedia -// websites, so they don't currently cover all the possible -// cases. - mediaWiki.language.convertGrammar = function ( word, form ) { - /*global $ */ 'use strict'; - var grammarForms = mediaWiki.language.getData( 'ru', 'grammarForms' ); - if ( grammarForms && grammarForms[ form ] ) { - return grammarForms[ form ][ word ]; + var forms, transformations, i, rule, sourcePattern, regexp, replacement; + + forms = mediaWiki.language.getData( 'ru', 'grammarForms' ); + if ( forms && forms[ form ] ) { + return forms[ form ][ word ]; + } + + transformations = mediaWiki.language.getData( 'ru', 'grammarTransformations' ); + + if ( !transformations[ form ] ) { + return word; } - switch ( form ) { - case 'genitive': // родительный падеж - if ( word.slice( -1 ) === 'ь' ) { - word = word.slice( 0, -1 ) + 'я'; - } else if ( word.slice( -2 ) === 'ия' ) { - word = word.slice( 0, -2 ) + 'ии'; - } else if ( word.slice( -2 ) === 'ка' ) { - word = word.slice( 0, -2 ) + 'ки'; - } else if ( word.slice( -2 ) === 'ти' ) { - word = word.slice( 0, -2 ) + 'тей'; - } else if ( word.slice( -2 ) === 'ды' ) { - word = word.slice( 0, -2 ) + 'дов'; - } else if ( word.slice( -1 ) === 'д' ) { - word = word.slice( 0, -1 ) + 'да'; - } else if ( word.slice( -3 ) === 'ные' ) { - word = word.slice( 0, -3 ) + 'ных'; - } else if ( word.slice( -3 ) === 'ник' ) { - word = word.slice( 0, -3 ) + 'ника'; - } - break; - case 'prepositional': // предложный падеж - if ( word.slice( -1 ) === 'ь' ) { - word = word.slice( 0, -1 ) + 'е'; - } else if ( word.slice( -2 ) === 'ия' ) { - word = word.slice( 0, -2 ) + 'ии'; - } else if ( word.slice( -2 ) === 'ка' ) { - word = word.slice( 0, -2 ) + 'ке'; - } else if ( word.slice( -2 ) === 'ти' ) { - word = word.slice( 0, -2 ) + 'тях'; - } else if ( word.slice( -2 ) === 'ды' ) { - word = word.slice( 0, -2 ) + 'дах'; - } else if ( word.slice( -1 ) === 'д' ) { - word = word.slice( 0, -1 ) + 'де'; - } else if ( word.slice( -3 ) === 'ные' ) { - word = word.slice( 0, -3 ) + 'ных'; - } else if ( word.slice( -3 ) === 'ник' ) { - word = word.slice( 0, -3 ) + 'нике'; - } - break; - case 'languagegen': // язык в родительном падеже ("(с) русского") - if ( word.slice( -3 ) === 'кий' ) { - word = word.slice( 0, -2 ) + 'ого'; - } else if ( $.inArray( word, [ 'иврит', 'идиш' ] ) > -1 ) { - word = word + 'а'; - } - break; - case 'languageprep': // язык в предложном падеже ("(на) русском") - if ( word.slice( -3 ) === 'кий' ) { - word = word.slice( 0, -2 ) + 'ом'; - } else if ( $.inArray( word, [ 'иврит', 'идиш' ] ) > -1 ) { - word = word + 'е'; - } - break; - case 'languageadverb': // наречие с названием языка ("по-русски") - if ( word.slice( -3 ) === 'кий' ) { - word = 'по-' + word.slice( 0, -1 ); - } else if ( $.inArray( word, [ 'иврит', 'идиш' ] ) > -1 ) { - word = 'на ' + word + 'е'; - } else if ( $.inArray( word, [ 'идо', 'урду', 'хинди', 'эсперанто' ] ) > -1 ) { - word = 'на ' + word; - } else { - word = 'на языке ' + word; - } - break; + + for ( i = 0; i < transformations[ form ].length; i++ ) { + rule = transformations[ form ][ i ]; + sourcePattern = rule[ 0 ]; + + if ( sourcePattern === '@metadata' ) { + continue; + } + + regexp = new RegExp( sourcePattern ); + replacement = rule[ 1 ]; + + if ( word.match( regexp ) ) { + return word.replace( regexp, replacement ); + } } + return word; };