From 6b03e2e88ec09b9886c1bf0b177de80b65dc5af5 Mon Sep 17 00:00:00 2001 From: "Amir E. Aharoni" Date: Mon, 28 Sep 2015 13:26:08 +0300 Subject: [PATCH] Make the code for grammar data processing common This makes the code for processing JSON files with grammar transformations reusable by different languages and applies the same logic to Russian and Hebrew. It will be done to other languages in further patches. This patch is not supposed to change any functionality, and the tests are intact (except a comment in the test for Hebrew - the class doesn't exist any longer). PHP: * Move the JSON grammar transformation data processing logic from LanguageRu.php to convertGrammar() in Language.php. By default all these data files are supposed to be processed identically, so the code should be common. If there is no JSON data file, nothing new happens. * LanguageRu's own convertGrammar() method is removed. * The LanguageHe class is removed, now that all its functionality is handled by generic JSON data processing in the Language class. LanguageHe.php file is removed from the repo and from autoloading. JavaScript: * Move the JSON grammar transformation data processing logic from ru.js to mediawiki.language.js. * JavaScript grammar code files he.js and ru.js are removed from the repo and from Resources.php, because all the data is in JSON, and the default logic in mediawiki.language.js works for both languages. Bug: T115217 Change-Id: I5e75467121c3d791bb84f9e6fdfcf07c1840f81a --- autoload.php | 1 - languages/Language.php | 37 ++++++++++ languages/classes/LanguageHe.php | 70 ------------------- languages/classes/LanguageRu.php | 39 ----------- languages/data/grammarTransformations/he.json | 26 +++++++ resources/Resources.php | 2 - .../src/mediawiki.language/languages/he.js | 29 -------- .../src/mediawiki.language/languages/ru.js | 38 ---------- .../mediawiki.language/mediawiki.language.js | 45 ++++++++++-- .../languages/classes/LanguageHeTest.php | 2 +- 10 files changed, 105 insertions(+), 184 deletions(-) delete mode 100644 languages/classes/LanguageHe.php create mode 100644 languages/data/grammarTransformations/he.json delete mode 100644 resources/src/mediawiki.language/languages/he.js delete mode 100644 resources/src/mediawiki.language/languages/ru.js diff --git a/autoload.php b/autoload.php index 6dbcc1d6bd..e1b808a640 100644 --- a/autoload.php +++ b/autoload.php @@ -695,7 +695,6 @@ $wgAutoloadLocalClasses = [ 'LanguageFi' => __DIR__ . '/languages/classes/LanguageFi.php', 'LanguageGa' => __DIR__ . '/languages/classes/LanguageGa.php', 'LanguageGan' => __DIR__ . '/languages/classes/LanguageGan.php', - 'LanguageHe' => __DIR__ . '/languages/classes/LanguageHe.php', 'LanguageHsb' => __DIR__ . '/languages/classes/LanguageHsb.php', 'LanguageHu' => __DIR__ . '/languages/classes/LanguageHu.php', 'LanguageHy' => __DIR__ . '/languages/classes/LanguageHy.php', diff --git a/languages/Language.php b/languages/Language.php index bc5ab7e727..ac8d4cb1d3 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -3737,6 +3737,43 @@ class Language { return $wgGrammarForms[$this->getCode()][$case][$word]; } + $grammarTransformations = $this->getGrammarTransformations(); + + if ( isset( $grammarTransformations[$case] ) ) { + $forms = $grammarTransformations[$case]; + + // Some names of grammar rules are aliases for other rules. + // In such cases the value is a string rather than object, + // so load the actual rules. + if ( is_string( $forms ) ) { + $forms = $grammarTransformations[$forms]; + } + + foreach ( array_values( $forms ) as $rule ) { + $form = $rule[0]; + + if ( $form === '@metadata' ) { + continue; + } + + $replacement = $rule[1]; + + $regex = '/' . addcslashes( $form, '/' ) . '/u'; + $patternMatches = preg_match( $regex, $word ); + + if ( $patternMatches === false ) { + wfLogWarning( + 'An error occurred while processing grammar. ' . + "Word: '$word'. Regex: /$form/." + ); + } elseif ( $patternMatches === 1 ) { + $word = preg_replace( $regex, $replacement, $word ); + + break; + } + } + } + return $word; } diff --git a/languages/classes/LanguageHe.php b/languages/classes/LanguageHe.php deleted file mode 100644 index a6aaf6dbc9..0000000000 --- a/languages/classes/LanguageHe.php +++ /dev/null @@ -1,70 +0,0 @@ - "ת" ) { - $word = "־" . $word; - } - } - - return $word; - } -} diff --git a/languages/classes/LanguageRu.php b/languages/classes/LanguageRu.php index 62de39051d..7b15721a57 100644 --- a/languages/classes/LanguageRu.php +++ b/languages/classes/LanguageRu.php @@ -31,45 +31,6 @@ * @ingroup Language */ class LanguageRu extends Language { - /** - * Convert from the nominative form of a noun to some other case - * Invoked with {{grammar:case|word}} - * - * @param string $word - * @param string $case - * @return string - */ - function convertGrammar( $word, $case ) { - global $wgGrammarForms; - if ( isset( $wgGrammarForms['ru'][$case][$word] ) ) { - return $wgGrammarForms['ru'][$case][$word]; - } - - $grammarTransformations = $this->getGrammarTransformations(); - - if ( isset( $grammarTransformations[$case] ) ) { - foreach ( array_values( $grammarTransformations[$case] ) as $rule ) { - $form = $rule[0]; - - if ( $form === '@metadata' ) { - continue; - } - - $replacement = $rule[1]; - - $regex = "/$form/"; - - if ( preg_match( $regex, $word ) ) { - $word = preg_replace( $regex, $replacement, $word ); - - break; - } - } - } - - return $word; - } - /** * Four-digit number should be without group commas (spaces) * See manual of style at https://ru.wikipedia.org/wiki/Википедия:Оформление_статей diff --git a/languages/data/grammarTransformations/he.json b/languages/data/grammarTransformations/he.json new file mode 100644 index 0000000000..50620b179c --- /dev/null +++ b/languages/data/grammarTransformations/he.json @@ -0,0 +1,26 @@ +{ + "@metadata": { + "authors": [ + "Rotem Liss", + "Amir E. Aharoni (amir.aharoni@mail.huji.ac.il)" + ] + }, + "prefixed": "תחילית", + "תחילית": [ + [ "@metadata", [ + "comment", "הכפלת ו, מחיקת ה הידיעה, הוספת מקף" + ] ], + [ + "^(ו[^ו].+)$", + "ו$1" + ], + [ + "ה(.+)$", + "$1" + ], + [ + "^([^א-ת].+)$", + "־$1" + ] + ] +} diff --git a/resources/Resources.php b/resources/Resources.php index 4932a2949b..d1dd6681f7 100644 --- a/resources/Resources.php +++ b/resources/Resources.php @@ -1592,13 +1592,11 @@ return [ 'dsb' => 'resources/src/mediawiki.language/languages/dsb.js', 'fi' => 'resources/src/mediawiki.language/languages/fi.js', 'ga' => 'resources/src/mediawiki.language/languages/ga.js', - 'he' => 'resources/src/mediawiki.language/languages/he.js', 'hsb' => 'resources/src/mediawiki.language/languages/hsb.js', 'hu' => 'resources/src/mediawiki.language/languages/hu.js', 'hy' => 'resources/src/mediawiki.language/languages/hy.js', 'la' => 'resources/src/mediawiki.language/languages/la.js', 'os' => 'resources/src/mediawiki.language/languages/os.js', - 'ru' => 'resources/src/mediawiki.language/languages/ru.js', 'sl' => 'resources/src/mediawiki.language/languages/sl.js', 'uk' => 'resources/src/mediawiki.language/languages/uk.js', ], diff --git a/resources/src/mediawiki.language/languages/he.js b/resources/src/mediawiki.language/languages/he.js deleted file mode 100644 index 5bf8c4df91..0000000000 --- a/resources/src/mediawiki.language/languages/he.js +++ /dev/null @@ -1,29 +0,0 @@ -/*! - * Hebrew (עברית) language functions - */ - -mediaWiki.language.convertGrammar = function ( word, form ) { - var grammarForms = mediaWiki.language.getData( 'he', 'grammarForms' ); - if ( grammarForms && grammarForms[ form ] ) { - return grammarForms[ form ][ word ]; - } - switch ( form ) { - case 'prefixed': - case 'תחילית': // the same word in Hebrew - // Duplicate prefixed "Waw", but only if it's not already double - if ( word.slice( 0, 1 ) === 'ו' && word.slice( 0, 2 ) !== 'וו' ) { - word = 'ו' + word; - } - - // Remove the "He" if prefixed - if ( word.slice( 0, 1 ) === 'ה' ) { - word = word.slice( 1 ); - } - - // Add a hyphen (maqaf) before numbers and non-Hebrew letters - if ( word.slice( 0, 1 ) < 'א' || word.slice( 0, 1 ) > 'ת' ) { - word = '־' + word; - } - } - return word; -}; diff --git a/resources/src/mediawiki.language/languages/ru.js b/resources/src/mediawiki.language/languages/ru.js deleted file mode 100644 index 09d7c0b518..0000000000 --- a/resources/src/mediawiki.language/languages/ru.js +++ /dev/null @@ -1,38 +0,0 @@ -/*! - * Russian (Русский) language functions - */ - -mediaWiki.language.convertGrammar = function ( word, form ) { - 'use strict'; - - var forms, transformations, i, rule, sourcePattern, regexp, replacement; - - forms = mediaWiki.language.getData( 'ru', 'grammarForms' ); - if ( forms && forms[ form ] ) { - return forms[ form ][ word ]; - } - - transformations = mediaWiki.language.getData( 'ru', 'grammarTransformations' ); - - if ( !transformations[ form ] ) { - return word; - } - - for ( i = 0; i < transformations[ form ].length; i++ ) { - rule = transformations[ form ][ i ]; - sourcePattern = rule[ 0 ]; - - if ( sourcePattern === '@metadata' ) { - continue; - } - - regexp = new RegExp( sourcePattern ); - replacement = rule[ 1 ]; - - if ( word.match( regexp ) ) { - return word.replace( regexp, replacement ); - } - } - - return word; -}; diff --git a/resources/src/mediawiki.language/mediawiki.language.js b/resources/src/mediawiki.language/mediawiki.language.js index fc2af3d0d8..3726a68527 100644 --- a/resources/src/mediawiki.language/mediawiki.language.js +++ b/resources/src/mediawiki.language/mediawiki.language.js @@ -109,7 +109,7 @@ /** * Grammatical transformations, needed for inflected languages. - * Invoked by putting `{{grammar:form|word}}` in a message. + * Invoked by putting `{{grammar:case|word}}` in a message. * * The rules can be defined in $wgGrammarForms global or computed * dynamically by overriding this method per language. @@ -119,10 +119,47 @@ * @return {string} */ convertGrammar: function ( word, form ) { - var grammarForms = mw.language.getData( mw.config.get( 'wgUserLanguage' ), 'grammarForms' ); - if ( grammarForms && grammarForms[ form ] ) { - return grammarForms[ form ][ word ] || word; + var userLanguage, forms, transformations, + patterns, i, rule, sourcePattern, regexp, replacement; + + userLanguage = mw.config.get( 'wgUserLanguage' ); + + forms = mw.language.getData( userLanguage, 'grammarForms' ); + if ( forms && forms[ form ] ) { + return forms[ form ][ word ]; + } + + transformations = mediaWiki.language.getData( userLanguage, 'grammarTransformations' ); + + if ( !( transformations && transformations[ form ] ) ) { + return word; + } + + patterns = transformations[ form ]; + + // Some names of grammar rules are aliases for other rules. + // In such cases the value is a string rather than object, + // so load the actual rules. + if ( typeof patterns === 'string' ) { + patterns = transformations[ patterns ]; } + + for ( i = 0; i < patterns.length; i++ ) { + rule = patterns[ i ]; + sourcePattern = rule[ 0 ]; + + if ( sourcePattern === '@metadata' ) { + continue; + } + + regexp = new RegExp( sourcePattern ); + replacement = rule[ 1 ]; + + if ( word.match( regexp ) ) { + return word.replace( regexp, replacement ); + } + } + return word; }, diff --git a/tests/phpunit/languages/classes/LanguageHeTest.php b/tests/phpunit/languages/classes/LanguageHeTest.php index 771cda5ea3..c1b774af51 100644 --- a/tests/phpunit/languages/classes/LanguageHeTest.php +++ b/tests/phpunit/languages/classes/LanguageHeTest.php @@ -5,7 +5,7 @@ * @file */ -/** Tests for MediaWiki languages/classes/LanguageHe.php */ +/** Tests for MediaWiki Hebrew grammar transformation handling */ class LanguageHeTest extends LanguageClassesTestCase { /** * The most common usage for the plural forms is two forms, -- 2.20.1