Convert Russian grammar cases from PHP to JSON
author Amir E. Aharoni <amir.aharoni@mail.huji.ac.il>
Mon, 21 Sep 2015 19:57:08 +0000 (22:57 +0300)
committer Amire80 <amir.aharoni@mail.huji.ac.il>
Mon, 28 Sep 2015 14:24:01 +0000 (14:24 +0000)
This is a follow-up to
Ib6a0afa5c3736f8b9b2e121cd752c53ee50fad75

The PHP logic for grammatical cases in Russian was growing.
It was too long and not reusable in JavaScript.

This patch moves all the logic to a JSON file,
indexed by the grammatical case name and then
by regular expressions that match the different
word classes, with the values being the replacements
that should be compatible with common regular expression
replacement functions in modern programming languages.

This patch doesn't introduce any functional changes
and doesn't change any tests.

The next steps, not necessarily in this order, are:
* Make it work also with JavaScript.
* Make JSON grammar data files loadable with ResourceLoader.
* Convert most or all grammar rules for all the languages to JSON.
* Make the data processing loop generic for all languages.
* Convert it also in jquery.i18n (Milkshake).
* Convert the test cases data from code to generic data.
* Move the JSON data to a separate reusable repository.

Change-Id: I0e8e1bfb9d3ec9f841f733356af32dad7d130e94

languages/classes/LanguageRu.php
languages/classes/data/grammar.ru.json [new file with mode: 0644]

index f50640a..60384a8 100644 (file)
@@ -46,128 +46,22 @@ class LanguageRu extends Language {
                        return $wgGrammarForms['ru'][$case][$word];
                }
 
-               # These rules don't cover the whole language, and are intended only for
-               # names of languages and Wikimedia sites.
+               $grammarDataFile = __DIR__ . '/data/grammar.ru.json';
+               $grammarData = FormatJson::decode( file_get_contents( $grammarDataFile ), true );
 
-               # substr doesn't support Unicode and mb_substr has issues,
-               # so break it to characters using preg_match_all and then use array_slice and join
-               $chars = array();
-               preg_match_all( '/./us', $word, $chars );
-               if ( !preg_match( "/[a-zA-Z_]/us", $word ) ) {
-                       switch ( $case ) {
-                               case 'genitive': # родительный падеж
-                                       if ( join( '', array_slice( $chars[0], -1 ) ) === 'ь' ) {
-                                               $word = join( '', array_slice( $chars[0], 0, -1 ) ) . 'я';
-                                       } elseif ( join( '', array_slice( $chars[0], -2 ) ) === 'ия' ) {
-                                               $word = join( '', array_slice( $chars[0], 0, -2 ) ) . 'ии';
-                                       } elseif ( join( '', array_slice( $chars[0], -2 ) ) === 'ка' ) {
-                                               $word = join( '', array_slice( $chars[0], 0, -2 ) ) . 'ки';
-                                       } elseif ( join( '', array_slice( $chars[0], -2 ) ) === 'ти' ) {
-                                               $word = join( '', array_slice( $chars[0], 0, -2 ) ) . 'тей';
-                                       } elseif ( join( '', array_slice( $chars[0], -2 ) ) === 'ды' ) {
-                                               $word = join( '', array_slice( $chars[0], 0, -2 ) ) . 'дов';
-                                       } elseif ( join( '', array_slice( $chars[0], -1 ) ) === 'д' ) {
-                                               $word = join( '', array_slice( $chars[0], 0, -1 ) ) . 'да';
-                                       } elseif ( join( '', array_slice( $chars[0], -3 ) ) === 'ник' ) {
-                                               $word = join( '', array_slice( $chars[0], 0, -3 ) ) . 'ника';
-                                       } elseif ( join( '', array_slice( $chars[0], -3 ) ) === 'ные' ) {
-                                               $word = join( '', array_slice( $chars[0], 0, -3 ) ) . 'ных';
-                                       }
-                                       break;
-                               case 'dative': # дательный падеж
-                                       # stub
-                                       break;
-                               case 'accusative': # винительный падеж
-                                       # stub
-                                       break;
-                               case 'instrumental': # творительный падеж
-                                       # stub
-                                       break;
-                               case 'prepositional': # предложный падеж
-                                       if ( join( '', array_slice( $chars[0], -1 ) ) === 'ь' ) {
-                                               $word = join( '', array_slice( $chars[0], 0, -1 ) ) . 'е';
-                                       } elseif ( join( '', array_slice( $chars[0], -2 ) ) === 'ия' ) {
-                                               $word = join( '', array_slice( $chars[0], 0, -2 ) ) . 'ии';
-                                       } elseif ( join( '', array_slice( $chars[0], -2 ) ) === 'ка' ) {
-                                               $word = join( '', array_slice( $chars[0], 0, -2 ) ) . 'ке';
-                                       } elseif ( join( '', array_slice( $chars[0], -2 ) ) === 'ти' ) {
-                                               $word = join( '', array_slice( $chars[0], 0, -2 ) ) . 'тях';
-                                       } elseif ( join( '', array_slice( $chars[0], -2 ) ) === 'ды' ) {
-                                               $word = join( '', array_slice( $chars[0], 0, -2 ) ) . 'дах';
-                                       } elseif ( join( '', array_slice( $chars[0], -1 ) ) === 'д' ) {
-                                               $word = join( '', array_slice( $chars[0], 0, -1 ) ) . 'де';
-                                       } elseif ( join( '', array_slice( $chars[0], -3 ) ) === 'ник' ) {
-                                               $word = join( '', array_slice( $chars[0], 0, -3 ) ) . 'нике';
-                                       } elseif ( join( '', array_slice( $chars[0], -3 ) ) === 'ные' ) {
-                                               $word = join( '', array_slice( $chars[0], 0, -3 ) ) . 'ных';
-                                       }
-                                       break;
-                               case 'languagegen': # язык в родительном падеже ("(с) русского")
-                                       $suffix = join( '', array_slice( $chars[0], -3 ) );
-                                       if ( $suffix === 'кий' ) {
-                                               $word = join(
-                                                       '',
-                                                       array_slice( $chars[0], 0, count( $chars[0] ) - 2 )
-                                               ) . 'ого';
-
-                                               break;
-                                       }
-
-                                       if ( in_array( $word, array( 'иврит', 'идиш' ) ) ) {
-                                               $word = $word . 'а';
-
-                                               break;
-                                       }
-
-                                       break;
-                               case 'languageprep': # язык в предложном падеже ("(на) русском")
-                                       $suffix = join( '', array_slice( $chars[0], -3 ) );
-                                       if ( $suffix === 'кий' ) {
-                                               $word = join(
-                                                       '',
-                                                       array_slice( $chars[0], 0, count( $chars[0] ) - 2 )
-                                               ) . 'ом';
-
-                                               break;
-                                       }
-
-                                       if ( in_array( $word, array( 'иврит', 'идиш' ) ) ) {
-                                               $word = $word . 'е';
-
-                                               break;
-                                       }
-
-                                       break;
-                               case 'languageadverb': # наречие с названием языка ("по-русски")
-                                       $suffix = join( '', array_slice( $chars[0], -3 ) );
-                                       if ( $suffix === 'кий' ) {
-                                               $word = 'по-' . join(
-                                                       '',
-                                                       array_slice( $chars[0], 0, count( $chars[0] ) - 1 )
-                                               );
-
-                                               break;
-                                       }
-
-                                       if ( in_array( $word, array( 'иврит', 'идиш' ) ) ) {
-                                               $word = 'на ' . $word . 'е';
-
-                                               break;
-                                       }
-
-                                       // Known particular cases of undeclinable names
-                                       // Известные несклоняемые
-                                       if ( in_array( $word, array( 'идо', 'урду', 'хинди', 'эсперанто' ) ) ) {
-                                               $word = "на $word";
+               if ( array_key_exists( $case, $grammarData ) ) {
+                       foreach ( array_keys( $grammarData[$case] ) as $form ) {
+                               if ( $form === '@metadata' ) {
+                                       continue;
+                               }
 
-                                               break;
-                                       }
+                               $regex = "/$form/";
 
-                                       // Undeclinable
-                                       // Остальные несклоняемые
-                                       $word = "на языке $word";
+                               if ( preg_match( $regex, $word ) ) {
+                                       $word = preg_replace( $regex, $grammarData[$case][$form], $word );
 
                                        break;
+                               }
                        }
                }
 
diff --git a/languages/classes/data/grammar.ru.json b/languages/classes/data/grammar.ru.json
new file mode 100644 (file)
index 0000000..446163b
--- /dev/null
@@ -0,0 +1,51 @@
+{
+       "@metadata": {
+               "authors": [
+                       "Alexander Sigachov (alexander.sigachov at Googgle Mail)",
+                       "Amir E. Aharoni (amir.aharoni@mail.huji.ac.il)"
+               ],
+               "comment": "These rules don't cover the whole grammar of the language, and are intended only for names of languages and Wikimedia projects."
+       },
+       "genitive": {
+               "(.+)ь$": "$1я",
+               "(.+)ия$": "$1ии",
+               "(.+)ка$": "$1ки",
+               "(.+)ти$": "$1тей",
+               "(.+)ды$": "$1дов",
+               "(.+)д$": "$1да",
+               "(.+)ник$": "$1ника",
+               "(.+)ные$": "$1ных"
+       },
+       "prepositional": {
+               "(.+)ь$": "$1е",
+               "(.+)ия$": "$1ии",
+               "(.+)ка$": "$1ке",
+               "(.+)ти$": "$1тях",
+               "(.+)ды$": "$1дах",
+               "(.+)д$": "$1де",
+               "(.+)ник$": "$1нике",
+               "(.+)ные$": "$1ных"
+       },
+       "languagegen": {
+               "@metadata": "язык в родительном падеже: '(с) русского'",
+               "(.+)кий$": "$1кого",
+               "иврит$": "иврита",
+               "идиш$": "идиша",
+               "(.+)$": "$1"
+       },
+       "languageprep": {
+               "@metadata": "язык в предложном падеже: '(на) русском'",
+               "(.+)кий$": "$1ком",
+               "иврит$": "иврите",
+               "идиш$": "идише",
+               "(.+)$": "$1"
+       },
+       "languageadverb": {
+               "@metadata": "наречие с названием языка: 'по-русски'",
+               "(.+)кий$": "по-$1ки",
+               "иврит$": "на иврите",
+               "идиш$": "на идише",
+               "(идо|урду|хинди|эсперанто)$": "на $1",
+               "(.+)$": "на языке $1"
+       }
+}