From 70dede013cda779866ec44017b44948b8ace85c8 Mon Sep 17 00:00:00 2001 From: tjones Date: Mon, 26 Feb 2018 13:03:19 -0500 Subject: [PATCH] Fix table loading bug for CRH transliteration MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit In production, the regex and exception tables were not being loaded, resulting in very poor transliteration. The loading has been moved to the constructor, similar to the implementation of the Kazakh transliteration. Also, a bug in the mappings for Ö/ö -> Ё/ё and Ü/ü -> Ю/ю has been fixed. Test cases for specific additional examples have been added. (Though it is worth noting that the regex and exception tables did load properly during unit testing, so the problem wasn't caught there.) Bug: T186727 Change-Id: I6bacee7d9de6f4a870a8a9ef1f04b819ad489c02 --- languages/classes/LanguageCrh.php | 29 ++++++++++++++----- .../languages/classes/LanguageCrhTest.php | 18 +++++++++++- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/languages/classes/LanguageCrh.php b/languages/classes/LanguageCrh.php index f384471dab..d5418b9a6d 100644 --- a/languages/classes/LanguageCrh.php +++ b/languages/classes/LanguageCrh.php @@ -58,6 +58,26 @@ class CrhConverter extends LanguageConverter { const L_F_UC = 'EİÖÜ'; # Crimean Tatar Latin uppercase front vowels const L_F = 'eiöüEİÖÜ'; # Crimean Tatar Latin front vowels + /** + * @param Language $langobj + * @param string $maincode + * @param array $variants + * @param array $variantfallbacks + * @param array $flags + */ + function __construct( $langobj, $maincode, + $variants = [], + $variantfallbacks = [], + $flags = [] ) { + parent::__construct( $langobj, $maincode, + $variants, $variantfallbacks, $flags ); + + // No point delaying this since they're in code. + // Waiting until loadDefaultTables() means they never get loaded + // when the tables themselves are loaded from cache. 
+ $this->loadExceptions(); + } + public $mCyrillicToLatin = [ ## these are independent of location in the word, but have @@ -106,9 +126,8 @@ class CrhConverter extends LanguageConverter { // hack, hack, hack 'A' => 'А', 'a' => 'а', 'E' => 'Е', 'e' => 'е', - 'Ö' => 'О', 'ö' => 'о', 'U' => 'У', 'u' => 'у', - 'Ü' => 'У', 'ü' => 'у', 'Y' => 'Й', 'y' => 'й', - + 'Ö' => 'Ё', 'ö' => 'ё', 'U' => 'У', 'u' => 'у', + 'Ü' => 'Ю', 'ü' => 'ю', 'Y' => 'Й', 'y' => 'й', 'C' => 'Дж', 'c' => 'дж', 'Ğ' => 'Гъ', 'ğ' => 'гъ', 'Ñ' => 'Нъ', 'ñ' => 'нъ', 'Q' => 'Къ', 'q' => 'къ', @@ -129,10 +148,6 @@ class CrhConverter extends LanguageConverter { ]; } - function postLoadTables() { - $this->loadExceptions(); - } - function loadExceptions() { if ( $this->mExceptionsLoaded ) { return; diff --git a/tests/phpunit/languages/classes/LanguageCrhTest.php b/tests/phpunit/languages/classes/LanguageCrhTest.php index d99fc26729..7c99614e61 100644 --- a/tests/phpunit/languages/classes/LanguageCrhTest.php +++ b/tests/phpunit/languages/classes/LanguageCrhTest.php @@ -55,6 +55,22 @@ class LanguageCrhTest extends LanguageClassesTestCase { ], 'инструменталь instrumental гургуль gürgül тюшюнмемек tüşünmemek' ], + [ // recent problem words, part 1 + [ + 'crh' => 'künü куню sürgünligi сюргюнлиги özü озю etti этти', + 'crh-cyrl' => 'куню куню сюргюнлиги сюргюнлиги озю озю этти этти', + 'crh-latn' => 'künü künü sürgünligi sürgünligi özü özü etti etti', + ], + 'künü куню sürgünligi сюргюнлиги özü озю etti этти' + ], + [ // recent problem words, part 2 + [ + 'crh' => 'esas эсас dört дёрт keldi кельди', + 'crh-cyrl' => 'эсас эсас дёрт дёрт кельди кельди', + 'crh-latn' => 'esas esas dört dört keldi keldi', + ], + 'esas эсас dört дёрт keldi кельди' + ], [ // multi part words [ 'crh' => 'эки юз eki yüz', @@ -63,7 +79,7 @@ class LanguageCrhTest extends LanguageClassesTestCase { ], 'эки юз eki yüz' ], - [ // ALL CAPS, made up acronyms + [ // ALL CAPS, made up acronyms (not 100% sure these are correct) [ 'crh' => 'ÑAB QIC 
ĞUK COT НЪАБ КЪЫДж ГЪУК ДЖОТ CA ДЖА', 'crh-cyrl' => 'НЪАБ КЪЫДж ГЪУК ДЖОТ НЪАБ КЪЫДж ГЪУК ДЖОТ ДЖА ДЖА', -- 2.20.1