From 464d9c071bc265a120e2e8716ca29ce08f028fa6 Mon Sep 17 00:00:00 2001 From: Antoine Musso Date: Wed, 16 Nov 2011 15:12:00 +0000 Subject: [PATCH] bug 28643 improvement to serbian variants conversion This patch is a PARTIAL merge of /branches/nikola/phase3 :: r85224 avoid double conversion when text already use the correct variant r85239 minor fixes to previous r85308 documentation (@since 1.18 update to 1.19) r101359 guessVariant doc + boolean typecast r101369 tests r103131 additional test Test plan: ========== $ ./phpunit.php --filter LanguageSr PHPUnit 3.6.3 by Sebastian Bergmann. Configuration read from /srv/trunk/tests/phpunit/suite.xml ..... Time: 1 second, Memory: 78.50Mb OK (5 tests, 19 assertions) $ --- RELEASE-NOTES-1.19 | 1 + languages/LanguageConverter.php | 20 ++- languages/classes/LanguageSr.php | 26 ++++ tests/phpunit/languages/LanguageSrTest.php | 165 +++++++++++++++++++++ 4 files changed, 211 insertions(+), 1 deletion(-) create mode 100644 tests/phpunit/languages/LanguageSrTest.php diff --git a/RELEASE-NOTES-1.19 b/RELEASE-NOTES-1.19 index c4a2cc043f..9a9a640ee4 100644 --- a/RELEASE-NOTES-1.19 +++ b/RELEASE-NOTES-1.19 @@ -166,6 +166,7 @@ changes to languages because of Bugzilla reports. * (bug 30217) Make pt-br a fallback of pt. * (bug 31193) Set fallback language of Assamese from Bengali to English. * Update date format for dsb and hsb: month names need the genitive. +* (bug 28643) Serbian variant conversion improvements (Nikola Smolenski) === Other changes in 1.19 === * jquery.mwPrototypes module was renamed to jquery.mwExtension. diff --git a/languages/LanguageConverter.php b/languages/LanguageConverter.php index 64cb4beeb9..98ecefad3b 100644 --- a/languages/LanguageConverter.php +++ b/languages/LanguageConverter.php @@ -322,6 +322,10 @@ class LanguageConverter { } } + if( $this->guessVariant( $text, $toVariant ) ) { + return $text; + } + /* we convert everything except: 1. HTML markups (anything between < and >) 2. 
HTML entities @@ -571,7 +575,7 @@ class LanguageConverter { */ public function convertTo( $text, $variant ) { global $wgDisableLangConversion; - if ( $wgDisableLangConversion ) { + if ( $wgDisableLangConversion || $this->guessVariant( $text, $variant ) ) { return $text; } return $this->recursiveConvertTopLevel( $text, $variant ); @@ -772,6 +776,20 @@ class LanguageConverter { return '!' . $variant; } + /** + * Guess if a text is written in a variant. This should be implemented in subclasses. + * + * @param string $text the text to be checked + * @param string $variant language code of the variant to be checked for + * @return bool true if $text appears to be written in $variant, false if not + * + * @author Nikola Smolenski + * @since 1.19 + */ + public function guessVariant($text, $variant) { + return false; + } + /** * Load default conversion tables. * This method must be implemented in derived class. diff --git a/languages/classes/LanguageSr.php b/languages/classes/LanguageSr.php index b26a254389..2a083df0ab 100644 --- a/languages/classes/LanguageSr.php +++ b/languages/classes/LanguageSr.php @@ -173,6 +173,32 @@ class SrConverter extends LanguageConverter { return $ret; } + + /** + * Guess if a text is written in Cyrillic or Latin. 
+ * Overrides LanguageConverter::guessVariant() + * + * @param string $text The text to be checked + * @param string $variant Language code of the variant to be checked for + * @return bool true if $text appears to be written in $variant + * + * @author Nikola Smolenski + * @since 1.19 + */ + public function guessVariant( $text, $variant ) { + $numCyrillic = preg_match_all("/[шђчћжШЂЧЋЖ]/u", $text, $dummy); + $numLatin = preg_match_all("/[šđč枊ĐČĆŽ]/u", $text, $dummy); + + if( $variant == 'sr-ec' ) { + return (boolean) ($numCyrillic > $numLatin); + } else if( $variant == 'sr-el' ) { + return (boolean) ($numLatin > $numCyrillic); + } else { + return false; + } + + } + } /** diff --git a/tests/phpunit/languages/LanguageSrTest.php b/tests/phpunit/languages/LanguageSrTest.php new file mode 100644 index 0000000000..e58c51638c --- /dev/null +++ b/tests/phpunit/languages/LanguageSrTest.php @@ -0,0 +1,165 @@ +<?php +/** + * @author Antoine Musso + * @copyright Copyright © 2011, Antoine Musso + * @file + */ + +require_once dirname(dirname(__FILE__)). '/bootstrap.php'; + +/** Tests for MediaWiki languages/classes/LanguageSr.php */ +class LanguageSrTest extends MediaWikiTestCase { + /* Language object. 
Initialized before each test */ + private $lang; + + function setUp() { + $this->lang = Language::factory( 'Sr' ); + } + function tearDown() { + unset( $this->lang ); + } + + ##### TESTS ####################################################### + + function testEasyConversions( ) { + $this->assertCyrillic( + 'шђчћжШЂЧЋЖ', + 'Cyrillic guessing characters' + ); + $this->assertLatin( + 'šđč枊ĐČĆŽ', + 'Latin guessing characters' + ); + } + + function testMixedConversions() { + $this->assertCyrillic( + 'шђчћжШЂЧЋЖ - šđčćž', + 'Mostly cyrillic characters' + ); + $this->assertLatin( + 'šđč枊ĐČĆŽ - шђчћж', + 'Mostly latin characters' + ); + } + + function testSameAmountOfLatinAndCyrillicGetConverted() { + $this->assertConverted( + '4 latin: šđčć | 4 cyrillic: шђчћ', + 'sr-ec' + ); + $this->assertConverted( + '4 latin: šđčć | 4 cyrillic: шђчћ', + 'sr-el' + ); + } + + /** + * @author Nikola Smolenski + */ + function testConversionToCyrillic() { + $this->assertEquals( 'абвг', + $this->convertToCyrillic( 'abvg' ) + ); + $this->assertEquals( 'абвг', + $this->convertToCyrillic( 'абвг' ) + ); + $this->assertEquals( 'abvgшђжчћ', + $this->convertToCyrillic( 'abvgшђжчћ' ) + ); + $this->assertEquals( 'абвгшђжчћ', + $this->convertToCyrillic( 'абвгšđžčć' ) + ); + //Roman numerals are not converted + $this->assertEquals( 'а I б II в III г IV шђжчћ', + $this->convertToCyrillic( 'a I b II v III g IV šđžčć' ) + ); + } + + function testConversionToLatin() { + $this->assertEquals( 'abcd', + $this->convertToLatin( 'abcd' ) + ); + $this->assertEquals( 'abcd', + $this->convertToLatin( 'абцд' ) + ); + $this->assertEquals( 'abcdšđžčć', + $this->convertToLatin( 'abcdшђжчћ' ) + ); + $this->assertEquals( 'абцдšđžčć', + $this->convertToLatin( 'абцдšđžčć' ) + ); + + } + + ##### HELPERS ##################################################### + /** + * Wrapper to verify text stays the same after applying conversion + * @param $text string Text to convert + * @param $variant string Language variant 'sr-ec' or 
'sr-el' + * @param $msg string Optional message + */ + function assertUnConverted( $text, $variant, $msg = '' ) { + $this->assertEquals( + $text, + $this->convertTo( $text, $variant ), + $msg + ); + } + /** + * Wrapper to verify a text is different once converted to a variant. + * @param $text string Text to convert + * @param $variant string Language variant 'sr-ec' or 'sr-el' + * @param $msg string Optional message + */ + function assertConverted( $text, $variant, $msg = '' ) { + $this->assertNotEquals( + $text, + $this->convertTo( $text, $variant ), + $msg + ); + } + + /** + * Verify the given Cyrillic text is not converted when using + * the cyrillic variant and converted to Latin when using + * the Latin variant. + */ + function assertCyrillic( $text, $msg = '' ) { + $this->assertUnConverted( $text, 'sr-ec', $msg ); + $this->assertConverted( $text, 'sr-el', $msg ); + } + /** + * Verify the given Latin text is not converted when using + * the Latin variant and converted to Cyrillic when using + * the Cyrillic variant. + */ + function assertLatin( $text, $msg = '' ) { + $this->assertUnConverted( $text, 'sr-el', $msg ); + $this->assertConverted( $text, 'sr-ec', $msg ); + } + + + /** Wrapper for converter::convertTo() method*/ + function convertTo( $text, $variant ) { + return $this + ->lang + ->mConverter + ->convertTo( + $text, $variant + ); + } + function convertToCyrillic( $text ) { + return $this->convertTo( $text, 'sr-ec' ); + } + function convertToLatin( $text ) { + return $this->convertTo( $text, 'sr-el' ); + } +} -- 2.20.1