bug 28643 improvement to serbian variants conversion
authorAntoine Musso <hashar@users.mediawiki.org>
Wed, 16 Nov 2011 15:12:00 +0000 (15:12 +0000)
committerAntoine Musso <hashar@users.mediawiki.org>
Wed, 16 Nov 2011 15:12:00 +0000 (15:12 +0000)
This patch is a PARTIAL merge of /branches/nikola/phase3 ::

r85224 avoid double conversion when text already uses the correct variant
r85239 minor fixes to previous
r85308 documentation (@since 1.18 update to 1.19)
r101359 guessVariant doc + boolean typecast
r101369 tests
r103131 additional test

Test plan:
==========
$ ./phpunit.php --filter LanguageSr
PHPUnit 3.6.3 by Sebastian Bergmann.

Configuration read from /srv/trunk/tests/phpunit/suite.xml

.....

Time: 1 second, Memory: 78.50Mb

OK (5 tests, 19 assertions)
$

RELEASE-NOTES-1.19
languages/LanguageConverter.php
languages/classes/LanguageSr.php
tests/phpunit/languages/LanguageSrTest.php [new file with mode: 0644]

index c4a2cc0..9a9a640 100644 (file)
@@ -166,6 +166,7 @@ changes to languages because of Bugzilla reports.
 * (bug 30217) Make pt-br a fallback of pt.
 * (bug 31193) Set fallback language of Assamese from Bengali to English.
 * Update date format for dsb and hsb: month names need the genitive.
+* (bug 28643) Serbian variant conversion improvements (Nikola Smolenski)
 
 === Other changes in 1.19 ===
 * jquery.mwPrototypes module was renamed to jquery.mwExtension.
index 64cb4be..98ecefa 100644 (file)
@@ -322,6 +322,10 @@ class LanguageConverter {
                        }
                }
 
+               if( $this->guessVariant( $text, $toVariant ) ) {
+                       return $text;
+               }
+
                /* we convert everything except:
                   1. HTML markups (anything between < and >)
                   2. HTML entities
@@ -571,7 +575,7 @@ class LanguageConverter {
         */
        public function convertTo( $text, $variant ) {
                global $wgDisableLangConversion;
-               if ( $wgDisableLangConversion ) {
+               if ( $wgDisableLangConversion || $this->guessVariant( $text, $variant ) ) {
                        return $text;
                }
                return $this->recursiveConvertTopLevel( $text, $variant );
@@ -772,6 +776,20 @@ class LanguageConverter {
                return '!' . $variant;
        }
 
+       /**
+        * Guess if a text is written in a variant. This should be implemented in subclasses.
+        *
+        * @param string        $text the text to be checked
+        * @param string        $variant language code of the variant to be checked for
+        * @return bool true if $text appears to be written in $variant, false if not
+        *
+        * @author Nikola Smolenski <smolensk@eunet.rs>
+        * @since 1.19
+        */
+       public function guessVariant($text, $variant) {
+               return false;
+       }
+
        /**
         * Load default conversion tables.
         * This method must be implemented in derived class.
index b26a254..2a083df 100644 (file)
@@ -173,6 +173,32 @@ class SrConverter extends LanguageConverter {
 
                return $ret;
        }
+
+       /**
+        * Guess if a text is written in Cyrillic or Latin.
+        * Overrides LanguageConverter::guessVariant()
+        *
+        * @param string  $text The text to be checked
+        * @param string  $variant Language code of the variant to be checked for
+        * @return bool  true if $text appears to be written in $variant
+        *
+        * @author Nikola Smolenski <smolensk@eunet.rs>
+        * @since 1.19
+        */
+       public function guessVariant( $text, $variant ) {
+               $numCyrillic = preg_match_all("/[шђчћжШЂЧЋЖ]/u", $text, $dummy);
+               $numLatin = preg_match_all("/[šđč枊ĐČĆŽ]/u", $text, $dummy);
+
+               if( $variant == 'sr-ec' ) {
+                       return (boolean) ($numCyrillic > $numLatin);
+               } else if( $variant == 'sr-el' ) {
+                       return (boolean) ($numLatin > $numCyrillic);
+               } else {
+                       return false;
+               }
+
+       }
+
 }
 
 /**
diff --git a/tests/phpunit/languages/LanguageSrTest.php b/tests/phpunit/languages/LanguageSrTest.php
new file mode 100644 (file)
index 0000000..e58c516
--- /dev/null
@@ -0,0 +1,165 @@
+<?php
+/**
+ * PHPUnit tests for the Serbian language.
+ * The language can be represented using two scripts:
+ *  - Latin (sr-el)
+ *  - Cyrillic (sr-ec)
+ * Both representations seem to be bijective, hence MediaWiki can convert
+ * from one script to the other.
+ *
+ * @author Antoine Musso <hashar at free dot fr>
+ * @copyright Copyright © 2011, Antoine Musso <hashar at free dot fr>
+ * @file
+ */
+
+require_once dirname(dirname(__FILE__)). '/bootstrap.php';
+
+/** Tests for MediaWiki languages/classes/LanguageSr.php */
+class LanguageSrTest extends MediaWikiTestCase {
+       /* Language object. Initialized before each test */
+       private $lang;
+
+       function setUp() {
+               $this->lang = Language::factory( 'Sr' );
+       }
+       function tearDown() {
+               unset( $this->lang );
+       }
+
+       ##### TESTS #######################################################
+
+       function testEasyConversions( ) {
+               $this->assertCyrillic(
+                       'шђчћжШЂЧЋЖ',
+                       'Cyrillic guessing characters'
+               );
+               $this->assertLatin(
+                       'šđč枊ĐČĆŽ',
+                       'Latin guessing characters'
+               );
+       }
+
+       function testMixedConversions() {
+               $this->assertCyrillic(
+                       'шђчћжШЂЧЋЖ - šđčćž',
+                       'Mostly cyrillic characters'
+               );
+               $this->assertLatin(
+                       'šđč枊ĐČĆŽ - шђчћж',
+                       'Mostly latin characters'
+               );
+       }
+
+       function testSameAmountOfLatinAndCyrillicGetConverted() {
+               $this->assertConverted(
+                       '4 latin: šđčć | 4 cyrillic: шђчћ',
+                       'sr-ec'
+               );
+               $this->assertConverted(
+                       '4 latin: šđčć | 4 cyrillic: шђчћ',
+                       'sr-el'
+               );
+       }
+
+       /**
+        * @author Nikola Smolenski 
+        */
+       function testConversionToCyrillic() {
+               $this->assertEquals( 'абвг',
+                       $this->convertToCyrillic( 'abvg' )
+               );
+               $this->assertEquals( 'абвг',
+                       $this->convertToCyrillic( 'абвг' )
+               );
+               $this->assertEquals( 'abvgшђжчћ',
+                       $this->convertToCyrillic( 'abvgшђжчћ' )
+               );
+               $this->assertEquals( 'абвгшђжчћ',
+                       $this->convertToCyrillic( 'абвгšđžčć' )
+               );
+               //Roman numerals are not converted
+               $this->assertEquals( 'а I б II в III г IV шђжчћ',
+                       $this->convertToCyrillic( 'a I b II v III g IV šđžčć' )
+               );
+       }
+
+       function testConversionToLatin() {
+               $this->assertEquals( 'abcd',
+                       $this->convertToLatin( 'abcd' )
+               );
+               $this->assertEquals( 'abcd',
+                       $this->convertToLatin( 'абцд' )
+               );
+               $this->assertEquals( 'abcdšđžčć',
+                       $this->convertToLatin( 'abcdшђжчћ' )
+               );
+               $this->assertEquals( 'абцдšđžčć',
+                       $this->convertToLatin( 'абцдšđžčć' )
+               );
+
+       }
+
+       ##### HELPERS #####################################################
+       /**
+        * Wrapper to verify text stays the same after applying conversion
+        * @param $text string Text to convert
+        * @param $variant string Language variant 'sr-ec' or 'sr-el'
+        * @param $msg string Optional message
+        */
+       function assertUnConverted( $text, $variant, $msg = '' ) {
+               $this->assertEquals(
+                       $text,
+                       $this->convertTo( $text, $variant ),
+                       $msg
+               );
+       }
+       /**
+        * Wrapper to verify a text is different once converted to a variant.
+        * @param $text string Text to convert
+        * @param $variant string Language variant 'sr-ec' or 'sr-el'
+        * @param $msg string Optional message
+        */
+       function assertConverted( $text, $variant, $msg = '' ) {
+               $this->assertNotEquals(
+                       $text,
+                       $this->convertTo( $text, $variant ),
+                       $msg
+               );
+       }
+
+       /**
+        * Verify the given Cyrillic text is not converted when
+        * using the Cyrillic variant and converted to Latin when using
+        * the Latin variant.
+        */
+       function assertCyrillic( $text, $msg = '' ) {
+               $this->assertUnConverted( $text, 'sr-ec', $msg );
+               $this->assertConverted( $text, 'sr-el', $msg );
+       }
+       /**
+        * Verify the given Latin text is not converted when
+        * using the Latin variant and converted to Cyrillic when using
+        * the Cyrillic variant.
+        */
+       function assertLatin( $text, $msg = '' ) {
+               $this->assertUnConverted( $text, 'sr-el', $msg );
+               $this->assertConverted( $text, 'sr-ec', $msg );
+       }
+
+
+       /** Wrapper for converter::convertTo() method*/
+       function convertTo( $text, $variant ) {
+               return $this
+                       ->lang
+                       ->mConverter
+                       ->convertTo(
+                               $text, $variant
+                       );
+       }
+       function convertToCyrillic( $text ) {
+               return $this->convertTo( $text, 'sr-ec' );
+       }
+       function convertToLatin( $text ) {
+               return $this->convertTo( $text, 'sr-el' );
+       }
+}