From: Brian Wolff Date: Fri, 28 Apr 2017 04:52:49 +0000 (+0000) Subject: Add collation for Bashkir (ba) X-Git-Tag: 1.31.0-rc.0~3286^2 X-Git-Url: http://git.cyclocoop.org/%24href?a=commitdiff_plain;h=73f5937047440e8c82aa58e90c150aa9797f5a4b;p=lhc%2Fweb%2Fwiklou.git Add collation for Bashkir (ba) This is based on a numeric uppercase collation. Bashkir characters will be remapped to the private use area for the purpose of sorting. Bug: T162823 Change-Id: I65f1af0b57ff6ded7d464e39efd401f178a3519e --- diff --git a/autoload.php b/autoload.php index e5161f1cb7..36773746b0 100644 --- a/autoload.php +++ b/autoload.php @@ -177,6 +177,7 @@ $wgAutoloadLocalClasses = [ 'BagOStuff' => __DIR__ . '/includes/libs/objectcache/BagOStuff.php', 'BaseDump' => __DIR__ . '/maintenance/backupPrefetch.inc', 'BaseTemplate' => __DIR__ . '/includes/skins/BaseTemplate.php', + 'BashkirUppercaseCollation' => __DIR__ . '/includes/collation/BashkirUppercaseCollation.php', 'BatchRowIterator' => __DIR__ . '/includes/utils/BatchRowIterator.php', 'BatchRowUpdate' => __DIR__ . '/includes/utils/BatchRowUpdate.php', 'BatchRowWriter' => __DIR__ . '/includes/utils/BatchRowWriter.php', @@ -315,6 +316,7 @@ $wgAutoloadLocalClasses = [ 'CssContentHandler' => __DIR__ . '/includes/content/CssContentHandler.php', 'CsvStatsOutput' => __DIR__ . '/maintenance/language/StatOutputs.php', 'CurlHttpRequest' => __DIR__ . '/includes/http/CurlHttpRequest.php', + 'CustomUppercaseCollation' => __DIR__ . '/includes/collation/CustomUppercaseCollation.php', 'DBAccessBase' => __DIR__ . '/includes/dao/DBAccessBase.php', 'DBAccessError' => __DIR__ . '/includes/libs/rdbms/exception/DBAccessError.php', 'DBAccessObjectUtils' => __DIR__ . '/includes/dao/DBAccessObjectUtils.php', diff --git a/includes/collation/BashkirUppercaseCollation.php b/includes/collation/BashkirUppercaseCollation.php new file mode 100644 index 0000000000..33ed9bc8d8 --- /dev/null +++ b/includes/collation/BashkirUppercaseCollation.php @@ -0,0 +1,71 @@ += 64 ) { + throw new UnexpectedValueException( "Alphabet must be < 64 items" ); + } + $this->alphabet = $alphabet; + + $this->puaSubset = []; + $len = count( $alphabet ); + for ( $i = 0; $i < $len; $i++ ) { + $this->puaSubset[] = "\xF3\xB3\x80" . chr( $i + 128 ); + } + parent::__construct( $lang ); + } + + private function convertToPua( $string ) { + return str_replace( $this->alphabet, $this->puaSubset, $string ); + } + + public function getSortKey( $string ) { + return $this->convertToPua( parent::getSortKey( $string ) ); + } + + public function getFirstLetter( $string ) { + // In case a title has a PUA code in it, make it sort + // under the header for the character it would replace + // to avoid inconsistent behaviour. This class mostly + // assumes that people will not use PUA codes. + return parent::getFirstLetter( + str_replace( $this->puaSubset, $this->alphabet, $string ) + ); + } +} diff --git a/tests/phpunit/includes/CollationTest.php b/tests/phpunit/includes/CollationTest.php deleted file mode 100644 index bf283aae73..0000000000 --- a/tests/phpunit/includes/CollationTest.php +++ /dev/null @@ -1,117 +0,0 @@ -checkPHPExtension( 'intl' ); - } - - /** - * Test to make sure, that if you - * have "X" and "XY", the binary - * sortkey also has "X" being a - * prefix of "XY". Our collation - * code makes this assumption. - * - * @param string $lang Language code for collator - * @param string $base Base string - * @param string $extended String containing base as a prefix. - * - * @dataProvider prefixDataProvider - */ - public function testIsPrefix( $lang, $base, $extended ) { - $cp = Collator::create( $lang ); - $cp->setStrength( Collator::PRIMARY ); - $baseBin = $cp->getSortKey( $base ); - // Remove sortkey terminator - $baseBin = rtrim( $baseBin, "\0" ); - $extendedBin = $cp->getSortKey( $extended ); - $this->assertStringStartsWith( $baseBin, $extendedBin, "$base is not a prefix of $extended" ); - } - - public static function prefixDataProvider() { - return [ - [ 'en', 'A', 'AA' ], - [ 'en', 'A', 'AAA' ], - [ 'en', 'Д', 'ДЂ' ], - [ 'en', 'Д', 'ДA' ], - // 'Ʒ' should expand to 'Z ' (note space). - [ 'fi', 'Z', 'Ʒ' ], - // 'Þ' should expand to 'th' - [ 'sv', 't', 'Þ' ], - // Javanese is a limited use alphabet, so should have 3 bytes - // per character, so do some tests with it. - [ 'en', 'ꦲ', 'ꦲꦤ' ], - [ 'en', 'ꦲ', 'ꦲД' ], - [ 'en', 'A', 'Aꦲ' ], - ]; - } - - /** - * Opposite of testIsPrefix - * - * @dataProvider notPrefixDataProvider - */ - public function testNotIsPrefix( $lang, $base, $extended ) { - $cp = Collator::create( $lang ); - $cp->setStrength( Collator::PRIMARY ); - $baseBin = $cp->getSortKey( $base ); - // Remove sortkey terminator - $baseBin = rtrim( $baseBin, "\0" ); - $extendedBin = $cp->getSortKey( $extended ); - $this->assertStringStartsNotWith( $baseBin, $extendedBin, "$base is a prefix of $extended" ); - } - - public static function notPrefixDataProvider() { - return [ - [ 'en', 'A', 'B' ], - [ 'en', 'AC', 'ABC' ], - [ 'en', 'Z', 'Ʒ' ], - [ 'en', 'A', 'ꦲ' ], - ]; - } - - /** - * Test correct first letter is fetched. - * - * @param string $collation Collation name (aka uca-en) - * @param string $string String to get first letter of - * @param string $firstLetter Expected first letter. - * - * @dataProvider firstLetterProvider - */ - public function testGetFirstLetter( $collation, $string, $firstLetter ) { - $col = Collation::factory( $collation ); - $this->assertEquals( $firstLetter, $col->getFirstLetter( $string ) ); - } - - function firstLetterProvider() { - return [ - [ 'uppercase', 'Abc', 'A' ], - [ 'uppercase', 'abc', 'A' ], - [ 'identity', 'abc', 'a' ], - [ 'uca-en', 'abc', 'A' ], - [ 'uca-en', ' ', ' ' ], - [ 'uca-en', 'Êveryone', 'E' ], - [ 'uca-vi', 'Êveryone', 'Ê' ], - // Make sure thorn is not a first letter. - [ 'uca-sv', 'The', 'T' ], - [ 'uca-sv', 'Å', 'Å' ], - [ 'uca-hu', 'dzsdo', 'Dzs' ], - [ 'uca-hu', 'dzdso', 'Dz' ], - [ 'uca-hu', 'CSD', 'Cs' ], - [ 'uca-root', 'CSD', 'C' ], - [ 'uca-fi', 'Ǥ', 'G' ], - [ 'uca-fi', 'Ŧ', 'T' ], - [ 'uca-fi', 'Ʒ', 'Z' ], - [ 'uca-fi', 'Ŋ', 'N' ], - ]; - } -} diff --git a/tests/phpunit/includes/collation/CollationTest.php b/tests/phpunit/includes/collation/CollationTest.php new file mode 100644 index 0000000000..25911a79c1 --- /dev/null +++ b/tests/phpunit/includes/collation/CollationTest.php @@ -0,0 +1,118 @@ +checkPHPExtension( 'intl' ); + } + + /** + * Test to make sure, that if you + * have "X" and "XY", the binary + * sortkey also has "X" being a + * prefix of "XY". Our collation + * code makes this assumption. + * + * @param string $lang Language code for collator + * @param string $base Base string + * @param string $extended String containing base as a prefix. + * + * @dataProvider prefixDataProvider + */ + public function testIsPrefix( $lang, $base, $extended ) { + $cp = Collator::create( $lang ); + $cp->setStrength( Collator::PRIMARY ); + $baseBin = $cp->getSortKey( $base ); + // Remove sortkey terminator + $baseBin = rtrim( $baseBin, "\0" ); + $extendedBin = $cp->getSortKey( $extended ); + $this->assertStringStartsWith( $baseBin, $extendedBin, "$base is not a prefix of $extended" ); + } + + public static function prefixDataProvider() { + return [ + [ 'en', 'A', 'AA' ], + [ 'en', 'A', 'AAA' ], + [ 'en', 'Д', 'ДЂ' ], + [ 'en', 'Д', 'ДA' ], + // 'Ʒ' should expand to 'Z ' (note space). + [ 'fi', 'Z', 'Ʒ' ], + // 'Þ' should expand to 'th' + [ 'sv', 't', 'Þ' ], + // Javanese is a limited use alphabet, so should have 3 bytes + // per character, so do some tests with it. + [ 'en', 'ꦲ', 'ꦲꦤ' ], + [ 'en', 'ꦲ', 'ꦲД' ], + [ 'en', 'A', 'Aꦲ' ], + ]; + } + + /** + * Opposite of testIsPrefix + * + * @dataProvider notPrefixDataProvider + */ + public function testNotIsPrefix( $lang, $base, $extended ) { + $cp = Collator::create( $lang ); + $cp->setStrength( Collator::PRIMARY ); + $baseBin = $cp->getSortKey( $base ); + // Remove sortkey terminator + $baseBin = rtrim( $baseBin, "\0" ); + $extendedBin = $cp->getSortKey( $extended ); + $this->assertStringStartsNotWith( $baseBin, $extendedBin, "$base is a prefix of $extended" ); + } + + public static function notPrefixDataProvider() { + return [ + [ 'en', 'A', 'B' ], + [ 'en', 'AC', 'ABC' ], + [ 'en', 'Z', 'Ʒ' ], + [ 'en', 'A', 'ꦲ' ], + ]; + } + + /** + * Test correct first letter is fetched. + * + * @param string $collation Collation name (aka uca-en) + * @param string $string String to get first letter of + * @param string $firstLetter Expected first letter. + * + * @dataProvider firstLetterProvider + */ + public function testGetFirstLetter( $collation, $string, $firstLetter ) { + $col = Collation::factory( $collation ); + $this->assertEquals( $firstLetter, $col->getFirstLetter( $string ) ); + } + + function firstLetterProvider() { + return [ + [ 'uppercase', 'Abc', 'A' ], + [ 'uppercase', 'abc', 'A' ], + [ 'identity', 'abc', 'a' ], + [ 'uca-en', 'abc', 'A' ], + [ 'uca-en', ' ', ' ' ], + [ 'uca-en', 'Êveryone', 'E' ], + [ 'uca-vi', 'Êveryone', 'Ê' ], + // Make sure thorn is not a first letter. + [ 'uca-sv', 'The', 'T' ], + [ 'uca-sv', 'Å', 'Å' ], + [ 'uca-hu', 'dzsdo', 'Dzs' ], + [ 'uca-hu', 'dzdso', 'Dz' ], + [ 'uca-hu', 'CSD', 'Cs' ], + [ 'uca-root', 'CSD', 'C' ], + [ 'uca-fi', 'Ǥ', 'G' ], + [ 'uca-fi', 'Ŧ', 'T' ], + [ 'uca-fi', 'Ʒ', 'Z' ], + [ 'uca-fi', 'Ŋ', 'N' ], + [ 'uppercase-ba', 'в', 'В' ], + ]; + } +} diff --git a/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php b/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php new file mode 100644 index 0000000000..5d5317be7b --- /dev/null +++ b/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php @@ -0,0 +1,57 @@ +collation = new CustomUppercaseCollation( [ + 'D', + 'C', + 'B' + ], Language::factory( 'en' ) ); + + parent::setUp(); + } + + /** + * @dataProvider providerOrder + */ + public function testOrder( $first, $second, $msg ) { + $sortkey1 = $this->collation->getSortKey( $first ); + $sortkey2 = $this->collation->getSortKey( $second ); + + $this->assertTrue( strcmp( $sortkey1, $sortkey2 ) < 0, $msg ); + } + + public function providerOrder() { + return [ + [ 'X', 'Z', 'Maintain order of unrearranged' ], + [ 'D', 'C', 'Actually resorts' ], + [ 'D', 'B', 'resort test 2' ], + [ 'Adobe', 'Abode', 'not first letter' ], + [ '💩 ', 'C', 'Test relocated to end' ], + [ 'c', 'b', 'lowercase' ], + [ 'x', 'z', 'lowercase original' ], + [ 'C50D', 'C100', 'Numbers' ] + ]; + } + + /** + * @dataProvider provideGetFirstLetter + */ + public function testGetFirstLetter( $string, $first ) { + $this->assertSame( $this->collation->getFirstLetter( $string ), $first ); + } + + public function provideGetFirstLetter() { + return [ + [ 'Do', 'D' ], + [ 'do', 'D' ], + [ 'Ao', 'A' ], + [ 'afdsa', 'A' ], + [ "\xF3\xB3\x80\x80Foo", 'D' ], + [ "\xF3\xB3\x80\x81Foo", 'C' ], + [ "\xF3\xB3\x80\x82Foo", 'B' ], + [ "\xF3\xB3\x80\x83Foo", "\xF3\xB3\x80\x83" ], + ]; + } +}