From e94587dfbbba780866d61c7c5097a5a5a6f48a22 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Bartosz=20Dziewo=C5=84ski?= Date: Wed, 20 Dec 2017 23:45:23 +0100 Subject: [PATCH] Add collation for Abkhaz (ab) * Added new class AbkhazUppercaseCollation, mapped to 'uppercase-ab'. * Extended CustomUppercaseCollation with support for sorting digraphs and for alphabets larger than 64 letters (up to 4095). Bug: T183430 Change-Id: I16d44568e44d7ef5b39c38b1a6257b9fe10a34d4 --- autoload.php | 1 + .../collation/AbkhazUppercaseCollation.php | 93 +++++++++++++++++++ includes/collation/Collation.php | 2 + .../collation/CustomUppercaseCollation.php | 41 +++++--- .../CustomUppercaseCollationTest.php | 12 ++- 5 files changed, 132 insertions(+), 17 deletions(-) create mode 100644 includes/collation/AbkhazUppercaseCollation.php diff --git a/autoload.php b/autoload.php index 6b8387b4f7..47c04b9b22 100644 --- a/autoload.php +++ b/autoload.php @@ -6,6 +6,7 @@ global $wgAutoloadLocalClasses; $wgAutoloadLocalClasses = [ 'APCBagOStuff' => __DIR__ . '/includes/libs/objectcache/APCBagOStuff.php', 'APCUBagOStuff' => __DIR__ . '/includes/libs/objectcache/APCUBagOStuff.php', + 'AbkhazUppercaseCollation' => __DIR__ . '/includes/collation/AbkhazUppercaseCollation.php', 'AbstractContent' => __DIR__ . '/includes/content/AbstractContent.php', 'Action' => __DIR__ . '/includes/actions/Action.php', 'ActiveUsersPager' => __DIR__ . 
'/includes/specials/pagers/ActiveUsersPager.php', diff --git a/includes/collation/AbkhazUppercaseCollation.php b/includes/collation/AbkhazUppercaseCollation.php new file mode 100644 index 0000000000..e0ea237f34 --- /dev/null +++ b/includes/collation/AbkhazUppercaseCollation.php @@ -0,0 +1,93 @@ += 64 ) { - throw new UnexpectedValueException( "Alphabet must be < 64 items" ); + if ( count( $alphabet ) < 1 || count( $alphabet ) >= 4096 ) { + throw new UnexpectedValueException( "Alphabet must be < 4096 items" ); } - $this->alphabet = $alphabet; + $this->firstLetters = $alphabet; + // For digraphs, only the first letter is capitalized in input + $this->alphabet = array_map( [ $lang, 'uc' ], $alphabet ); $this->puaSubset = []; $len = count( $alphabet ); for ( $i = 0; $i < $len; $i++ ) { - $this->puaSubset[] = "\xF3\xB3\x80" . chr( $i + 128 ); + $this->puaSubset[] = "\xF3\xB3" . chr( floor( $i / 64 ) + 128 ) . chr( ( $i % 64 ) + 128 ); } + + // Sort these arrays so that any trigraphs, digraphs etc. are first + // (and they get replaced first in convertToPua()). + $lengths = array_map( 'mb_strlen', $this->alphabet ); + array_multisort( $lengths, SORT_DESC, $this->firstLetters, $this->alphabet, $this->puaSubset ); + parent::__construct( $lang ); } @@ -76,12 +82,17 @@ class CustomUppercaseCollation extends NumericUppercaseCollation { } public function getFirstLetter( $string ) { - // In case a title has a PUA code in it, make it sort - // under the header for the character it would replace - // to avoid inconsistent behaviour. This class mostly - // assumes that people will not use PUA codes. - return parent::getFirstLetter( - str_replace( $this->puaSubset, $this->alphabet, $string ) - ); + $sortkey = $this->getSortKey( $string ); + + // In case a title begins with a character from our alphabet, return the corresponding + // first-letter. (This also happens if the title has a corresponding PUA code in it, to avoid + // inconsistent behaviour. 
This class mostly assumes that people will not use PUA codes.) + $index = array_search( substr( $sortkey, 0, 4 ), $this->puaSubset ); + if ( $index !== false ) { + return $this->firstLetters[ $index ]; + } + + // String begins with a character outside of our alphabet, fall back + return parent::getFirstLetter( $string ); } } diff --git a/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php b/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php index 5d5317be7b..90c097df22 100644 --- a/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php +++ b/tests/phpunit/includes/collation/CustomUppercaseCollationTest.php @@ -6,6 +6,7 @@ class CustomUppercaseCollationTest extends MediaWikiTestCase { $this->collation = new CustomUppercaseCollation( [ 'D', 'C', + 'Cs', 'B' ], Language::factory( 'en' ) ); @@ -31,6 +32,7 @@ class CustomUppercaseCollationTest extends MediaWikiTestCase { [ '💩 ', 'C', 'Test relocated to end' ], [ 'c', 'b', 'lowercase' ], [ 'x', 'z', 'lowercase original' ], + [ 'Cz', 'Cs', 'digraphs' ], [ 'C50D', 'C100', 'Numbers' ] ]; } @@ -50,8 +52,14 @@ class CustomUppercaseCollationTest extends MediaWikiTestCase { [ 'afdsa', 'A' ], [ "\xF3\xB3\x80\x80Foo", 'D' ], [ "\xF3\xB3\x80\x81Foo", 'C' ], - [ "\xF3\xB3\x80\x82Foo", 'B' ], - [ "\xF3\xB3\x80\x83Foo", "\xF3\xB3\x80\x83" ], + [ "\xF3\xB3\x80\x82Foo", 'Cs' ], + [ "\xF3\xB3\x80\x83Foo", 'B' ], + [ "\xF3\xB3\x80\x84Foo", "\xF3\xB3\x80\x84" ], + [ 'C', 'C' ], + [ 'Cz', 'C' ], + [ 'Cs', 'Cs' ], + [ 'CS', 'Cs' ], + [ 'cs', 'Cs' ], ]; } } -- 2.20.1