'BagOStuff' => __DIR__ . '/includes/libs/objectcache/BagOStuff.php',
'BaseDump' => __DIR__ . '/maintenance/backupPrefetch.inc',
'BaseTemplate' => __DIR__ . '/includes/skins/BaseTemplate.php',
+ 'BashkirUppercaseCollation' => __DIR__ . '/includes/collation/BashkirUppercaseCollation.php',
'BatchRowIterator' => __DIR__ . '/includes/utils/BatchRowIterator.php',
'BatchRowUpdate' => __DIR__ . '/includes/utils/BatchRowUpdate.php',
'BatchRowWriter' => __DIR__ . '/includes/utils/BatchRowWriter.php',
'CssContentHandler' => __DIR__ . '/includes/content/CssContentHandler.php',
'CsvStatsOutput' => __DIR__ . '/maintenance/language/StatOutputs.php',
'CurlHttpRequest' => __DIR__ . '/includes/http/CurlHttpRequest.php',
+ 'CustomUppercaseCollation' => __DIR__ . '/includes/collation/CustomUppercaseCollation.php',
'DBAccessBase' => __DIR__ . '/includes/dao/DBAccessBase.php',
'DBAccessError' => __DIR__ . '/includes/libs/rdbms/exception/DBAccessError.php',
'DBAccessObjectUtils' => __DIR__ . '/includes/dao/DBAccessObjectUtils.php',
--- /dev/null
+<?php
+/**
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @since 1.30
+ *
+ * @file
+ */
+
+/**
+ * Uppercase collation with a hard-coded letter order, bound to the 'ba' language.
+ *
+ * This subclass only supplies the letter list and the Language object;
+ * everything else is inherited from CustomUppercaseCollation.
+ */
+class BashkirUppercaseCollation extends CustomUppercaseCollation {
+
+	public function __construct() {
+		// Letters listed in the order they are meant to sort.
+		$alphabet = [
+			'А', 'Б', 'В', 'Г', 'Ғ', 'Д', 'Ҙ',
+			'Е', 'Ё', 'Ж', 'З', 'И', 'Й', 'К',
+			'Ҡ', 'Л', 'М', 'Н', 'Ң', 'О', 'Ө',
+			'П', 'Р', 'С', 'Ҫ', 'Т', 'У', 'Ү',
+			'Ф', 'Х', 'Һ', 'Ц', 'Ч', 'Ш', 'Щ',
+			'Ъ', 'Ы', 'Ь', 'Э', 'Ә', 'Ю', 'Я',
+		];
+
+		parent::__construct( $alphabet, Language::factory( 'ba' ) );
+	}
+}
return new CollationEt;
case 'xx-uca-fa':
return new CollationFa;
+ case 'uppercase-ba':
+ return new BashkirUppercaseCollation;
default:
$match = [];
if ( preg_match( '/^uca-([A-Za-z@=-]+)$/', $collationName, $match ) ) {
--- /dev/null
+<?php
+/**
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @since 1.30
+ *
+ * @file
+ */
+
+/**
+ * Resort normal UTF-8 order by putting a bunch of stuff in PUA
+ *
+ * This takes a bunch of characters (the alphabet) that should
+ * be together, and converts them all to private-use-area characters
+ * so that they are all sorted in the right order relative to each
+ * other.
+ *
+ * This renumbers characters starting at U+F3000 (Chosen to avoid
+ * conflicts with other people using private use area)
+ *
+ * This does not support fancy things like secondary differences, etc.
+ *
+ * It is expected most people will subclass this and just override the
+ * constructor to hard-code an alphabet.
+ */
+class CustomUppercaseCollation extends NumericUppercaseCollation {
+
+	/** @var array $alphabet Sorted array of letters */
+	private $alphabet;
+
+	/** @var array $puaSubset List of private use area codes */
+	private $puaSubset;
+
+	/**
+	 * @note This assumes $alphabet does not contain U+F3000-U+F303F
+	 *
+	 * @param array $alphabet Sorted array of uppercase characters.
+	 * @param Language $lang What language for number sorting.
+	 * @throws UnexpectedValueException If $alphabet is empty or has 64 or more items.
+	 */
+	public function __construct( array $alphabet, Language $lang ) {
+		$len = count( $alphabet );
+
+		// It'd be trivial to extend this past 64, you'd just
+		// need a bit of bit-fiddling. Doesn't seem necessary right
+		// now.
+		if ( $len < 1 || $len >= 64 ) {
+			throw new UnexpectedValueException( "Alphabet must contain between 1 and 63 items" );
+		}
+		$this->alphabet = $alphabet;
+
+		// "\xF3\xB3\x80\x80" is the UTF-8 encoding of U+F3000; only the last
+		// byte changes, so the n-th letter maps to the n-th code point from there.
+		$this->puaSubset = [];
+		for ( $i = 0; $i < $len; $i++ ) {
+			$this->puaSubset[] = "\xF3\xB3\x80" . chr( $i + 128 );
+		}
+		parent::__construct( $lang );
+	}
+
+	/**
+	 * Replace every alphabet letter in the string with its private-use stand-in.
+	 *
+	 * @param string $string
+	 * @return string
+	 */
+	private function convertToPua( $string ) {
+		return str_replace( $this->alphabet, $this->puaSubset, $string );
+	}
+
+	/**
+	 * Build the parent's sort key, then swap alphabet letters for their
+	 * private-use stand-ins so they compare in the configured order.
+	 *
+	 * @param string $string
+	 * @return string
+	 */
+	public function getSortKey( $string ) {
+		return $this->convertToPua( parent::getSortKey( $string ) );
+	}
+
+	/**
+	 * Get the first letter of the string, as decided by the parent class,
+	 * after mapping any private-use stand-ins back to alphabet letters.
+	 *
+	 * @param string $string
+	 * @return string
+	 */
+	public function getFirstLetter( $string ) {
+		// In case a title has a PUA code in it, make it sort
+		// under the header for the character it would replace
+		// to avoid inconsistent behaviour. This class mostly
+		// assumes that people will not use PUA codes.
+		return parent::getFirstLetter(
+			str_replace( $this->puaSubset, $this->alphabet, $string )
+		);
+	}
+}
+++ /dev/null
-<?php
-
-/**
- * Class CollationTest
- * @covers Collation
- * @covers IcuCollation
- * @covers IdentityCollation
- * @covers UppercaseCollation
- */
-class CollationTest extends MediaWikiLangTestCase {
- protected function setUp() {
- parent::setUp();
- $this->checkPHPExtension( 'intl' );
- }
-
- /**
- * Test to make sure, that if you
- * have "X" and "XY", the binary
- * sortkey also has "X" being a
- * prefix of "XY". Our collation
- * code makes this assumption.
- *
- * @param string $lang Language code for collator
- * @param string $base Base string
- * @param string $extended String containing base as a prefix.
- *
- * @dataProvider prefixDataProvider
- */
- public function testIsPrefix( $lang, $base, $extended ) {
- $cp = Collator::create( $lang );
- $cp->setStrength( Collator::PRIMARY );
- $baseBin = $cp->getSortKey( $base );
- // Remove sortkey terminator
- $baseBin = rtrim( $baseBin, "\0" );
- $extendedBin = $cp->getSortKey( $extended );
- $this->assertStringStartsWith( $baseBin, $extendedBin, "$base is not a prefix of $extended" );
- }
-
- public static function prefixDataProvider() {
- return [
- [ 'en', 'A', 'AA' ],
- [ 'en', 'A', 'AAA' ],
- [ 'en', 'Д', 'ДЂ' ],
- [ 'en', 'Д', 'ДA' ],
- // 'Ʒ' should expand to 'Z ' (note space).
- [ 'fi', 'Z', 'Ʒ' ],
- // 'Þ' should expand to 'th'
- [ 'sv', 't', 'Þ' ],
- // Javanese is a limited use alphabet, so should have 3 bytes
- // per character, so do some tests with it.
- [ 'en', 'ꦲ', 'ꦲꦤ' ],
- [ 'en', 'ꦲ', 'ꦲД' ],
- [ 'en', 'A', 'Aꦲ' ],
- ];
- }
-
- /**
- * Opposite of testIsPrefix
- *
- * @dataProvider notPrefixDataProvider
- */
- public function testNotIsPrefix( $lang, $base, $extended ) {
- $cp = Collator::create( $lang );
- $cp->setStrength( Collator::PRIMARY );
- $baseBin = $cp->getSortKey( $base );
- // Remove sortkey terminator
- $baseBin = rtrim( $baseBin, "\0" );
- $extendedBin = $cp->getSortKey( $extended );
- $this->assertStringStartsNotWith( $baseBin, $extendedBin, "$base is a prefix of $extended" );
- }
-
- public static function notPrefixDataProvider() {
- return [
- [ 'en', 'A', 'B' ],
- [ 'en', 'AC', 'ABC' ],
- [ 'en', 'Z', 'Ʒ' ],
- [ 'en', 'A', 'ꦲ' ],
- ];
- }
-
- /**
- * Test correct first letter is fetched.
- *
- * @param string $collation Collation name (aka uca-en)
- * @param string $string String to get first letter of
- * @param string $firstLetter Expected first letter.
- *
- * @dataProvider firstLetterProvider
- */
- public function testGetFirstLetter( $collation, $string, $firstLetter ) {
- $col = Collation::factory( $collation );
- $this->assertEquals( $firstLetter, $col->getFirstLetter( $string ) );
- }
-
- function firstLetterProvider() {
- return [
- [ 'uppercase', 'Abc', 'A' ],
- [ 'uppercase', 'abc', 'A' ],
- [ 'identity', 'abc', 'a' ],
- [ 'uca-en', 'abc', 'A' ],
- [ 'uca-en', ' ', ' ' ],
- [ 'uca-en', 'Êveryone', 'E' ],
- [ 'uca-vi', 'Êveryone', 'Ê' ],
- // Make sure thorn is not a first letter.
- [ 'uca-sv', 'The', 'T' ],
- [ 'uca-sv', 'Å', 'Å' ],
- [ 'uca-hu', 'dzsdo', 'Dzs' ],
- [ 'uca-hu', 'dzdso', 'Dz' ],
- [ 'uca-hu', 'CSD', 'Cs' ],
- [ 'uca-root', 'CSD', 'C' ],
- [ 'uca-fi', 'Ǥ', 'G' ],
- [ 'uca-fi', 'Ŧ', 'T' ],
- [ 'uca-fi', 'Ʒ', 'Z' ],
- [ 'uca-fi', 'Ŋ', 'N' ],
- ];
- }
-}
--- /dev/null
+<?php
+
+/**
+ * Class CollationTest
+ * @covers Collation
+ * @covers IcuCollation
+ * @covers IdentityCollation
+ * @covers UppercaseCollation
+ * @covers CustomUppercaseCollation
+ * @covers BashkirUppercaseCollation
+ */
+class CollationTest extends MediaWikiLangTestCase {
+	protected function setUp() {
+		parent::setUp();
+		$this->checkPHPExtension( 'intl' );
+	}
+
+	/**
+	 * Test to make sure, that if you
+	 * have "X" and "XY", the binary
+	 * sortkey also has "X" being a
+	 * prefix of "XY". Our collation
+	 * code makes this assumption.
+	 *
+	 * @param string $lang Language code for collator
+	 * @param string $base Base string
+	 * @param string $extended String containing base as a prefix.
+	 *
+	 * @dataProvider prefixDataProvider
+	 */
+	public function testIsPrefix( $lang, $base, $extended ) {
+		$cp = Collator::create( $lang );
+		$cp->setStrength( Collator::PRIMARY );
+		$baseBin = $cp->getSortKey( $base );
+		// Remove sortkey terminator
+		$baseBin = rtrim( $baseBin, "\0" );
+		$extendedBin = $cp->getSortKey( $extended );
+		$this->assertStringStartsWith( $baseBin, $extendedBin, "$base is not a prefix of $extended" );
+	}
+
+	public static function prefixDataProvider() {
+		return [
+			[ 'en', 'A', 'AA' ],
+			[ 'en', 'A', 'AAA' ],
+			[ 'en', 'Д', 'ДЂ' ],
+			[ 'en', 'Д', 'ДA' ],
+			// 'Ʒ' should expand to 'Z ' (note space).
+			[ 'fi', 'Z', 'Ʒ' ],
+			// 'Þ' should expand to 'th'
+			[ 'sv', 't', 'Þ' ],
+			// Javanese is a limited use alphabet, so should have 3 bytes
+			// per character, so do some tests with it.
+			[ 'en', 'ꦲ', 'ꦲꦤ' ],
+			[ 'en', 'ꦲ', 'ꦲД' ],
+			[ 'en', 'A', 'Aꦲ' ],
+		];
+	}
+
+	/**
+	 * Opposite of testIsPrefix
+	 *
+	 * @param string $lang Language code for collator
+	 * @param string $base Base string
+	 * @param string $extended String whose sortkey must not start with the base sortkey.
+	 *
+	 * @dataProvider notPrefixDataProvider
+	 */
+	public function testNotIsPrefix( $lang, $base, $extended ) {
+		$cp = Collator::create( $lang );
+		$cp->setStrength( Collator::PRIMARY );
+		$baseBin = $cp->getSortKey( $base );
+		// Remove sortkey terminator
+		$baseBin = rtrim( $baseBin, "\0" );
+		$extendedBin = $cp->getSortKey( $extended );
+		$this->assertStringStartsNotWith( $baseBin, $extendedBin, "$base is a prefix of $extended" );
+	}
+
+	public static function notPrefixDataProvider() {
+		return [
+			[ 'en', 'A', 'B' ],
+			[ 'en', 'AC', 'ABC' ],
+			[ 'en', 'Z', 'Ʒ' ],
+			[ 'en', 'A', 'ꦲ' ],
+		];
+	}
+
+	/**
+	 * Test correct first letter is fetched.
+	 *
+	 * @param string $collation Collation name (aka uca-en)
+	 * @param string $string String to get first letter of
+	 * @param string $firstLetter Expected first letter.
+	 *
+	 * @dataProvider firstLetterProvider
+	 */
+	public function testGetFirstLetter( $collation, $string, $firstLetter ) {
+		$col = Collation::factory( $collation );
+		$this->assertEquals( $firstLetter, $col->getFirstLetter( $string ) );
+	}
+
+	// Declared public static like the other providers in this class.
+	public static function firstLetterProvider() {
+		return [
+			[ 'uppercase', 'Abc', 'A' ],
+			[ 'uppercase', 'abc', 'A' ],
+			[ 'identity', 'abc', 'a' ],
+			[ 'uca-en', 'abc', 'A' ],
+			[ 'uca-en', ' ', ' ' ],
+			[ 'uca-en', 'Êveryone', 'E' ],
+			[ 'uca-vi', 'Êveryone', 'Ê' ],
+			// Make sure thorn is not a first letter.
+			[ 'uca-sv', 'The', 'T' ],
+			[ 'uca-sv', 'Å', 'Å' ],
+			[ 'uca-hu', 'dzsdo', 'Dzs' ],
+			[ 'uca-hu', 'dzdso', 'Dz' ],
+			[ 'uca-hu', 'CSD', 'Cs' ],
+			[ 'uca-root', 'CSD', 'C' ],
+			[ 'uca-fi', 'Ǥ', 'G' ],
+			[ 'uca-fi', 'Ŧ', 'T' ],
+			[ 'uca-fi', 'Ʒ', 'Z' ],
+			[ 'uca-fi', 'Ŋ', 'N' ],
+			[ 'uppercase-ba', 'в', 'В' ],
+		];
+	}
+}
--- /dev/null
+<?php
+
+/**
+ * Tests for CustomUppercaseCollation using a deliberately reversed
+ * three-letter alphabet (D, C, B), so reordering is observable.
+ *
+ * @covers CustomUppercaseCollation
+ */
+class CustomUppercaseCollationTest extends MediaWikiTestCase {
+
+	/** @var CustomUppercaseCollation Collation under test, rebuilt before each test */
+	private $collation;
+
+	public function setUp() {
+		parent::setUp();
+
+		$this->collation = new CustomUppercaseCollation( [
+			'D',
+			'C',
+			'B'
+		], Language::factory( 'en' ) );
+	}
+
+	/**
+	 * The sort key of $first must compare strictly lower than that of $second.
+	 *
+	 * @param string $first String expected to sort earlier
+	 * @param string $second String expected to sort later
+	 * @param string $msg Assertion failure message
+	 *
+	 * @dataProvider providerOrder
+	 */
+	public function testOrder( $first, $second, $msg ) {
+		$sortkey1 = $this->collation->getSortKey( $first );
+		$sortkey2 = $this->collation->getSortKey( $second );
+
+		$this->assertTrue( strcmp( $sortkey1, $sortkey2 ) < 0, $msg );
+	}
+
+	public static function providerOrder() {
+		return [
+			[ 'X', 'Z', 'Maintain order of unrearranged' ],
+			[ 'D', 'C', 'Actually resorts' ],
+			[ 'D', 'B', 'resort test 2' ],
+			[ 'Adobe', 'Abode', 'not first letter' ],
+			[ '💩 ', 'C', 'Test relocated to end' ],
+			[ 'c', 'b', 'lowercase' ],
+			[ 'x', 'z', 'lowercase original' ],
+			[ 'C50D', 'C100', 'Numbers' ]
+		];
+	}
+
+	/**
+	 * @param string $string Input string
+	 * @param string $first Expected first letter
+	 *
+	 * @dataProvider provideGetFirstLetter
+	 */
+	public function testGetFirstLetter( $string, $first ) {
+		// Expected value goes first so failure messages read the right way round.
+		$this->assertSame( $first, $this->collation->getFirstLetter( $string ) );
+	}
+
+	public static function provideGetFirstLetter() {
+		return [
+			[ 'Do', 'D' ],
+			[ 'do', 'D' ],
+			[ 'Ao', 'A' ],
+			[ 'afdsa', 'A' ],
+			// U+F3000..U+F3002 stand in for D, C, B respectively.
+			[ "\xF3\xB3\x80\x80Foo", 'D' ],
+			[ "\xF3\xB3\x80\x81Foo", 'C' ],
+			[ "\xF3\xB3\x80\x82Foo", 'B' ],
+			// U+F3003 is beyond the three-letter alphabet and is left alone.
+			[ "\xF3\xB3\x80\x83Foo", "\xF3\xB3\x80\x83" ],
+		];
+	}
+}