From 638801374b45cf1185e299b5206f7001a22d475d Mon Sep 17 00:00:00 2001 From: =?utf8?q?Bartosz=20Dziewo=C5=84ski?= Date: Sun, 20 Jul 2014 20:34:34 +0200 Subject: [PATCH] Collation: Workaround for incorrect collation of Estonian MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit 'W' and 'V' should not be considered the same letter for the purposes of collation in modern Estonian. We work around this by replacing 'W' and 'w' with 'ᴡ' U+1D21 'LATIN LETTER SMALL CAPITAL W' for sortkey generation, which is collated like 'W' and is not tailored to have the same primary weight as 'V' in Estonian. Bug: 54168 Change-Id: I3e8031b9d1dc18fdb7595fb9fd23e5f930106fa4 --- includes/AutoLoader.php | 1 + includes/Collation.php | 43 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/includes/AutoLoader.php b/includes/AutoLoader.php index 127f2cd4d3..9b1dfc67ef 100644 --- a/includes/AutoLoader.php +++ b/includes/AutoLoader.php @@ -45,6 +45,7 @@ $wgAutoloadLocalClasses = array( 'ChannelFeed' => 'includes/Feed.php', 'Collation' => 'includes/Collation.php', 'CollationCkb' => 'includes/Collation.php', + 'CollationEt' => 'includes/Collation.php', 'ConcatenatedGzipHistoryBlob' => 'includes/HistoryBlob.php', 'Cookie' => 'includes/Cookie.php', 'CookieJar' => 'includes/Cookie.php', diff --git a/includes/Collation.php b/includes/Collation.php index 71adb096cb..bcb0a5682a 100644 --- a/includes/Collation.php +++ b/includes/Collation.php @@ -49,6 +49,8 @@ abstract class Collation { return new IcuCollation( 'root' ); case 'xx-uca-ckb': return new CollationCkb; + case 'xx-uca-et': + return new CollationEt; default: $match = array(); if ( preg_match( '/^uca-([a-z@=-]+)$/', $collationName, $match ) ) { @@ -253,7 +255,7 @@ class IcuCollation extends Collation { 'el' => array(), 'eo' => array( "Ĉ", "Ĝ", "Ĥ", "Ĵ", "Ŝ", "Ŭ" ), 'es' => array( "Ñ" ), - 'et' => array( "Š", "Ž", "Õ", "Ä", "Ö", "Ü" ), + 'et' => array( "Š", "Ž", "Õ", "Ä", "Ö", "Ü", "W" ), // added W for CollationEt (xx-uca-et) 'eu' => array( "Ñ" ), 'fo' => array( "Á", "Ð", "Í", "Ó", "Ú", "Ý", "Æ", "Ø", "Å" ), 'fur' => array( "À", "Á", "Â", "È", "Ì", "Ò", "Ù" ), @@ -597,3 +599,42 @@ class CollationCkb extends IcuCollation { $this->digitTransformLanguage = Language::factory( 'ckb' ); } } + +/** + * Workaround for incorrect collation of Estonian language ('et') in ICU (bug 54168). + * + * 'W' and 'V' should not be considered the same letter for the purposes of collation in modern + * Estonian. We work around this by replacing 'W' and 'w' with 'ᴡ' U+1D21 'LATIN LETTER SMALL + * CAPITAL W' for sortkey generation, which is collated like 'W' and is not tailored to have the + * same primary weight as 'V' in Estonian. + */ +class CollationEt extends IcuCollation { + function __construct() { + parent::__construct( 'et' ); + } + + private static function mangle( $string ) { + return str_replace( + array( 'w', 'W' ), + 'ᴡ', // U+1D21 'LATIN LETTER SMALL CAPITAL W' + $string + ); + } + + private static function unmangle( $string ) { + // Casing data is lost… + return str_replace( + 'ᴡ', // U+1D21 'LATIN LETTER SMALL CAPITAL W' + 'W', + $string + ); + } + + function getSortKey( $string ) { + return parent::getSortKey( self::mangle( $string ) ); + } + + function getFirstLetter( $string ) { + return self::unmangle( parent::getFirstLetter( self::mangle( $string ) ) ); + } +} -- 2.20.1