From: Bartosz Dziewoński Date: Sun, 20 Jul 2014 18:34:34 +0000 (+0200) Subject: Collation: Workaround for incorrect collation of Estonian X-Git-Tag: 1.31.0-rc.0~14855^2 X-Git-Url: http://git.cyclocoop.org/%7B%24admin_url%7Dcompta/comptes/journal.php?a=commitdiff_plain;h=638801374b45cf1185e299b5206f7001a22d475d;p=lhc%2Fweb%2Fwiklou.git Collation: Workaround for incorrect collation of Estonian 'W' and 'V' should not be considered the same letter for the purposes of collation in modern Estonian. We work around this by replacing 'W' and 'w' with 'ᴡ' U+1D21 'LATIN LETTER SMALL CAPITAL W' for sortkey generation, which is collated like 'W' and is not tailored to have the same primary weight as 'V' in Estonian. Bug: 54168 Change-Id: I3e8031b9d1dc18fdb7595fb9fd23e5f930106fa4 --- diff --git a/includes/AutoLoader.php b/includes/AutoLoader.php index 127f2cd4d3..9b1dfc67ef 100644 --- a/includes/AutoLoader.php +++ b/includes/AutoLoader.php @@ -45,6 +45,7 @@ $wgAutoloadLocalClasses = array( 'ChannelFeed' => 'includes/Feed.php', 'Collation' => 'includes/Collation.php', 'CollationCkb' => 'includes/Collation.php', + 'CollationEt' => 'includes/Collation.php', 'ConcatenatedGzipHistoryBlob' => 'includes/HistoryBlob.php', 'Cookie' => 'includes/Cookie.php', 'CookieJar' => 'includes/Cookie.php', diff --git a/includes/Collation.php b/includes/Collation.php index 71adb096cb..bcb0a5682a 100644 --- a/includes/Collation.php +++ b/includes/Collation.php @@ -49,6 +49,8 @@ abstract class Collation { return new IcuCollation( 'root' ); case 'xx-uca-ckb': return new CollationCkb; + case 'xx-uca-et': + return new CollationEt; default: $match = array(); if ( preg_match( '/^uca-([a-z@=-]+)$/', $collationName, $match ) ) { @@ -253,7 +255,7 @@ class IcuCollation extends Collation { 'el' => array(), 'eo' => array( "Ĉ", "Ĝ", "Ĥ", "Ĵ", "Ŝ", "Ŭ" ), 'es' => array( "Ñ" ), - 'et' => array( "Š", "Ž", "Õ", "Ä", "Ö", "Ü" ), + 'et' => array( "Š", "Ž", "Õ", "Ä", "Ö", "Ü", "W" ), // added W for CollationEt 
(xx-uca-et) 'eu' => array( "Ñ" ), 'fo' => array( "Á", "Ð", "Í", "Ó", "Ú", "Ý", "Æ", "Ø", "Å" ), 'fur' => array( "À", "Á", "Â", "È", "Ì", "Ò", "Ù" ), @@ -597,3 +599,42 @@ class CollationCkb extends IcuCollation { $this->digitTransformLanguage = Language::factory( 'ckb' ); } } + +/** + * Workaround for incorrect collation of Estonian language ('et') in ICU (bug 54168). + * + * 'W' and 'V' should not be considered the same letter for the purposes of collation in modern + * Estonian. We work around this by replacing 'W' and 'w' with 'ᴡ' U+1D21 'LATIN LETTER SMALL + * CAPITAL W' for sortkey generation, which is collated like 'W' and is not tailored to have the + * same primary weight as 'V' in Estonian. + */ +class CollationEt extends IcuCollation { + function __construct() { + parent::__construct( 'et' ); + } + + private static function mangle( $string ) { + return str_replace( + array( 'w', 'W' ), + 'ᴡ', // U+1D21 'LATIN LETTER SMALL CAPITAL W' + $string + ); + } + + private static function unmangle( $string ) { + // Casing data is lost… + return str_replace( + 'ᴡ', // U+1D21 'LATIN LETTER SMALL CAPITAL W' + 'W', + $string + ); + } + + function getSortKey( $string ) { + return parent::getSortKey( self::mangle( $string ) ); + } + + function getFirstLetter( $string ) { + return self::unmangle( parent::getFirstLetter( self::mangle( $string ) ) ); + } +}