From: Brion Vibber Date: Sun, 27 Feb 2005 04:11:41 +0000 (+0000) Subject: * (bug 65) Fix broken interwiki link encoding on Latin-1 wikis; force to UTF-8 X-Git-Tag: 1.5.0alpha1~692 X-Git-Url: https://git.cyclocoop.org/%7B%24www_url%7Dadmin/compta/exercices/journal.php?a=commitdiff_plain;h=f2543cafd7fc7dcbb68713b31c0d7658ca07dd5c;p=lhc%2Fweb%2Fwiklou.git * (bug 65) Fix broken interwiki link encoding on Latin-1 wikis; force to UTF-8 --- diff --git a/includes/Title.php b/includes/Title.php index a9163a2fe9..850ad4e49a 100644 --- a/includes/Title.php +++ b/includes/Title.php @@ -1225,6 +1225,14 @@ class Title { return false; } + if( $wgUseLatin1 && $this->mInterwiki != '' ) { + # On a Latin-1 wiki, numbered character entities may have + # left us with a mix of 8-bit and UTF-8 characters, and + # some of those might be Windows-1252 special chars. + # Normalize interwikis to pure UTF-8. + $t = Title::mergeLatin1Utf8( $t ); + } + # Fill fields $this->mDbkeyform = $t; $this->mUrlform = wfUrlencode( $t ); @@ -1908,5 +1916,84 @@ class Title { && $this->getDbkey() == $title->getDbkey(); } + /** + * Convert Windows-1252 extended codepoints to their real Unicode points. 
+	 * Any value that has no entry in the table is handed back unchanged.
+	 *
+	 * @param int $codepoint Value to translate
+	 * @return int Mapped codepoint, or $codepoint itself when there is no mapping
+	 * @access private
+	 */
+	function cp1252toUnicode( $codepoint ) {
+		# Table taken from:
+		# http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
+		# Slots without a character of their own point at UNICODE_REPLACEMENT.
+		static $remap = array(
+			0x80 => 0x20AC, # EURO SIGN
+			0x81 => UNICODE_REPLACEMENT,
+			0x82 => 0x201A, # SINGLE LOW-9 QUOTATION MARK
+			0x83 => 0x0192, # LATIN SMALL LETTER F WITH HOOK
+			0x84 => 0x201E, # DOUBLE LOW-9 QUOTATION MARK
+			0x85 => 0x2026, # HORIZONTAL ELLIPSIS
+			0x86 => 0x2020, # DAGGER
+			0x87 => 0x2021, # DOUBLE DAGGER
+			0x88 => 0x02C6, # MODIFIER LETTER CIRCUMFLEX ACCENT
+			0x89 => 0x2030, # PER MILLE SIGN
+			0x8A => 0x0160, # LATIN CAPITAL LETTER S WITH CARON
+			0x8B => 0x2039, # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+			0x8C => 0x0152, # LATIN CAPITAL LIGATURE OE
+			0x8D => UNICODE_REPLACEMENT,
+			0x8E => 0x017D, # LATIN CAPITAL LETTER Z WITH CARON
+			0x8F => UNICODE_REPLACEMENT,
+			0x90 => UNICODE_REPLACEMENT,
+			0x91 => 0x2018, # LEFT SINGLE QUOTATION MARK
+			0x92 => 0x2019, # RIGHT SINGLE QUOTATION MARK
+			0x93 => 0x201C, # LEFT DOUBLE QUOTATION MARK
+			0x94 => 0x201D, # RIGHT DOUBLE QUOTATION MARK
+			0x95 => 0x2022, # BULLET
+			0x96 => 0x2013, # EN DASH
+			0x97 => 0x2014, # EM DASH
+			0x98 => 0x02DC, # SMALL TILDE
+			0x99 => 0x2122, # TRADE MARK SIGN
+			0x9A => 0x0161, # LATIN SMALL LETTER S WITH CARON
+			0x9B => 0x203A, # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+			0x9C => 0x0153, # LATIN SMALL LIGATURE OE
+			0x9D => UNICODE_REPLACEMENT,
+			0x9E => 0x017E, # LATIN SMALL LETTER Z WITH CARON
+			0x9F => 0x0178, # LATIN CAPITAL LETTER Y WITH DIAERESIS
+		);
+
+		if( !isset( $remap[$codepoint] ) ) {
+			# No special mapping for this value; pass it straight through.
+			return $codepoint;
+		}
+		return $remap[$codepoint];
+	}
+
+	/**
+	 * HACKHACKHACK
+	 * Take a string containing a mix of CP1252 characters and UTF-8 and try
+	 * to convert it completely to UTF-8.
+	 *
+	 * The whole input is first up-converted with utf8_encode(), which treats
+	 * every byte as an 8-bit character. Runs that were already well-formed
+	 * UTF-8 show up as double-converted text and are folded back with
+	 * utf8_decode(). Whatever then remains in U+0080..U+009F is translated
+	 * through Title::cp1252toUnicode().
+	 *
+	 * Byte runs that only resemble UTF-8 (overlong forms, surrogates, lead
+	 * bytes above F4) are deliberately left as 8-bit characters: folding
+	 * them back would produce malformed UTF-8 and make the final /u pass fail.
+	 *
+	 * @param string $string Text mixing 8-bit and UTF-8 characters
+	 * @return string The text as UTF-8
+	 * @access private
+	 */
+	function mergeLatin1Utf8( $string ) {
+		# create_function() compiles a brand new function each time it runs
+		# and that function is never released, so build both callbacks once
+		# and reuse them on later calls.
+		static $undoDoubleEncoding = null;
+		static $fixCp1252 = null;
+		if( is_null( $undoDoubleEncoding ) ) {
+			$undoDoubleEncoding = create_function( '$matches',
+				'return utf8_decode( $matches[1] );' );
+			$fixCp1252 = create_function( '$matches',
+				'return codepointToUtf8( Title::cp1252toUnicode( utf8ToCodepoint( $matches[1] ) ) );' );
+		}
+
+		# Up-convert everything from 8-bit to UTF-8, then filter the
+		# well-formed UTF-8 back from the double-converted form.
+		# The alternatives below are exactly the well-formed UTF-8 byte
+		# sequences, written as the characters they become after
+		# up-conversion.
+		$upconverted = utf8_encode( $string );
+		$merged = preg_replace_callback(
+			'/((?:[\xc2-\xdf][\x80-\xbf]
+				|\xe0[\xa0-\xbf][\x80-\xbf]
+				|[\xe1-\xec\xee\xef][\x80-\xbf]{2}
+				|\xed[\x80-\x9f][\x80-\xbf]
+				|\xf0[\x90-\xbf][\x80-\xbf]{2}
+				|[\xf1-\xf3][\x80-\xbf]{3}
+				|\xf4[\x80-\x8f][\x80-\xbf]{2})+)/ux',
+			$undoDoubleEncoding,
+			$upconverted );
+		if( !is_string( $merged ) ) {
+			# PCRE reported an error; the up-converted text is still usable.
+			$merged = $upconverted;
+		}
+
+		# Windows CP1252 extends ISO-8859-1 by putting extra characters
+		# into the high control chars area. We have to convert these
+		# to their proper Unicode counterparts.
+		$fixed = preg_replace_callback(
+			'/([\x80-\x9f])/u',
+			$fixCp1252,
+			$merged );
+
+		# Never hand NULL back to the caller if the replace step failed.
+		return is_string( $fixed ) ? $fixed : $merged;
+	}
+
 }
 ?>