* (bug 65) Fix broken interwiki link encoding on Latin-1 wikis; force to UTF-8
author Brion Vibber <brion@users.mediawiki.org>
Sun, 27 Feb 2005 04:11:41 +0000 (04:11 +0000)
committer Brion Vibber <brion@users.mediawiki.org>
Sun, 27 Feb 2005 04:11:41 +0000 (04:11 +0000)
includes/Title.php

index a9163a2..850ad4e 100644 (file)
@@ -1225,6 +1225,14 @@ class Title {
                        return false;
                }
                
+               if( $wgUseLatin1 && $this->mInterwiki != '' ) {
+                       # On a Latin-1 wiki, numbered character entities may have
+                       # left us with a mix of 8-bit and UTF-8 characters, and
+                       # some of those might be Windows-1252 special chars.
+                       # Normalize interwikis to pure UTF-8.
+                       $t = Title::mergeLatin1Utf8( $t );
+               }
+
                # Fill fields
                $this->mDbkeyform = $t;
                $this->mUrlform = wfUrlencode( $t );
@@ -1908,5 +1916,84 @@ class Title {
                        && $this->getDbkey() == $title->getDbkey();
        }
 
+       /**
+        * Convert Windows-1252 extended codepoints to their real Unicode points.
+        * @param int $codepoint
+        * @return int
+        * @access private
+        */
+       function cp1252toUnicode( $codepoint ) {
+               # Lookup table for the 0x80-0x9F range, where Windows-1252
+               # places printable characters. Any value without an entry
+               # here is handed back unchanged.
+               # Source of the table:
+               # http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
+               static $c1Remap = array(
+                       # Positions with no character assigned in CP1252
+                       0x81 => UNICODE_REPLACEMENT,
+                       0x8D => UNICODE_REPLACEMENT,
+                       0x8F => UNICODE_REPLACEMENT,
+                       0x90 => UNICODE_REPLACEMENT,
+                       0x9D => UNICODE_REPLACEMENT,
+                       # Positions with a real Unicode counterpart
+                       0x80 => 0x20AC, # EURO SIGN
+                       0x82 => 0x201A, # SINGLE LOW-9 QUOTATION MARK
+                       0x83 => 0x0192, # LATIN SMALL LETTER F WITH HOOK
+                       0x84 => 0x201E, # DOUBLE LOW-9 QUOTATION MARK
+                       0x85 => 0x2026, # HORIZONTAL ELLIPSIS
+                       0x86 => 0x2020, # DAGGER
+                       0x87 => 0x2021, # DOUBLE DAGGER
+                       0x88 => 0x02C6, # MODIFIER LETTER CIRCUMFLEX ACCENT
+                       0x89 => 0x2030, # PER MILLE SIGN
+                       0x8A => 0x0160, # LATIN CAPITAL LETTER S WITH CARON
+                       0x8B => 0x2039, # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+                       0x8C => 0x0152, # LATIN CAPITAL LIGATURE OE
+                       0x8E => 0x017D, # LATIN CAPITAL LETTER Z WITH CARON
+                       0x91 => 0x2018, # LEFT SINGLE QUOTATION MARK
+                       0x92 => 0x2019, # RIGHT SINGLE QUOTATION MARK
+                       0x93 => 0x201C, # LEFT DOUBLE QUOTATION MARK
+                       0x94 => 0x201D, # RIGHT DOUBLE QUOTATION MARK
+                       0x95 => 0x2022, # BULLET
+                       0x96 => 0x2013, # EN DASH
+                       0x97 => 0x2014, # EM DASH
+                       0x98 => 0x02DC, # SMALL TILDE
+                       0x99 => 0x2122, # TRADE MARK SIGN
+                       0x9A => 0x0161, # LATIN SMALL LETTER S WITH CARON
+                       0x9B => 0x203A, # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+                       0x9C => 0x0153, # LATIN SMALL LIGATURE OE
+                       0x9E => 0x017E, # LATIN SMALL LETTER Z WITH CARON
+                       0x9F => 0x0178, # LATIN CAPITAL LETTER Y WITH DIAERESIS
+                       );
+
+               # No table entry: the value passes through as-is.
+               if( !isset( $c1Remap[$codepoint] ) ) {
+                       return $codepoint;
+               }
+               return $c1Remap[$codepoint];
+       }
+       
+       /**
+        * HACKHACKHACK
+        * Take a string containing a mix of CP1252 characters and UTF-8 and try
+        * to convert it completely to UTF-8.
+        *
+        * @param string $string
+        * @return string
+        * @access private
+        */
+       function mergeLatin1Utf8( $string ) {
+               # create_function() registers a new global function on every
+               # call and never releases it, so build both callbacks once and
+               # reuse them for the rest of the request.
+               static $restoreUtf8 = null;
+               static $fixCp1252 = null;
+               if( is_null( $restoreUtf8 ) ) {
+                       # Undo one level of UTF-8 encoding on a matched run.
+                       $restoreUtf8 = create_function( '$matches',
+                               'return utf8_decode( $matches[1] );' );
+                       # Re-encode a single C1-range character as the Unicode
+                       # character that CP1252 assigns to that byte value.
+                       $fixCp1252 = create_function( '$matches',
+                               'return codepointToUtf8( Title::cp1252toUnicode( utf8ToCodepoint( $matches[1] ) ) );' );
+               }
+
+               # Step 1: up-convert everything from 8-bit to UTF-8. Each input
+               # byte becomes the code point of the same value, so the string
+               # is now safe to run through /u patterns, and \xNN in those
+               # patterns stands for "original byte NN".
+               $upconverted = utf8_encode( $string );
+
+               # Step 2: byte runs that were already UTF-8 have just been
+               # double-encoded; filter them back to their original form.
+               # Only *well-formed* UTF-8 is accepted here (no overlong forms,
+               # no surrogates, nothing above U+10FFFF). A looser "lead byte
+               # plus continuation bytes" test would also collapse 8-bit text
+               # such as C0 80 back to raw bytes, leaving invalid UTF-8 in the
+               # result and making the /u replacement in step 3 fail to match.
+               $merged = preg_replace_callback(
+                       '/((?:[\xc2-\xdf][\x80-\xbf]
+                                |\xe0[\xa0-\xbf][\x80-\xbf]
+                                |[\xe1-\xec\xee\xef][\x80-\xbf]{2}
+                                |\xed[\x80-\x9f][\x80-\xbf]
+                                |\xf0[\x90-\xbf][\x80-\xbf]{2}
+                                |[\xf1-\xf3][\x80-\xbf]{3}
+                                |\xf4[\x80-\x8f][\x80-\xbf]{2})+)/ux',
+                       $restoreUtf8,
+                       $upconverted );
+
+               # Step 3: Windows CP1252 extends ISO-8859-1 by putting extra
+               # characters into the high control chars area. Convert whatever
+               # is left in U+0080..U+009F to its proper Unicode counterpart.
+               return preg_replace_callback(
+                       '/([\x80-\x9f])/u',
+                       $fixCp1252,
+                       $merged );
+       }
+
 }
 ?>