Start removing the Latin-1 hacks. We're going pure UTF-8 for 1.5...
[lhc/web/wiklou.git] / includes / Title.php
index fe03907..e6454a3 100644 (file)
@@ -1113,8 +1113,7 @@ class Title {
                        return false;
                }
                
-               global $wgUseLatin1;
-               if( !$wgUseLatin1 && false !== strpos( $t, UTF8_REPLACEMENT ) ) {
+               if( false !== strpos( $t, UTF8_REPLACEMENT ) ) {
                        # Contained illegal UTF-8 sequences or forbidden Unicode chars.
                        wfProfileOut( $fname );
                        return false;
@@ -1244,14 +1243,6 @@ class Title {
                        return false;
                }
                
-               if( $wgUseLatin1 && $this->mInterwiki != '' ) {
-                       # On a Latin-1 wiki, numbered character entities may have
-                       # left us with a mix of 8-bit and UTF-8 characters, and
-                       # some of those might be Windows-1252 special chars.
-                       # Normalize interwikis to pure UTF-8.
-                       $t = Title::mergeLatin1Utf8( $t );
-               }
-
                # Fill fields
                $this->mDbkeyform = $t;
                $this->mUrlform = wfUrlencode( $t );
@@ -1870,84 +1861,5 @@ class Title {
                        && $this->getDbkey() == $title->getDbkey();
        }
 
-       /**
-        * Convert Windows-1252 extended codepoints to their real Unicode points.
-        * @param int $codepoint
-        * @return int
-        * @access private
-        */
-       function cp1252toUnicode( $codepoint ) {
-               # Mappings from:
-               # http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
-               static $cp1252 = array(
-                       0x80 => 0x20AC, #EURO SIGN
-                       0x81 => UNICODE_REPLACEMENT,
-                       0x82 => 0x201A, #SINGLE LOW-9 QUOTATION MARK
-                       0x83 => 0x0192, #LATIN SMALL LETTER F WITH HOOK
-                       0x84 => 0x201E, #DOUBLE LOW-9 QUOTATION MARK
-                       0x85 => 0x2026, #HORIZONTAL ELLIPSIS
-                       0x86 => 0x2020, #DAGGER
-                       0x87 => 0x2021, #DOUBLE DAGGER
-                       0x88 => 0x02C6, #MODIFIER LETTER CIRCUMFLEX ACCENT
-                       0x89 => 0x2030, #PER MILLE SIGN
-                       0x8A => 0x0160, #LATIN CAPITAL LETTER S WITH CARON
-                       0x8B => 0x2039, #SINGLE LEFT-POINTING ANGLE QUOTATION MARK
-                       0x8C => 0x0152, #LATIN CAPITAL LIGATURE OE
-                       0x8D => UNICODE_REPLACEMENT,
-                       0x8E => 0x017D, #LATIN CAPITAL LETTER Z WITH CARON
-                       0x8F => UNICODE_REPLACEMENT,
-                       0x90 => UNICODE_REPLACEMENT,
-                       0x91 => 0x2018, #LEFT SINGLE QUOTATION MARK
-                       0x92 => 0x2019, #RIGHT SINGLE QUOTATION MARK
-                       0x93 => 0x201C, #LEFT DOUBLE QUOTATION MARK
-                       0x94 => 0x201D, #RIGHT DOUBLE QUOTATION MARK
-                       0x95 => 0x2022, #BULLET
-                       0x96 => 0x2013, #EN DASH
-                       0x97 => 0x2014, #EM DASH
-                       0x98 => 0x02DC, #SMALL TILDE
-                       0x99 => 0x2122, #TRADE MARK SIGN
-                       0x9A => 0x0161, #LATIN SMALL LETTER S WITH CARON
-                       0x9B => 0x203A, #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
-                       0x9C => 0x0153, #LATIN SMALL LIGATURE OE
-                       0x9D => UNICODE_REPLACEMENT,
-                       0x9E => 0x017E, #LATIN SMALL LETTER Z WITH CARON
-                       0x9F => 0x0178, #LATIN CAPITAL LETTER Y WITH DIAERESIS
-                       );
-               return isset( $cp1252[$codepoint] )
-                       ? $cp1252[$codepoint]
-                       : $codepoint;
-       }
-       
-       /**
-        * HACKHACKHACK
-        * Take a string containing a mix of CP1252 characters and UTF-8 and try
-        * to convert it completely to UTF-8.
-        *
-        * @param string $string
-        * @return string
-        * @access private
-        */
-       function mergeLatin1Utf8( $string ) {
-               return preg_replace_callback(
-                       # Windows CP1252 extends ISO-8859-1 by putting extra characters
-                       # into the high control chars area. We have to convert these
-                       # to their proper Unicode counterparts.
-                       '/([\x80-\x9f])/u',
-                       create_function( '$matches',
-                               'return codepointToUtf8(
-                                       Title::cp1252toUnicode(
-                                               utf8ToCodepoint( $matches[1] ) ) );' ),
-                       preg_replace_callback(
-                               # Up-convert everything from 8-bit to UTF-8, then
-                               # filter the valid-looking UTF-8 back from the
-                               # double-converted form.
-                               '/((?:[\xc0-\xdf][\x80-\xbf]
-                                        |[\xe0-\xef][\x80-\xbf]{2}
-                                        |[\xf0-\xf7][\x80-\xbf]{3})+)/ux',
-                               create_function( '$matches',
-                                       'return utf8_decode( $matches[1] );' ),
-                               utf8_encode( $string ) ) );
-       }
-
 }
 ?>