return false;
}
- global $wgUseLatin1;
- if( !$wgUseLatin1 && false !== strpos( $t, UTF8_REPLACEMENT ) ) {
+ if( false !== strpos( $t, UTF8_REPLACEMENT ) ) {
# Contained illegal UTF-8 sequences or forbidden Unicode chars.
wfProfileOut( $fname );
return false;
return false;
}
- if( $wgUseLatin1 && $this->mInterwiki != '' ) {
- # On a Latin-1 wiki, numbered character entities may have
- # left us with a mix of 8-bit and UTF-8 characters, and
- # some of those might be Windows-1252 special chars.
- # Normalize interwikis to pure UTF-8.
- $t = Title::mergeLatin1Utf8( $t );
- }
-
# Fill fields
$this->mDbkeyform = $t;
$this->mUrlform = wfUrlencode( $t );
&& $this->getDbkey() == $title->getDbkey();
}
- /**
- * Convert Windows-1252 extended codepoints to their real Unicode points.
- *
- * The value is looked up in a static table whose keys run from 0x80
- * through 0x9F. Five of those keys (0x81, 0x8D, 0x8F, 0x90 and 0x9D)
- * map to UNICODE_REPLACEMENT instead of a named character. A value that
- * has no entry in the table is handed back unchanged.
- *
- * @param int $codepoint Value to translate.
- * @return int The table entry for $codepoint, or $codepoint itself when
- *             the table has no entry for it.
- * @access private
- */
- function cp1252toUnicode( $codepoint ) {
- # Mappings from:
- # http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
- static $cp1252 = array(
- 0x80 => 0x20AC, #EURO SIGN
- 0x81 => UNICODE_REPLACEMENT,
- 0x82 => 0x201A, #SINGLE LOW-9 QUOTATION MARK
- 0x83 => 0x0192, #LATIN SMALL LETTER F WITH HOOK
- 0x84 => 0x201E, #DOUBLE LOW-9 QUOTATION MARK
- 0x85 => 0x2026, #HORIZONTAL ELLIPSIS
- 0x86 => 0x2020, #DAGGER
- 0x87 => 0x2021, #DOUBLE DAGGER
- 0x88 => 0x02C6, #MODIFIER LETTER CIRCUMFLEX ACCENT
- 0x89 => 0x2030, #PER MILLE SIGN
- 0x8A => 0x0160, #LATIN CAPITAL LETTER S WITH CARON
- 0x8B => 0x2039, #SINGLE LEFT-POINTING ANGLE QUOTATION MARK
- 0x8C => 0x0152, #LATIN CAPITAL LIGATURE OE
- 0x8D => UNICODE_REPLACEMENT,
- 0x8E => 0x017D, #LATIN CAPITAL LETTER Z WITH CARON
- 0x8F => UNICODE_REPLACEMENT,
- 0x90 => UNICODE_REPLACEMENT,
- 0x91 => 0x2018, #LEFT SINGLE QUOTATION MARK
- 0x92 => 0x2019, #RIGHT SINGLE QUOTATION MARK
- 0x93 => 0x201C, #LEFT DOUBLE QUOTATION MARK
- 0x94 => 0x201D, #RIGHT DOUBLE QUOTATION MARK
- 0x95 => 0x2022, #BULLET
- 0x96 => 0x2013, #EN DASH
- 0x97 => 0x2014, #EM DASH
- 0x98 => 0x02DC, #SMALL TILDE
- 0x99 => 0x2122, #TRADE MARK SIGN
- 0x9A => 0x0161, #LATIN SMALL LETTER S WITH CARON
- 0x9B => 0x203A, #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
- 0x9C => 0x0153, #LATIN SMALL LIGATURE OE
- 0x9D => UNICODE_REPLACEMENT,
- 0x9E => 0x017E, #LATIN SMALL LETTER Z WITH CARON
- 0x9F => 0x0178, #LATIN CAPITAL LETTER Y WITH DIAERESIS
- );
- return isset( $cp1252[$codepoint] )
- ? $cp1252[$codepoint]
- : $codepoint;
- }
-
- /**
- * HACKHACKHACK
- * Take a string containing a mix of CP1252 characters and UTF-8 and try
- * to convert it completely to UTF-8.
- *
- * The work is done by three nested calls, evaluated innermost first:
- *  1. utf8_encode() is applied to the whole input string.
- *  2. Runs shaped like 2-, 3- or 4-unit UTF-8 sequences (a lead unit in
- *     0xC0-0xDF, 0xE0-0xEF or 0xF0-0xF7 followed by one, two or three
- *     units in 0x80-0xBF) are passed through utf8_decode(), undoing
- *     step 1 for those runs only.
- *  3. Each character still in the 0x80-0x9F range is run through
- *     utf8ToCodepoint(), Title::cp1252toUnicode() and codepointToUtf8().
- *
- * Both callbacks are built at call time with create_function().
- *
- * @param string $string Text to normalize.
- * @return string Whatever the outer preg_replace_callback() call yields.
- * @access private
- */
- function mergeLatin1Utf8( $string ) {
- return preg_replace_callback(
- # Windows CP1252 extends ISO-8859-1 by putting extra characters
- # into the high control chars area. We have to convert these
- # to their proper Unicode counterparts.
- '/([\x80-\x9f])/u',
- create_function( '$matches',
- 'return codepointToUtf8(
- Title::cp1252toUnicode(
- utf8ToCodepoint( $matches[1] ) ) );' ),
- preg_replace_callback(
- # Up-convert everything from 8-bit to UTF-8, then
- # filter the valid-looking UTF-8 back from the
- # double-converted form.
- '/((?:[\xc0-\xdf][\x80-\xbf]
- |[\xe0-\xef][\x80-\xbf]{2}
- |[\xf0-\xf7][\x80-\xbf]{3})+)/ux',
- create_function( '$matches',
- 'return utf8_decode( $matches[1] );' ),
- utf8_encode( $string ) ) );
- }
-
}
?>