return false;
}
+ if( $wgUseLatin1 && $this->mInterwiki != '' ) {
+ # On a Latin-1 wiki, numbered character entities may have
+ # left us with a mix of 8-bit and UTF-8 characters, and
+ # some of those might be Windows-1252 special chars.
+ # Normalize interwikis to pure UTF-8.
+ $t = Title::mergeLatin1Utf8( $t );
+ }
+
# Fill fields
$this->mDbkeyform = $t;
$this->mUrlform = wfUrlencode( $t );
&& $this->getDbkey() == $title->getDbkey();
}
+ /**
+ * Translate a code point from the Windows-1252 extension range
+ * (0x80-0x9F) into the Unicode code point it stands for.
+ * Slots that the mapping table leaves unassigned come back as
+ * UNICODE_REPLACEMENT; any value not listed in the table is
+ * returned untouched.
+ *
+ * @param int $codepoint
+ * @return int
+ * @access private
+ */
+ function cp1252toUnicode( $codepoint ) {
+ # Table taken from:
+ # http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
+ static $win1252Map = array(
+ 0x80 => 0x20AC, # EURO SIGN
+ 0x81 => UNICODE_REPLACEMENT, # (unassigned)
+ 0x82 => 0x201A, # SINGLE LOW-9 QUOTATION MARK
+ 0x83 => 0x0192, # LATIN SMALL LETTER F WITH HOOK
+ 0x84 => 0x201E, # DOUBLE LOW-9 QUOTATION MARK
+ 0x85 => 0x2026, # HORIZONTAL ELLIPSIS
+ 0x86 => 0x2020, # DAGGER
+ 0x87 => 0x2021, # DOUBLE DAGGER
+ 0x88 => 0x02C6, # MODIFIER LETTER CIRCUMFLEX ACCENT
+ 0x89 => 0x2030, # PER MILLE SIGN
+ 0x8A => 0x0160, # LATIN CAPITAL LETTER S WITH CARON
+ 0x8B => 0x2039, # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+ 0x8C => 0x0152, # LATIN CAPITAL LIGATURE OE
+ 0x8D => UNICODE_REPLACEMENT, # (unassigned)
+ 0x8E => 0x017D, # LATIN CAPITAL LETTER Z WITH CARON
+ 0x8F => UNICODE_REPLACEMENT, # (unassigned)
+ 0x90 => UNICODE_REPLACEMENT, # (unassigned)
+ 0x91 => 0x2018, # LEFT SINGLE QUOTATION MARK
+ 0x92 => 0x2019, # RIGHT SINGLE QUOTATION MARK
+ 0x93 => 0x201C, # LEFT DOUBLE QUOTATION MARK
+ 0x94 => 0x201D, # RIGHT DOUBLE QUOTATION MARK
+ 0x95 => 0x2022, # BULLET
+ 0x96 => 0x2013, # EN DASH
+ 0x97 => 0x2014, # EM DASH
+ 0x98 => 0x02DC, # SMALL TILDE
+ 0x99 => 0x2122, # TRADE MARK SIGN
+ 0x9A => 0x0161, # LATIN SMALL LETTER S WITH CARON
+ 0x9B => 0x203A, # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+ 0x9C => 0x0153, # LATIN SMALL LIGATURE OE
+ 0x9D => UNICODE_REPLACEMENT, # (unassigned)
+ 0x9E => 0x017E, # LATIN SMALL LETTER Z WITH CARON
+ 0x9F => 0x0178, # LATIN CAPITAL LETTER Y WITH DIAERESIS
+ );
+ if( isset( $win1252Map[$codepoint] ) ) {
+ return $win1252Map[$codepoint];
+ }
+ # Not one of the remapped slots: hand the value back unchanged.
+ return $codepoint;
+ }
+
+ /**
+ * HACK: best-effort clean-up of a string that mixes 8-bit
+ * (CP1252 / Latin-1) characters with UTF-8 sequences, producing a
+ * string that is UTF-8 throughout.
+ *
+ * Works in three passes: up-convert every byte to UTF-8, undo that
+ * conversion for runs that already looked like UTF-8, then remap
+ * whatever is left in U+0080..U+009F through Title::cp1252toUnicode().
+ *
+ * @param string $string
+ * @return string
+ * @access private
+ */
+ function mergeLatin1Utf8( $string ) {
+ # Windows CP1252 reuses the high control character area for
+ # printable characters; this callback swaps each such character
+ # for its proper Unicode counterpart. (utf8ToCodepoint and
+ # codepointToUtf8 are helpers defined elsewhere -- presumably
+ # converting between a UTF-8 character and its code point.)
+ $remapCp1252 = create_function( '$matches',
+ 'return codepointToUtf8(
+ Title::cp1252toUnicode(
+ utf8ToCodepoint( $matches[1] ) ) );' );
+ # Callback that turns a double-converted run back into the
+ # UTF-8 bytes it started out as.
+ $undoDoubleEncoding = create_function( '$matches',
+ 'return utf8_decode( $matches[1] );' );
+ # Pass 1: treat every input byte as 8-bit and up-convert to UTF-8.
+ $upconverted = utf8_encode( $string );
+ # Pass 2: runs that look like valid 2-, 3- or 4-byte UTF-8
+ # sequences were UTF-8 to begin with, so filter them back out
+ # of the double-converted form.
+ $singleEncoded = preg_replace_callback(
+ '/((?:[\xc0-\xdf][\x80-\xbf]
+ |[\xe0-\xef][\x80-\xbf]{2}
+ |[\xf0-\xf7][\x80-\xbf]{3})+)/ux',
+ $undoDoubleEncoding,
+ $upconverted );
+ # Pass 3: anything still in the high control range is taken to
+ # be a CP1252 special character and remapped.
+ return preg_replace_callback(
+ '/([\x80-\x9f])/u',
+ $remapCp1252,
+ $singleEncoded );
+ }
+
}
?>