From: Brion Vibber <brion@users.mediawiki.org>
Date: Sun, 27 Feb 2005 04:11:41 +0000 (+0000)
Subject: * (bug 65) Fix broken interwiki link encoding on Latin-1 wikis; force to UTF-8
X-Git-Tag: 1.5.0alpha1~692
X-Git-Url: https://git.cyclocoop.org/%7B%24www_url%7Dadmin/compta/exercices/journal.php?a=commitdiff_plain;h=f2543cafd7fc7dcbb68713b31c0d7658ca07dd5c;p=lhc%2Fweb%2Fwiklou.git

* (bug 65) Fix broken interwiki link encoding on Latin-1 wikis; force to UTF-8
---

diff --git a/includes/Title.php b/includes/Title.php
index a9163a2fe9..850ad4e49a 100644
--- a/includes/Title.php
+++ b/includes/Title.php
@@ -1225,6 +1225,14 @@ class Title {
 			return false;
 		}
 		
+		if( $wgUseLatin1 && $this->mInterwiki != '' ) {
+			# On a Latin-1 wiki, numbered character entities may have
+			# left us with a mix of 8-bit and UTF-8 characters, and
+			# some of those might be Windows-1252 special chars.
+			# Normalize interwikis to pure UTF-8.
+			$t = Title::mergeLatin1Utf8( $t );
+		}
+
 		# Fill fields
 		$this->mDbkeyform = $t;
 		$this->mUrlform = wfUrlencode( $t );
@@ -1908,5 +1916,84 @@ class Title {
 			&& $this->getDbkey() == $title->getDbkey();
 	}
 
+	/**
+	 * Map a Windows-1252 code in the 0x80-0x9F range to its Unicode code point.
+	 * @param int $codepoint Value to translate; anything not listed in the table is passed through
+	 * @return int Mapped code point, UNICODE_REPLACEMENT for slots CP1252 leaves undefined, else $codepoint unchanged
+	 * @access private
+	 */
+	function cp1252toUnicode( $codepoint ) {
+		# Lookup table built once (static) and keyed by the CP1252 value; taken from:
+		# http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
+		static $cp1252 = array(
+			0x80 => 0x20AC,	#EURO SIGN
+			0x81 => UNICODE_REPLACEMENT,	#UNDEFINED
+			0x82 => 0x201A,	#SINGLE LOW-9 QUOTATION MARK
+			0x83 => 0x0192,	#LATIN SMALL LETTER F WITH HOOK
+			0x84 => 0x201E,	#DOUBLE LOW-9 QUOTATION MARK
+			0x85 => 0x2026,	#HORIZONTAL ELLIPSIS
+			0x86 => 0x2020,	#DAGGER
+			0x87 => 0x2021,	#DOUBLE DAGGER
+			0x88 => 0x02C6,	#MODIFIER LETTER CIRCUMFLEX ACCENT
+			0x89 => 0x2030,	#PER MILLE SIGN
+			0x8A => 0x0160,	#LATIN CAPITAL LETTER S WITH CARON
+			0x8B => 0x2039,	#SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+			0x8C => 0x0152,	#LATIN CAPITAL LIGATURE OE
+			0x8D => UNICODE_REPLACEMENT,	#UNDEFINED
+			0x8E => 0x017D,	#LATIN CAPITAL LETTER Z WITH CARON
+			0x8F => UNICODE_REPLACEMENT,	#UNDEFINED
+			0x90 => UNICODE_REPLACEMENT,	#UNDEFINED
+			0x91 => 0x2018,	#LEFT SINGLE QUOTATION MARK
+			0x92 => 0x2019,	#RIGHT SINGLE QUOTATION MARK
+			0x93 => 0x201C,	#LEFT DOUBLE QUOTATION MARK
+			0x94 => 0x201D,	#RIGHT DOUBLE QUOTATION MARK
+			0x95 => 0x2022,	#BULLET
+			0x96 => 0x2013,	#EN DASH
+			0x97 => 0x2014,	#EM DASH
+			0x98 => 0x02DC,	#SMALL TILDE
+			0x99 => 0x2122,	#TRADE MARK SIGN
+			0x9A => 0x0161,	#LATIN SMALL LETTER S WITH CARON
+			0x9B => 0x203A,	#SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+			0x9C => 0x0153,	#LATIN SMALL LIGATURE OE
+			0x9D => UNICODE_REPLACEMENT,	#UNDEFINED
+			0x9E => 0x017E,	#LATIN SMALL LETTER Z WITH CARON
+			0x9F => 0x0178,	#LATIN CAPITAL LETTER Y WITH DIAERESIS
+			);
+		return isset( $cp1252[$codepoint] )
+			? $cp1252[$codepoint]
+			: $codepoint;
+	}
+	
+	/**
+	 * HACK: best-effort cleanup of a string that mixes 8-bit (Latin-1/CP1252)
+	 * bytes with UTF-8 sequences, producing pure UTF-8. The nested calls run
+	 * innermost-first: utf8_encode(), then the inner callback, then the outer.
+	 *
+	 * @param string $string Text that may mix 8-bit bytes with UTF-8 sequences
+	 * @return string Result of the two replacement passes described inline
+	 * @access private
+	 */
+	function mergeLatin1Utf8( $string ) {
+		return preg_replace_callback(
+			# Final pass: Windows CP1252 puts printable characters where
+			# ISO-8859-1 has the high control chars (0x80-0x9f). Whatever is
+			# left in that range is remapped via Title::cp1252toUnicode().
+			'/([\x80-\x9f])/u',
+			create_function( '$matches',
+				'return codepointToUtf8(
+					Title::cp1252toUnicode(
+						utf8ToCodepoint( $matches[1] ) ) );' ),
+			preg_replace_callback(
+				# utf8_encode() below up-converts every byte as if it were 8-bit,
+				# double-encoding any real UTF-8. This pattern finds runs that
+				# look like UTF-8 lead+trail bytes and utf8_decode()s them back.
+				'/((?:[\xc0-\xdf][\x80-\xbf]
+					 |[\xe0-\xef][\x80-\xbf]{2}
+					 |[\xf0-\xf7][\x80-\xbf]{3})+)/ux',
+				create_function( '$matches',
+					'return utf8_decode( $matches[1] );' ),
+				utf8_encode( $string ) ) );
+	}
+
 }
 ?>