From ad5f1acdb3cc78c2607e08862c5a4aa6c8f0bc1c Mon Sep 17 00:00:00 2001
From: Fran Rogers <krimpet@users.mediawiki.org>
Date: Mon, 15 Sep 2008 00:42:17 +0000
Subject: [PATCH] Fix for bug #332 - all UTF-8 output is now cleaned of invalid
 forms as defined by RFC 3629. All output from MediaWiki should now be valid
 UTF-8 in all circumstances.

---
 includes/OutputPage.php  |  9 ++++-
 includes/StringUtils.php | 80 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+), 1 deletion(-)

diff --git a/includes/OutputPage.php b/includes/OutputPage.php
index 17a774d50d..c3138422bf 100644
--- a/includes/OutputPage.php
+++ b/includes/OutputPage.php
@@ -902,7 +902,7 @@ class OutputPage {
 		}
 
 		# Buffer output; final headers may depend on later processing
-		ob_start();
+		ob_start( array( 'OutputPage', 'cleanCallback') );
 
 		$wgRequest->response()->header( "Content-type: $wgMimeType; charset={$wgOutputEncoding}" );
 		$wgRequest->response()->header( 'Content-language: '.$wgContLanguageCode );
@@ -924,6 +924,13 @@ class OutputPage {
 		wfProfileOut( __METHOD__ );
 	}
 
+	public static function cleanCallback( $s ) { # output-buffer callback for ob_start()
+		wfProfileIn( __METHOD__ );
+		$s = StringUtils::cleanForCharset( $s ); # no charset given: $wgOutputEncoding is used
+		wfProfileOut( __METHOD__ );
+		return $s;
+	}
+
 	/**
 	 * @todo document
 	 * @param string $ins
diff --git a/includes/StringUtils.php b/includes/StringUtils.php
index c437b3c19e..1e3489024a 100644
--- a/includes/StringUtils.php
+++ b/includes/StringUtils.php
@@ -179,6 +179,86 @@ class StringUtils {
 			return new ArrayIterator( explode( $separator, $subject ) );
 		}
 	}
+
+	/**
+	 * Clean characters that are invalid in the given character set
+	 * from a given string. Only UTF-8 is cleaned; other charsets pass through.
+	 *
+	 * @param $string \type{\string} String to clean
+	 * @param $charset \type{\string} Character set, case-insensitive (if empty, use $wgOutputEncoding)
+	 * @return \type{\string} Cleaned string
+	 */
+	public static function cleanForCharset( $string, $charset='' ) {
+		global $wgOutputEncoding;
+		# Charset names are case-insensitive, so e.g. 'utf-8' must be cleaned too.
+		switch ( strtoupper( $charset ? $charset : $wgOutputEncoding ) ) {
+		case 'UTF-8':
+			return self::cleanUtf8( $string );
+		default:
+			return $string;
+		}
+	}
+
+	/**
+	 * Clean invalid UTF-8 characters and sequences from a given string,
+	 * replacing each of them with U+FFFD (REPLACEMENT CHARACTER).
+	 * Should be RFC 3629 compliant.
+	 *
+	 * Replaced are: stray continuation bytes, the bytes 0xFE and 0xFF,
+	 * sequences cut short (including at the end of the string), overlong
+	 * forms, 5- and 6-byte forms, code points above U+10FFFF, the surrogates
+	 * U+D800..U+DFFF, and the noncharacters U+FFFE and U+FFFF.
+	 *
+	 * @param $str \type{\string} String to clean
+	 * @return \type{\string} Cleaned string
+	 */
+	private static function cleanUtf8( $str ) {
+		# Smallest code point that genuinely needs a 2-, 3- or 4-byte form;
+		# anything below it is an overlong encoding.
+		$minimum = array( 2 => 0x80, 3 => 0x800, 4 => 0x10000 );
+		$len = strlen( $str );
+		$out = '';
+		$copied = 0; # offset up to which $str has already been emitted to $out
+		$i = 0;
+		while ( $i < $len ) {
+			$ch = ord( $str[$i] );
+			if ( $ch < 0x80 ) { # plain ASCII, always valid
+				$i++;
+				continue;
+			}
+
+			# Sequence length announced by the lead byte; 1 means that this
+			# byte cannot start a sequence at all.
+			$bytes = ( ( $ch & 0xE0 ) == 0xC0 ? 2 :
+				( ( $ch & 0xF0 ) == 0xE0 ? 3 :
+				( ( $ch & 0xF8 ) == 0xF0 ? 4 :
+				( ( $ch & 0xFC ) == 0xF8 ? 5 :
+				( ( $ch & 0xFE ) == 0xFC ? 6 :
+				                           1 ) ) ) ) );
+
+			# Gather the continuation bytes that are actually present.
+			$sum = $ch & ( 0xFF >> ( $bytes + 1 ) );
+			$end = $i + 1;
+			while ( $end < $len && $end < $i + $bytes &&
+				( ord( $str[$end] ) & 0xC0 ) == 0x80 ) {
+				$sum = ( $sum << 6 ) | ( ord( $str[$end] ) & 0x3F );
+				$end++;
+			}
+
+			$valid = $bytes >= 2 && $bytes <= 4 && $end - $i == $bytes &&
+				$sum >= $minimum[$bytes] && $sum <= 0x10FFFF &&
+				!( $sum >= 0xD800 && $sum <= 0xDFFF ) &&
+				$sum != 0xFFFE && $sum != 0xFFFF;
+			if ( !$valid ) {
+				# One U+FFFD stands in for the whole malformed run.
+				$out .= substr( $str, $copied, $i - $copied ) . "\xEF\xBF\xBD";
+				$copied = $end;
+			}
+			$i = $end;
+		}
+
+		return $out . substr( $str, $copied );
+	}
 }
 
 /**
-- 
2.20.1