From ad5f1acdb3cc78c2607e08862c5a4aa6c8f0bc1c Mon Sep 17 00:00:00 2001
From: Fran Rogers
Date: Mon, 15 Sep 2008 00:42:17 +0000
Subject: [PATCH] Fix for bug #332 - all UTF-8 output is now cleaned of invalid
 forms as defined by RFC 3629. All output from MediaWiki should now be valid
 UTF-8 in all circumstances.

---
 includes/OutputPage.php  | 17 ++++++-
 includes/StringUtils.php | 97 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 113 insertions(+), 1 deletion(-)

diff --git a/includes/OutputPage.php b/includes/OutputPage.php
index 17a774d50d..c3138422bf 100644
--- a/includes/OutputPage.php
+++ b/includes/OutputPage.php
@@ -902,7 +902,7 @@ class OutputPage {
 		}
 
 		# Buffer output; final headers may depend on later processing
-		ob_start();
+		ob_start( array( 'OutputPage', 'cleanCallback') );
 
 		$wgRequest->response()->header( "Content-type: $wgMimeType; charset={$wgOutputEncoding}" );
 		$wgRequest->response()->header( 'Content-language: '.$wgContLanguageCode );
@@ -924,6 +924,21 @@ class OutputPage {
 		wfProfileOut( __METHOD__ );
 	}
 
+	/**
+	 * Output-buffer callback registered through ob_start(): cleans the
+	 * buffered output of sequences that are invalid in $wgOutputEncoding.
+	 *
+	 * @param $s \type{$string} Buffered output
+	 * @return \type{$string} Cleaned output
+	 */
+	public static function cleanCallback( $s ) {
+		global $wgOutputEncoding;
+		wfProfileIn( __METHOD__ );
+		$s = StringUtils::cleanForCharset( $s, $wgOutputEncoding );
+		wfProfileOut( __METHOD__ );
+		return $s;
+	}
+
 	/**
 	 * @todo document
 	 * @param string $ins
diff --git a/includes/StringUtils.php b/includes/StringUtils.php
index c437b3c19e..1e3489024a 100644
--- a/includes/StringUtils.php
+++ b/includes/StringUtils.php
@@ -179,6 +179,103 @@ class StringUtils {
 			return new ArrayIterator( explode( $separator, $subject ) );
 		}
 	}
+
+	/**
+	 * Clean characters that are invalid in the given character set
+	 * from a given string.
+	 *
+	 * @param $string \type{$string} String to clean
+	 * @param $charset \type{$string} Character set (if unspecified, assume $wgOutputEncoding)
+	 * @return \type{$string} Cleaned string
+	 */
+	public static function cleanForCharset( $string, $charset='' ) {
+		global $wgOutputEncoding;
+		switch ( $charset ? $charset : $wgOutputEncoding ) {
+			# UTF-8 should be all we need to worry about. :)
+			case 'UTF-8':
+				return self::cleanUtf8( $string );
+			default:
+				return $string;
+		}
+	}
+
+	/**
+	 * Clean invalid UTF-8 characters and sequences from a given string,
+	 * replacing them with U+FFFD.
+	 * Should be RFC 3629 compliant.
+	 *
+	 * Rejected forms: stray continuation bytes, the bytes 0xFE/0xFF, 5- and
+	 * 6-byte sequences, overlong encodings, code points above U+10FFFF,
+	 * the surrogate range U+D800..U+DFFF, U+FFFE/U+FFFF, and sequences
+	 * cut short by another byte or by the end of the string.
+	 *
+	 * @param $str \type{$string} String to clean
+	 * @return \type{$string} Cleaned string
+	 */
+	private static function cleanUtf8( $str ) {
+		$len = strlen( $str );
+		# $left: continuation bytes still expected; $bytes: length of the
+		# sequence being decoded; $sum: code point accumulated so far.
+		$left = $bytes = $sum = 0;
+		for ( $i = 0; $i < $len; $i++ ) {
+			$ch = ord( $str[$i] );
+			if ( !$left ) {
+				# Plain ASCII needs no checking.
+				if ( !( $ch & 0x80 ) )
+					continue;
+				# Number of continuation bytes announced by the lead byte
+				# (0 if this is not a lead byte at all).
+				$left = (( $ch & 0xFE ) == 0xFC ? 5 :
+					(( $ch & 0xFC ) == 0xF8 ? 4 :
+					(( $ch & 0xF8 ) == 0xF0 ? 3 :
+					(( $ch & 0xF0 ) == 0xE0 ? 2 :
+					(( $ch & 0xE0 ) == 0xC0 ? 1 :
+					0 )))));
+				if ( $left ) {
+					$bytes = $left + 1;
+					# Keep only the payload bits of the lead byte.
+					$sum = $ch & ( 0xFF >> ( $bytes + 1 ) );
+					continue;
+				}
+				# Stray continuation byte or 0xFE/0xFF: replace this byte.
+				$bytes = 1;
+			} else if ( ( $ch & 0xC0 ) == 0x80 ) {
+				$sum = ( $sum << 6 ) + ( $ch & 0x3F );
+				if ( --$left ) continue;
+				# Sequence complete: keep it only if it is the shortest
+				# form of a code point that UTF-8 may encode.
+				$invalid = ( $bytes == 2 && $sum < 0x80 ) ||
+					( $bytes == 3 && $sum < 0x800 ) ||
+					( $bytes == 4 && $sum < 0x10000 ) ||
+					$bytes > 4 || $sum > 0x10FFFF ||
+					( $sum >= 0xD800 && $sum <= 0xDFFF ) ||
+					$sum == 0xFFFE || $sum == 0xFFFF;
+				if ( !$invalid ) continue;
+			} else {
+				# Sequence interrupted: replace the bytes consumed so far
+				# and re-examine the current byte on the next pass.
+				$bytes -= $left;
+				$i--;
+			}
+
+			# Swap the $bytes offending bytes ending at $i for U+FFFD and
+			# leave $i on the last byte of the replacement.
+			$str = ( substr( $str, 0, $i - $bytes + 1 ) .
+				 "\xEF\xBF\xBD" .
+				 substr( $str, $i + 1 ) );
+			$i += 3 - $bytes;
+			$len += 3 - $bytes;
+			$left = 0;
+		}
+
+		if ( $left ) {
+			# The string ended in the middle of a sequence; replace the
+			# dangling lead/continuation bytes as well.
+			$str = substr( $str, 0, $len - ( $bytes - $left ) ) . "\xEF\xBF\xBD";
+		}
+
+		return $str;
+	}
 }
 
 /**
-- 
2.20.1