From 5b5f7b30b3a0d72d00fb92a93ed9faf5484aeac6 Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Mon, 15 Sep 2008 17:51:53 +0000 Subject: [PATCH] Revert r40837, r40839, r40840 (bug 332 - broken UTF-8) Char-by-char scan of all output will perform very poorly and fails to address the root problem of bad internal treatment of strings. --- RELEASE-NOTES | 1 - includes/OutputPage.php | 2 - includes/StringUtils.php | 80 ---------------------------------------- 3 files changed, 83 deletions(-) diff --git a/RELEASE-NOTES b/RELEASE-NOTES index 6a5e129d91..a8652ae26b 100644 --- a/RELEASE-NOTES +++ b/RELEASE-NOTES @@ -132,7 +132,6 @@ it from source control: http://www.mediawiki.org/wiki/Download_from_SVN === Bug fixes in 1.14 === -* (bug 332) Clean invalid UTF-8 to ensure output is RFC 3629 compliant * (bug 14907) DatabasePostgres::fieldType now defined. * (bug 14659) Passing the default limit param to Special:Recentchanges no more falls back to the user option diff --git a/includes/OutputPage.php b/includes/OutputPage.php index 53556b9845..17a774d50d 100644 --- a/includes/OutputPage.php +++ b/includes/OutputPage.php @@ -901,8 +901,6 @@ class OutputPage { $this->addScriptFile( 'rightclickedit.js' ); } - $this->mBodytext = StringUtils::cleanForCharset( $this->mBodytext, $wgOutputEncoding ); - # Buffer output; final headers may depend on later processing ob_start(); diff --git a/includes/StringUtils.php b/includes/StringUtils.php index 1e3489024a..c437b3c19e 100644 --- a/includes/StringUtils.php +++ b/includes/StringUtils.php @@ -179,86 +179,6 @@ class StringUtils { return new ArrayIterator( explode( $separator, $subject ) ); } } - - /** - * Clean characters that are invalid in the given character set - * from a given string. - * - * @param $string \type{$string} String to clean - * @param $charset \type{$string} Character set (if unspecified, assume $wgOutputEncoding) - * @return \type{$string} Cleaned string - */ - public static function cleanForCharset( $string, $charset='' ) { - global $wgOutputEncoding; - switch ( $charset ? $charset : $wgOutputEncoding ) { - # UTF-8 should be all we need to worry about. :) - case 'UTF-8': - return self::cleanUtf8( $string ); - default: - return $string; - } - } - - /** - * Clean invalid UTF-8 characters and sequences from a given string, - * replacing them with U+FFFD. - * Should be RFC 3629 compliant. - * - * @param $string \type{$string} String to clean - * @return \type{$string} Cleaned string - */ - private static function cleanUtf8( $str ) { - # HERE BE DRAGONS! - # ABANDON ALL HOPE, ALL YE WHO ENTER THE BITWISE HELLFIRE. - - $illegal = array( 0xD800, 0xDB7F, 0xDB80, 0xDBFF, - 0xDC00, 0xDF80, 0xDFFF, 0xFFFE, 0xFFFF ); - $len = strlen( $str ); - $left = $bytes = 0; - for ( $i = 0; $i < $len; $i++ ) { - $ch = ord( $str[$i] ); - if ( !$left ) { - if ( !($ch & 0x80 ) ) - continue; - $left = (( $ch & 0xFE ) == 0xFC ? 5 : - (( $ch & 0xFC ) == 0xF8 ? 4 : - (( $ch & 0xF8 ) == 0xF0 ? 3 : - (( $ch & 0xF0 ) == 0xE0 ? 2 : - (( $ch & 0xE0 ) == 0xC0 ? 1 : - 0 ))))); - if ( $left ) { - $bytes = $left + 1; - $sum = $ch & ( 0xFF >> $bytes + 1 ); - continue; - } else if ( $ch & 0x80 ) { - $bytes = 1; - } - } else if ( ( $ch & 0xC0 ) == 0x80 ) { - $sum <<= 6; - $sum += $ch & 0x3F; - if ( --$left ) continue; - if ( ( $bytes == 2 && $sum < 0x80 ) || - ( $bytes == 3 && $sum < 0x800 ) || - ( $bytes == 4 && $sum < 0x10000 ) || - ( $bytes > 4 || $sum > 0x10FFFF ) || - in_array( $sum, $illegal ) ) { - } else continue; - - } else { - $bytes -= $left; - $i--; - } - - $str = ( substr( $str, 0, $i - $bytes + 1 ) . - "\xEF\xBF\xBD" . - substr( $str, $i + 1 ) ); - $i += 3 - $bytes; - $len += 3 - $bytes; - $left = 0; - } - - return $str; - } } /** -- 2.20.1