Revert r40837, r40839, r40840 (bug 332 - broken UTF-8)
author Brion Vibber <brion@users.mediawiki.org>
Mon, 15 Sep 2008 17:51:53 +0000 (17:51 +0000)
committer Brion Vibber <brion@users.mediawiki.org>
Mon, 15 Sep 2008 17:51:53 +0000 (17:51 +0000)
Char-by-char scan of all output will perform very poorly and fails to address the root problem of bad internal treatment of strings.

RELEASE-NOTES
includes/OutputPage.php
includes/StringUtils.php

index 6a5e129..a8652ae 100644 (file)
@@ -132,7 +132,6 @@ it from source control: http://www.mediawiki.org/wiki/Download_from_SVN
 
 === Bug fixes in 1.14 ===
 
-* (bug 332) Clean invalid UTF-8 to ensure output is RFC 3629 compliant
 * (bug 14907) DatabasePostgres::fieldType now defined.
 * (bug 14659) Passing the default limit param to Special:Recentchanges no more
   falls back to the user option
index 53556b9..17a774d 100644 (file)
@@ -901,8 +901,6 @@ class OutputPage {
                        $this->addScriptFile( 'rightclickedit.js' );
                }
 
-               $this->mBodytext = StringUtils::cleanForCharset( $this->mBodytext, $wgOutputEncoding );
-
                # Buffer output; final headers may depend on later processing
                ob_start();
 
index 1e34890..c437b3c 100644 (file)
@@ -179,86 +179,6 @@ class StringUtils {
                        return new ArrayIterator( explode( $separator, $subject ) );
                }
        }
-
-       /**
-        * Clean characters that are invalid in the given character set 
-        * from a given string.
-        * 
-        * @param $string \type{$string} String to clean
-        * @param $charset \type{$string} Character set (if unspecified, assume $wgOutputEncoding)
-        * @return \type{$string} Cleaned string
-        */
-       public static function cleanForCharset( $string, $charset='' ) {
-               global $wgOutputEncoding;
-               switch ( $charset ? $charset : $wgOutputEncoding ) {
-                       # UTF-8 should be all we need to worry about. :)
-               case 'UTF-8':
-                       return self::cleanUtf8( $string );
-               default:
-                       return $string;
-               }
-       }
-
-       /**
-        * Clean invalid UTF-8 characters and sequences from a given string,
-        * replacing them with U+FFFD.
-        * Should be RFC 3629 compliant.
-        * 
-        * @param $string \type{$string} String to clean
-        * @return \type{$string} Cleaned string
-        */
-       private static function cleanUtf8( $str ) {
-               # HERE BE DRAGONS!
-               # ABANDON ALL HOPE, ALL YE WHO ENTER THE BITWISE HELLFIRE.
-
-               $illegal = array( 0xD800, 0xDB7F, 0xDB80, 0xDBFF,
-                                 0xDC00, 0xDF80, 0xDFFF, 0xFFFE, 0xFFFF );
-               $len = strlen( $str );
-               $left = $bytes = 0;
-               for ( $i = 0; $i < $len; $i++ ) {
-                       $ch = ord( $str[$i] );
-                       if ( !$left ) {
-                               if ( !($ch & 0x80 ) )
-                                       continue;
-                               $left = (( $ch & 0xFE ) == 0xFC ? 5 :
-                                       (( $ch & 0xFC ) == 0xF8 ? 4 :
-                                       (( $ch & 0xF8 ) == 0xF0 ? 3 :
-                                       (( $ch & 0xF0 ) == 0xE0 ? 2 :
-                                       (( $ch & 0xE0 ) == 0xC0 ? 1 :
-                                                                 0 )))));
-                               if ( $left ) {
-                                       $bytes = $left + 1;
-                                       $sum = $ch & ( 0xFF >> $bytes + 1 );
-                                       continue;
-                               } else if ( $ch & 0x80 ) {
-                                       $bytes = 1;
-                               }
-                       } else if ( ( $ch & 0xC0 ) == 0x80 ) {
-                               $sum <<= 6;
-                               $sum += $ch & 0x3F;
-                               if ( --$left ) continue;
-                               if ( ( $bytes == 2 && $sum < 0x80     ) ||
-                                    ( $bytes == 3 && $sum < 0x800    ) ||
-                                    ( $bytes == 4 && $sum < 0x10000  ) ||
-                                    ( $bytes >  4 || $sum > 0x10FFFF ) ||
-                                    in_array( $sum, $illegal ) ) {
-                               } else continue;
-                               
-                       } else {
-                               $bytes -= $left;
-                               $i--;
-                       }
-
-                       $str = ( substr( $str, 0, $i - $bytes + 1 ) .
-                                "\xEF\xBF\xBD" .
-                                substr( $str, $i + 1 ) );
-                       $i   += 3 - $bytes;
-                       $len += 3 - $bytes;
-                       $left = 0;
-               }
-
-               return $str;
-       }
 }
 
 /**