Fix for bug #332 - all UTF-8 output is now cleaned of invalid forms as defined by...
author Fran Rogers <krimpet@users.mediawiki.org>
Mon, 15 Sep 2008 00:42:17 +0000 (00:42 +0000)
committer Fran Rogers <krimpet@users.mediawiki.org>
Mon, 15 Sep 2008 00:42:17 +0000 (00:42 +0000)
includes/OutputPage.php
includes/StringUtils.php

index 17a774d..c313842 100644 (file)
@@ -902,7 +902,7 @@ class OutputPage {
                }
 
                # Buffer output; final headers may depend on later processing
-               ob_start();
+               ob_start( array( 'OutputPage', 'cleanCallback') );
 
                $wgRequest->response()->header( "Content-type: $wgMimeType; charset={$wgOutputEncoding}" );
                $wgRequest->response()->header( 'Content-language: '.$wgContLanguageCode );
@@ -924,6 +924,13 @@ class OutputPage {
                wfProfileOut( __METHOD__ );
        }
 
+       /**
+        * Output-buffer callback: cleans the buffered output of sequences
+        * that are invalid in the configured output encoding.
+        *
+        * @param string $s Buffered output handed over by the output buffer
+        * @return string Cleaned output
+        */
+       public static function cleanCallback( $s ) {
+               # Without this declaration $wgOutputEncoding is an undefined local
+               # (null plus a notice) rather than the configured charset.
+               global $wgOutputEncoding;
+               wfProfileIn( __METHOD__ );
+               $s = StringUtils::cleanForCharset( $s, $wgOutputEncoding );
+               wfProfileOut( __METHOD__ );
+               return $s;
+       }
+
        /**
         * @todo document
         * @param string $ins
index c437b3c..1e34890 100644 (file)
@@ -179,6 +179,86 @@ class StringUtils {
                        return new ArrayIterator( explode( $separator, $subject ) );
                }
        }
+
+       /**
+        * Clean characters that are invalid in the given character set
+        * from a given string.
+        *
+        * Only UTF-8 is cleaned; for any other character set the string is
+        * returned unchanged. The character set name is matched
+        * case-insensitively, so 'utf-8' and 'UTF-8' are treated alike.
+        *
+        * @param $string \type{$string} String to clean
+        * @param $charset \type{$string} Character set (if unspecified, assume $wgOutputEncoding)
+        * @return \type{$string} Cleaned string
+        */
+       public static function cleanForCharset( $string, $charset='' ) {
+               global $wgOutputEncoding;
+               # An empty/unspecified charset falls back to the configured output encoding
+               $effective = $charset ? $charset : $wgOutputEncoding;
+               # Charset names are case-insensitive; normalise before comparing
+               switch ( strtoupper( (string)$effective ) ) {
+                       # UTF-8 should be all we need to worry about. :)
+               case 'UTF-8':
+                       return self::cleanUtf8( $string );
+               default:
+                       return $string;
+               }
+       }
+
+       /**
+        * Clean invalid UTF-8 characters and sequences from a given string,
+        * replacing them with U+FFFD.
+        * Should be RFC 3629 compliant.
+        *
+        * Each of the following is replaced by a single U+FFFD: a stray
+        * continuation byte, a byte that cannot start a sequence, a sequence
+        * cut short (by another byte or by the end of the string), an overlong
+        * form, a 5- or 6-byte form, a code point above U+10FFFF, a UTF-16
+        * surrogate (U+D800..U+DFFF), and the code points U+FFFE and U+FFFF.
+        *
+        * @param $str \type{$string} String to clean
+        * @return \type{$string} Cleaned string
+        */
+       private static function cleanUtf8( $str ) {
+               # U+FFFD REPLACEMENT CHARACTER encoded as UTF-8 (3 bytes)
+               $replacement = "\xEF\xBF\xBD";
+               $len = strlen( $str );
+               # $left:  continuation bytes still expected for the current sequence
+               # $bytes: length of the unit being inspected (declared sequence length,
+               #         or the number of bytes to replace once an error is detected)
+               # $sum:   code point accumulated so far
+               $left = $bytes = $sum = 0;
+               for ( $i = 0; $i < $len; $i++ ) {
+                       $ch = ord( $str[$i] );
+                       if ( !$left ) {
+                               if ( !( $ch & 0x80 ) ) {
+                                       # Plain ASCII is always valid
+                                       continue;
+                               }
+                               # Number of continuation bytes announced by the lead byte;
+                               # 0 means this byte cannot start a sequence.
+                               $left = (( $ch & 0xFE ) == 0xFC ? 5 :
+                                       (( $ch & 0xFC ) == 0xF8 ? 4 :
+                                       (( $ch & 0xF8 ) == 0xF0 ? 3 :
+                                       (( $ch & 0xF0 ) == 0xE0 ? 2 :
+                                       (( $ch & 0xE0 ) == 0xC0 ? 1 :
+                                                                 0 )))));
+                               if ( $left ) {
+                                       $bytes = $left + 1;
+                                       # Keep only the payload bits of the lead byte
+                                       $sum = $ch & ( 0xFF >> ( $bytes + 1 ) );
+                                       continue;
+                               }
+                               # Stray continuation byte or 0xFE/0xFF: replace this single byte
+                               $bytes = 1;
+                       } else if ( ( $ch & 0xC0 ) == 0x80 ) {
+                               # Continuation byte: append its 6 payload bits
+                               $sum = ( $sum << 6 ) + ( $ch & 0x3F );
+                               if ( --$left ) {
+                                       continue;
+                               }
+                               # Sequence complete. Reject overlong encodings, 5/6-byte forms,
+                               # out-of-range code points, the whole surrogate range and
+                               # U+FFFE/U+FFFF; anything else is kept as is.
+                               $invalid = ( $bytes == 2 && $sum < 0x80 ) ||
+                                       ( $bytes == 3 && $sum < 0x800 ) ||
+                                       ( $bytes == 4 && $sum < 0x10000 ) ||
+                                       $bytes > 4 || $sum > 0x10FFFF ||
+                                       ( $sum >= 0xD800 && $sum <= 0xDFFF ) ||
+                                       $sum == 0xFFFE || $sum == 0xFFFF;
+                               if ( !$invalid ) {
+                                       continue;
+                               }
+                       } else {
+                               # Sequence cut short by a non-continuation byte: replace only the
+                               # bytes consumed so far and step back so this byte is re-scanned.
+                               $bytes -= $left;
+                               $i--;
+                       }
+
+                       # Replace the $bytes bytes ending at offset $i with U+FFFD, then shift
+                       # the cursor and the length by the resulting size difference.
+                       $str = ( substr( $str, 0, $i - $bytes + 1 ) .
+                                $replacement .
+                                substr( $str, $i + 1 ) );
+                       $i   += 3 - $bytes;
+                       $len += 3 - $bytes;
+                       $left = 0;
+               }
+
+               if ( $left ) {
+                       # The string ended in the middle of a sequence: replace the
+                       # ( $bytes - $left ) dangling bytes at the end.
+                       $str = substr( $str, 0, $len - ( $bytes - $left ) ) . $replacement;
+               }
+
+               return $str;
+       }
 }
 
 /**