}
# Buffer output; final headers may depend on later processing
- ob_start();
+ ob_start( array( 'OutputPage', 'cleanCallback') );
$wgRequest->response()->header( "Content-type: $wgMimeType; charset={$wgOutputEncoding}" );
$wgRequest->response()->header( 'Content-language: '.$wgContLanguageCode );
wfProfileOut( __METHOD__ );
}
+ /**
+  * Output-buffer callback: passes the buffered output through
+  * StringUtils::cleanForCharset() using the configured output encoding.
+  *
+  * @param string $s Buffered output handed over by the output buffer
+  * @return string The output after charset cleaning
+  */
+ public static function cleanCallback( $s ) {
+ 	# $wgOutputEncoding must be imported explicitly; without this it is an
+ 	# undefined local here (notice on every request, null passed on).
+ 	global $wgOutputEncoding;
+ 	wfProfileIn( __METHOD__ );
+ 	$s = StringUtils::cleanForCharset( $s, $wgOutputEncoding );
+ 	wfProfileOut( __METHOD__ );
+ 	return $s;
+ }
+
/**
* @todo document
* @param string $ins
return new ArrayIterator( explode( $separator, $subject ) );
}
}
+
+ /**
+  * Clean characters that are invalid in the given character set
+  * from a given string.
+  *
+  * Only UTF-8 is handled; for any other character set the string is
+  * returned unchanged. The character set name is matched
+  * case-insensitively ("utf-8" and "UTF-8" are equivalent).
+  *
+  * @param $string \type{string} String to clean
+  * @param $charset \type{string} Character set (if unspecified or empty, $wgOutputEncoding is used)
+  * @return \type{string} Cleaned string
+  */
+ public static function cleanForCharset( $string, $charset='' ) {
+ 	global $wgOutputEncoding;
+ 	$effective = $charset ? $charset : $wgOutputEncoding;
+ 	# Charset names are case-insensitive, so normalise before matching;
+ 	# the cast keeps an unset (null) encoding on the default branch.
+ 	switch ( strtoupper( (string)$effective ) ) {
+ 		# UTF-8 should be all we need to worry about. :)
+ 		case 'UTF-8':
+ 			return self::cleanUtf8( $string );
+ 		default:
+ 			return $string;
+ 	}
+ }
+
+ /**
+  * Clean invalid UTF-8 characters and sequences from a given string,
+  * replacing each of them with U+FFFD (bytes EF BF BD).
+  *
+  * Replaced are: stray continuation bytes and the bytes 0xFE/0xFF,
+  * sequences cut short by a non-continuation byte or by the end of the
+  * string, overlong encodings, 5- and 6-byte forms, code points above
+  * U+10FFFF, every UTF-16 surrogate (U+D800..U+DFFF), and U+FFFE/U+FFFF.
+  *
+  * @param $str \type{string} String to clean
+  * @return \type{string} Cleaned string
+  */
+ private static function cleanUtf8( $str ) {
+ 	$len = strlen( $str );
+ 	# $left:  continuation bytes still expected for the current sequence
+ 	# $bytes: length of the byte run that will be replaced if it is bad
+ 	# $sum:   code point accumulated so far
+ 	$left = $bytes = $sum = 0;
+ 	for ( $i = 0; $i < $len; $i++ ) {
+ 		$ch = ord( $str[$i] );
+ 		if ( !$left ) {
+ 			if ( !( $ch & 0x80 ) ) {
+ 				# Plain ASCII, always valid.
+ 				continue;
+ 			}
+ 			# Number of continuation bytes announced by the lead byte.
+ 			if ( ( $ch & 0xFE ) == 0xFC ) {
+ 				$left = 5;
+ 			} elseif ( ( $ch & 0xFC ) == 0xF8 ) {
+ 				$left = 4;
+ 			} elseif ( ( $ch & 0xF8 ) == 0xF0 ) {
+ 				$left = 3;
+ 			} elseif ( ( $ch & 0xF0 ) == 0xE0 ) {
+ 				$left = 2;
+ 			} elseif ( ( $ch & 0xE0 ) == 0xC0 ) {
+ 				$left = 1;
+ 			} else {
+ 				$left = 0;
+ 			}
+ 			if ( $left ) {
+ 				$bytes = $left + 1;
+ 				# Keep only the payload bits of the lead byte.
+ 				$sum = $ch & ( 0xFF >> ( $bytes + 1 ) );
+ 				continue;
+ 			}
+ 			# Stray continuation byte or 0xFE/0xFF: replace this one byte.
+ 			$bytes = 1;
+ 		} elseif ( ( $ch & 0xC0 ) == 0x80 ) {
+ 			$sum = ( $sum << 6 ) + ( $ch & 0x3F );
+ 			if ( --$left ) {
+ 				continue;
+ 			}
+ 			# Sequence complete: accept it unless it is overlong, too long,
+ 			# out of range, a surrogate, or U+FFFE/U+FFFF.
+ 			$bad = ( $bytes == 2 && $sum < 0x80 ) ||
+ 				( $bytes == 3 && $sum < 0x800 ) ||
+ 				( $bytes == 4 && $sum < 0x10000 ) ||
+ 				$bytes > 4 || $sum > 0x10FFFF ||
+ 				( $sum >= 0xD800 && $sum <= 0xDFFF ) ||
+ 				$sum == 0xFFFE || $sum == 0xFFFF;
+ 			if ( !$bad ) {
+ 				continue;
+ 			}
+ 		} else {
+ 			# Sequence interrupted by a non-continuation byte: replace only
+ 			# the bytes consumed so far and step back so the interrupting
+ 			# byte is examined again on the next iteration.
+ 			$bytes -= $left;
+ 			$i--;
+ 		}
+ 		# Swap the $bytes bytes ending at $i for U+FFFD, then move $i to the
+ 		# last byte of the replacement and adjust the tracked length.
+ 		$str = ( substr( $str, 0, $i - $bytes + 1 ) .
+ 			"\xEF\xBF\xBD" .
+ 			substr( $str, $i + 1 ) );
+ 		$i += 3 - $bytes;
+ 		$len += 3 - $bytes;
+ 		$left = 0;
+ 	}
+ 	if ( $left ) {
+ 		# The string ended in the middle of a sequence: the last
+ 		# ( $bytes - $left ) bytes are an incomplete character.
+ 		$str = substr( $str, 0, $len - ( $bytes - $left ) ) . "\xEF\xBF\xBD";
+ 	}
+ 	return $str;
+ }
}
/**