From 5c750139a68468b6209863d5098a03c79f61ae64 Mon Sep 17 00:00:00 2001 From: Brian Wolff Date: Sat, 16 Apr 2011 15:32:19 +0000 Subject: [PATCH] (follow-up r86130) the normalizer_normalize function doesn't replace things like U+0001 which causes all sorts of pain and suffering if inserted into a page. (discovered at translatewiki) --- includes/normal/UtfNormal.php | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php index 75e3a08ddd..116fb8f083 100644 --- a/includes/normal/UtfNormal.php +++ b/includes/normal/UtfNormal.php @@ -80,18 +80,13 @@ class UtfNormal { */ static function cleanUp( $string ) { if( NORMALIZE_ICU ) { - # We exclude a few chars that ICU would not. - $string = preg_replace( - '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', - UTF8_REPLACEMENT, - $string ); - $string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string ); - $string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string ); + $string = self::replaceForNativeNormalize( $string ); # UnicodeString constructor fails if the string ends with a # head byte. Add a junk char at the end, we'll strip it off. return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" ); } elseif( NORMALIZE_INTL ) { + $string = self::replaceForNativeNormalize( $string ); $norm = normalizer_normalize( $string, Normalizer::FORM_C ); if( $norm === null || $norm === false ) { # normalizer_normalize will either return false or null @@ -764,4 +759,20 @@ class UtfNormal { } return $out; } + /** + * Replace C0 control characters (except tab, LF and CR), UTF8_FFFE and UTF8_FFFF + * with UTF8_REPLACEMENT; most of the native normalize functions keep them. + * + * @param $string String The string to sanitize before native normalization + * @return String The string with the unwanted characters replaced by UTF8_REPLACEMENT. 
+ */ + private static function replaceForNativeNormalize( $string ) { + $string = preg_replace( + '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', + UTF8_REPLACEMENT, + $string ); + $string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string ); + $string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string ); + return $string; + } } -- 2.20.1