From a518a5081af7dabdeb974c4b267c9efbb9b85b1b Mon Sep 17 00:00:00 2001 From: Brian Wolff Date: Fri, 15 Apr 2011 18:39:43 +0000 Subject: [PATCH] (follow-up r69626) Make it so the intl normalizer_normalize function is not fed an invalid sequence in UtfNormal::cleanUp normalizer_normalize seems to return false if fed an invalid unicode sequence (which is quite different from what our built-in normalization functions do). So fall back to quickIsNFCVerify if it returns false. (Noticed when investigating bug 28541). --- RELEASE-NOTES | 1 + includes/normal/UtfNormal.php | 22 +++++++++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/RELEASE-NOTES b/RELEASE-NOTES index 4fb62c311c..9af339d6f8 100644 --- a/RELEASE-NOTES +++ b/RELEASE-NOTES @@ -237,6 +237,7 @@ PHP if you have not done so prior to upgrading MediaWiki. * (bug 27473) Fix regression: bold, italic no longer interfere with linktrail for ca, kaa * (bug 28444) Fix regression: edit-on-doubleclick retains revision id again * ' character entity is now allowed in wikitext +* UtfNormal::cleanUp on an invalid utf-8 sequence no longer returns false if intl installed. === API changes in 1.18 === * (bug 26339) Throw warning when truncating an overlarge API result diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php index 84ebc76278..75e3a08ddd 100644 --- a/includes/normal/UtfNormal.php +++ b/includes/normal/UtfNormal.php @@ -79,7 +79,7 @@ class UtfNormal { * @return string a clean, shiny, normalized UTF-8 string */ static function cleanUp( $string ) { - if( NORMALIZE_ICU || NORMALIZE_INTL ) { + if( NORMALIZE_ICU ) { # We exclude a few chars that ICU would not. $string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', @@ -90,8 +90,24 @@ class UtfNormal { # UnicodeString constructor fails if the string ends with a # head byte. Add a junk char at the end, we'll strip it off. - if ( NORMALIZE_ICU ) return rtrim( utf8_normalize( $string . 
"\x01", UNORM_NFC ), "\x01" ); - if ( NORMALIZE_INTL ) return normalizer_normalize( $string, Normalizer::FORM_C ); + return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" ); + } elseif( NORMALIZE_INTL ) { + $norm = normalizer_normalize( $string, Normalizer::FORM_C ); + if( $norm === null || $norm === false ) { + # normalizer_normalize will either return false or null + # (depending on which doc you read) if invalid utf8 string. + # quickIsNFCVerify cleans up invalid sequences. + + if( UtfNormal::quickIsNFCVerify( $string ) ) { + # if that's true, the string is actually already normal. + return $string; + } else { + # Now we are valid but non-normal + return normalizer_normalize( $string, Normalizer::FORM_C ); + } + } else { + return $norm; + } } elseif( UtfNormal::quickIsNFCVerify( $string ) ) { # Side effect -- $string has had UTF-8 errors cleaned up. return $string; -- 2.20.1