From e4e75a58a6eace77977adbaa1c9199be6f432d7f Mon Sep 17 00:00:00 2001
From: Brion Vibber
Date: Sun, 14 Nov 2004 05:17:29 +0000
Subject: [PATCH] Support using ICU to do most of the heavy lifting in cleanUp() if the extension is loaded. Modestly faster for roman text (1-2x), 16-20x faster than the PHP looping for already normalized Russian, Japanese, and Korean text.

---
 includes/normal/CleanUpTest.php | 14 ++++++++++++++
 includes/normal/Makefile        | 15 ++++++++++++++-
 includes/normal/UtfNormal.php   | 17 +++++++++++++++--
 3 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/includes/normal/CleanUpTest.php b/includes/normal/CleanUpTest.php
index 0eb4058819..badade6d1e 100644
--- a/includes/normal/CleanUpTest.php
+++ b/includes/normal/CleanUpTest.php
@@ -316,6 +316,20 @@ class CleanUpTest extends PHPUnit_TestCase {
 			bin2hex( $expect ),
 			bin2hex( UtfNormal::cleanUp( $text ) ) );
 	}
+
+	function testBomRegression() {
+		$text   = "\xef\xbf\xbe" . # U+FFFE, illegal char
+		          "\xb2" . # bad tail
+		          "\xef" . # bad head
+		          "\x59";
+		$expect = "\xef\xbf\xbd" .
+		          "\xef\xbf\xbd" .
+		          "\xef\xbf\xbd" .
+		          "\x59";
+		$this->assertEquals(
+			bin2hex( $expect ),
+			bin2hex( UtfNormal::cleanUp( $text ) ) );
+	}
 }
 
 
diff --git a/includes/normal/Makefile b/includes/normal/Makefile
index 1042e12711..fcdf238073 100644
--- a/includes/normal/Makefile
+++ b/includes/normal/Makefile
@@ -1,3 +1,5 @@
+.PHONY : all test testutf8 testclean icutest bench icubench clean distclean
+
 FETCH=wget
 #FETCH=fetch
 BASE=http://www.unicode.org/Public/UNIDATA
@@ -9,15 +11,26 @@ all : UtfNormalData.inc
 UtfNormalData.inc : UtfNormalGenerate.php UtfNormalUtil.php UnicodeData.txt CompositionExclusions.txt NormalizationCorrections.txt DerivedNormalizationProps.txt
 	$(PHP) UtfNormalGenerate.php
 
-test : testutf8 UtfNormalTest.php UtfNormalData.inc NormalizationTest.txt
+test : testutf8 testclean UtfNormalTest.php UtfNormalData.inc NormalizationTest.txt
 	$(PHP) UtfNormalTest.php
 
 testutf8 : Utf8Test.php UTF-8-test.txt
 	$(PHP) Utf8Test.php
 
+testclean : CleanUpTest.php
+	$(PHP) CleanUpTest.php
+
 bench : UtfNormalData.inc testdata/washington.txt testdata/berlin.txt testdata/tokyo.txt testdata/sociology.txt testdata/bulgakov.txt
 	$(PHP) UtfNormalBench.php
 
+icutest : UtfNormalData.inc NormalizationTest.txt UTF-8-test.txt
+	$(PHP) Utf8Test.php --icu
+	$(PHP) CleanUpTest.php --icu
+	$(PHP) UtfNormalTest.php --icu
+
+icubench : UtfNormalData.inc testdata/washington.txt testdata/berlin.txt testdata/tokyo.txt testdata/sociology.txt testdata/bulgakov.txt
+	$(PHP) UtfNormalBench.php --icu
+
 clean :
 	rm -f UtfNormalData.inc
 
diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php
index 254e9c0472..a4c095c904 100644
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -126,10 +126,23 @@ class UtfNormal {
 	 * @return string a clean, shiny, normalized UTF-8 string
 	 */
 	function cleanUp( $string ) {
-		if( UtfNormal::quickIsNFCVerify( $string ) )
+		if( NORMALIZE_ICU ) {
+			# Replace C0 controls (except tab, LF, CR) and U+FFFE, which ICU would keep.
+			$string = preg_replace(
+				'/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
+				UTF8_REPLACEMENT,
+				$string );
+			$str = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
+
+			# UnicodeString constructor fails if the string ends with a head byte, so
+			# append a "\x01" sentinel and strip it; no input \x01 survives the replace.
+			return rtrim( utf8_normalize( $str . "\x01", UNORM_NFC ), "\x01" );
+		} elseif( UtfNormal::quickIsNFCVerify( $string ) ) {
+			# Side effect -- $string has had UTF-8 errors cleaned up.
 			return $string;
-		else
+		} else {
 			return UtfNormal::NFC( $string );
+		}
 	}
 	
 	/**
-- 
2.20.1