Modestly faster for Roman text (1-2x); 16-20x faster than the PHP looping for already-normalized Russian, Japanese, and Korean text.
bin2hex( $expect ),
bin2hex( UtfNormal::cleanUp( $text ) ) );
}
+
+ /**
+ * Regression check for cleanUp(): U+FFFE followed by a stray tail
+ * byte and a stray head byte is expected to come back as three
+ * U+FFFD replacement characters, with the trailing "Y" (0x59) intact.
+ */
+ function testBomRegression() {
+ $text = "\xef\xbf\xbe" . # U+FFFE, illegal char
+ "\xb2" . # bad tail
+ "\xef" . # bad head
+ "\x59";
+ # One U+FFFD (ef bf bd) is expected per bad sequence above.
+ $expect = "\xef\xbf\xbd" .
+ "\xef\xbf\xbd" .
+ "\xef\xbf\xbd" .
+ "\x59";
+ # Compare hex dumps rather than the raw byte strings.
+ $this->assertEquals(
+ bin2hex( $expect ),
+ bin2hex( UtfNormal::cleanUp( $text ) ) );
+ }
}
+.PHONY : all test testutf8 testclean icutest bench icubench clean distclean
+
FETCH=wget
#FETCH=fetch
BASE=http://www.unicode.org/Public/UNIDATA
UtfNormalData.inc : UtfNormalGenerate.php UtfNormalUtil.php UnicodeData.txt CompositionExclusions.txt NormalizationCorrections.txt DerivedNormalizationProps.txt
$(PHP) UtfNormalGenerate.php
-test : testutf8 UtfNormalTest.php UtfNormalData.inc NormalizationTest.txt
+test : testutf8 testclean UtfNormalTest.php UtfNormalData.inc NormalizationTest.txt
$(PHP) UtfNormalTest.php
testutf8 : Utf8Test.php UTF-8-test.txt
$(PHP) Utf8Test.php
+testclean : CleanUpTest.php
+ $(PHP) CleanUpTest.php
+
bench : UtfNormalData.inc testdata/washington.txt testdata/berlin.txt testdata/tokyo.txt testdata/sociology.txt testdata/bulgakov.txt
$(PHP) UtfNormalBench.php
+# Run the three test scripts with the --icu flag.
+# UTF-8-test.txt is a prerequisite because Utf8Test.php is run here and
+# the testutf8 rule names that file as the script's input.
+icutest : UtfNormalData.inc NormalizationTest.txt UTF-8-test.txt
+ $(PHP) Utf8Test.php --icu
+ $(PHP) CleanUpTest.php --icu
+ $(PHP) UtfNormalTest.php --icu
+
+icubench : UtfNormalData.inc testdata/washington.txt testdata/berlin.txt testdata/tokyo.txt testdata/sociology.txt testdata/bulgakov.txt
+ $(PHP) UtfNormalBench.php --icu
+
clean :
rm -f UtfNormalData.inc
* @return string a clean, shiny, normalized UTF-8 string
*/
function cleanUp( $string ) {
- if( UtfNormal::quickIsNFCVerify( $string ) )
+ if( NORMALIZE_ICU ) {
+ # We exclude a few chars that ICU would not: every byte below
+ # 0x20 except tab (0x09), LF (0x0a) and CR (0x0d) is swapped
+ # for UTF8_REPLACEMENT before ICU sees the string.
+ $string = preg_replace(
+ '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
+ UTF8_REPLACEMENT,
+ $string );
+ # Occurrences of UTF8_FFFE are replaced the same way.
+ $str = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
+
+ # UnicodeString constructor fails if the string ends with a
+ # head byte. Append a junk "\x01" before normalizing and strip
+ # it off afterwards. Any "\x01" in the input was already
+ # replaced by the preg_replace above, so only the appended
+ # sentinel should be left for rtrim to remove.
+ return rtrim( utf8_normalize( $str . "\x01", UNORM_NFC ), "\x01" );
+ } elseif( UtfNormal::quickIsNFCVerify( $string ) ) {
+ # Side effect -- $string has had UTF-8 errors cleaned up.
return $string;
- else
+ } else {
+ # Not verifiably NFC already: run the full NFC normalization.
return UtfNormal::NFC( $string );
+ }
}
/**