From e4e75a58a6eace77977adbaa1c9199be6f432d7f Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@users.mediawiki.org>
Date: Sun, 14 Nov 2004 05:17:29 +0000
Subject: [PATCH] Support using ICU to do most of the heavy lifting in
 cleanUp() if the extension is loaded. Modestly faster for roman text (1-2x),
 16-20x faster than the PHP looping for already normalized Russian, Japanese,
 and Korean text.

---
 includes/normal/CleanUpTest.php | 14 ++++++++++++++
 includes/normal/Makefile        | 15 ++++++++++++++-
 includes/normal/UtfNormal.php   | 17 +++++++++++++++--
 3 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/includes/normal/CleanUpTest.php b/includes/normal/CleanUpTest.php
index 0eb4058819..badade6d1e 100644
--- a/includes/normal/CleanUpTest.php
+++ b/includes/normal/CleanUpTest.php
@@ -316,6 +316,20 @@ class CleanUpTest extends PHPUnit_TestCase {
 			bin2hex( $expect ),
 			bin2hex( UtfNormal::cleanUp( $text ) ) );
 	}
+	
+	function testBomRegression() { # U+FFFE plus stray tail/head bytes must each become U+FFFD
+		$text   = "\xef\xbf\xbe" . # U+FFFE, illegal char (noncharacter; the byte-swapped BOM)
+		          "\xb2" . # bad tail: continuation byte with no head byte before it
+		          "\xef" . # bad head: 3-byte lead byte not followed by tail bytes
+		          "\x59"; # plain ASCII 'Y', expected to pass through unchanged
+		$expect = "\xef\xbf\xbd" . # U+FFFD in place of U+FFFE
+		          "\xef\xbf\xbd" . # U+FFFD in place of the stray tail byte
+		          "\xef\xbf\xbd" . # U+FFFD in place of the truncated head byte
+		          "\x59";
+		$this->assertEquals(
+			bin2hex( $expect ), # compared hex-encoded, presumably to keep failure output readable
+			bin2hex( UtfNormal::cleanUp( $text ) ) );
+	}
 }
 
 
diff --git a/includes/normal/Makefile b/includes/normal/Makefile
index 1042e12711..fcdf238073 100644
--- a/includes/normal/Makefile
+++ b/includes/normal/Makefile
@@ -1,3 +1,5 @@
+.PHONY : all test testutf8 testclean icutest bench icubench clean distclean
+
 FETCH=wget
 #FETCH=fetch
 BASE=http://www.unicode.org/Public/UNIDATA
@@ -9,15 +11,26 @@ all : UtfNormalData.inc
 UtfNormalData.inc : UtfNormalGenerate.php UtfNormalUtil.php UnicodeData.txt CompositionExclusions.txt NormalizationCorrections.txt DerivedNormalizationProps.txt
 	$(PHP) UtfNormalGenerate.php
 
-test : testutf8 UtfNormalTest.php UtfNormalData.inc NormalizationTest.txt
+test : testutf8 testclean UtfNormalTest.php UtfNormalData.inc NormalizationTest.txt
 	$(PHP) UtfNormalTest.php
 
 testutf8 : Utf8Test.php UTF-8-test.txt
 	$(PHP) Utf8Test.php
 
+testclean : CleanUpTest.php
+	$(PHP) CleanUpTest.php
+
 bench : UtfNormalData.inc testdata/washington.txt testdata/berlin.txt testdata/tokyo.txt testdata/sociology.txt testdata/bulgakov.txt
 	$(PHP) UtfNormalBench.php
 
+icutest : UtfNormalData.inc NormalizationTest.txt UTF-8-test.txt # UTF-8-test.txt: same data prerequisite testutf8 declares for Utf8Test.php
+	$(PHP) Utf8Test.php --icu
+	$(PHP) CleanUpTest.php --icu
+	$(PHP) UtfNormalTest.php --icu
+
+icubench : UtfNormalData.inc testdata/washington.txt testdata/berlin.txt testdata/tokyo.txt testdata/sociology.txt testdata/bulgakov.txt
+	$(PHP) UtfNormalBench.php --icu
+
 clean :
 	rm -f UtfNormalData.inc
 
diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php
index 254e9c0472..a4c095c904 100644
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -126,10 +126,23 @@ class UtfNormal {
 	 * @return string a clean, shiny, normalized UTF-8 string
 	 */
 	function cleanUp( $string ) {
-		if( UtfNormal::quickIsNFCVerify( $string ) )
+		if( NORMALIZE_ICU ) {
+			# We exclude a few chars that ICU would not: C0 controls other than tab, LF and CR here, then U+FFFE.
+			$string = preg_replace(
+				'/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
+				UTF8_REPLACEMENT,
+				$string );
+			$str = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
+			
+			# UnicodeString constructor fails if the string ends with a head byte. Append a junk \x01 and strip
+			# it off after normalizing; the preg_replace() above leaves no \x01 of the caller's own in $str to lose.
+			return rtrim( utf8_normalize( $str . "\x01", UNORM_NFC ), "\x01" );
+		} elseif( UtfNormal::quickIsNFCVerify( $string ) ) {
+			# Side effect -- quickIsNFCVerify() has already cleaned up UTF-8 errors in $string.
 			return $string;
-		else
+		} else {
 			return UtfNormal::NFC( $string );
+		}
 	}
 
 	/**
-- 
2.20.1