From eae361e2f0d216b247439c1453eaf27e48d8aaf9 Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@users.mediawiki.org>
Date: Sun, 7 Nov 2004 11:28:00 +0000
Subject: [PATCH] cleanUp() optimization: speed up Japanese, Korean tests by
 another 15% by rearranging the loop and avoiding rebuilding the string if
 there are no illegal characters. Removed restrictions on U+FDD0 and friends;
 these do seem to be allowed by XML, though they 'recommend' you avoid them.

---
 includes/normal/CleanUpTest.php |  34 +++---
 includes/normal/UtfNormal.php   | 199 ++++++++++++++++++++------------
 2 files changed, 148 insertions(+), 85 deletions(-)

diff --git a/includes/normal/CleanUpTest.php b/includes/normal/CleanUpTest.php
index d165731a90..64f7b63904 100644
--- a/includes/normal/CleanUpTest.php
+++ b/includes/normal/CleanUpTest.php
@@ -53,8 +53,7 @@ class CleanUpTest extends PHPUnit_TestCase {
 			    $i == 0x000a ||
 			    $i == 0x000d ||
 			    ($i > 0x001f && $i < UNICODE_SURROGATE_FIRST) ||
-			    ($i > UNICODE_SURROGATE_LAST && $i < 0xfdd0 ) ||
-			    ($i > 0xfdef && $i < 0xfffe ) ||
+			    ($i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
 			    ($i > 0xffff && $i <= UNICODE_MAX ) ) {
 				if( isset( $utfCanonicalComp[$char] ) || isset( $utfCanonicalDecomp[$char] ) ) {
 				    $comp = UtfNormal::NFC( $char );
@@ -94,11 +93,14 @@ class CleanUpTest extends PHPUnit_TestCase {
 					bin2hex( $char ), 
 					bin2hex( $clean ),
 					"ASCII byte $x should be intact" );
+				if( $char != $clean ) return;
 			} else {
+				$norm = $head . UTF8_REPLACEMENT . $tail;
 				$this->assertEquals(
-					bin2hex( $head . UTF8_REPLACEMENT . $tail ),
+					bin2hex( $norm ),
 					bin2hex( $clean ),
 					"Forbidden byte $x should be rejected" );
+				if( $norm != $clean ) return;
 			}
 		}
 	}
@@ -119,21 +121,27 @@ class CleanUpTest extends PHPUnit_TestCase {
 				if( $first > 0xc1 &&
 				    $first < 0xe0 &&
 				    $second < 0xc0 ) {
+				    $norm = UtfNormal::NFC( $char );
 					$this->assertEquals(
-						bin2hex( UtfNormal::NFC( $char ) ), 
+						bin2hex( $norm ), 
 						bin2hex( $clean ),
 						"Pair $x should be intact" );
+				    if( $norm != $clean ) return;
 				} elseif( $first > 0xfd || $second > 0xbf ) {
 					# fe and ff are not legal head bytes -- expect two replacement chars
+					$norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
 					$this->assertEquals(
-						bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
+						bin2hex( $norm ),
 						bin2hex( $clean ),
 						"Forbidden pair $x should be rejected" );
+					if( $norm != $clean ) return;
 				} else {
+					$norm = $head . UTF8_REPLACEMENT . $tail;
 					$this->assertEquals(
-						bin2hex( $head . UTF8_REPLACEMENT . $tail ),
+						bin2hex( $norm ),
 						bin2hex( $clean ),
 						"Forbidden pair $x should be rejected" );
+					if( $norm != $clean ) return;
 				}
 			}
 		}
@@ -141,9 +149,9 @@ class CleanUpTest extends PHPUnit_TestCase {
 
 	function testTripleBytes() {
 		$this->doTestTripleBytes( '', '' );
-		#$this->doTestTripleBytes( 'x', '' );
-		#$this->doTestTripleBytes( '', 'x' );
-		#$this->doTestTripleBytes( 'x', 'x' );
+		$this->doTestTripleBytes( 'x', '' );
+		$this->doTestTripleBytes( '', 'x' );
+		$this->doTestTripleBytes( 'x', 'x' );
 	}
 	
 	function doTestTripleBytes( $head, $tail ) {
@@ -160,13 +168,13 @@ class CleanUpTest extends PHPUnit_TestCase {
 						$third < 0xc0 ) {
 						if( $first == 0xe0 && $second < 0xa0 ) {
 							$this->assertEquals(
-								bin2hex( UTF8_REPLACEMENT ), 
+								bin2hex( $head . UTF8_REPLACEMENT . $tail ), 
 								bin2hex( $clean ),
 								"Overlong triplet $x should be rejected" );
 						} elseif( $first == 0xed && 
 							( chr( $first ) . chr( $second ) . chr( $third ))  >= UTF8_SURROGATE_FIRST ) {
 							$this->assertEquals(
-								bin2hex( UTF8_REPLACEMENT ), 
+								bin2hex( $head . UTF8_REPLACEMENT . $tail ), 
 								bin2hex( $clean ),
 								"Surrogate triplet $x should be rejected" );
 						} else {
@@ -177,12 +185,12 @@ class CleanUpTest extends PHPUnit_TestCase {
 						}
 					} elseif( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
 						$this->assertEquals(
-							bin2hex( $head . UtfNormal::NFC( chr( $first ) . chr( $second ) ) . UTF8_REPLACEMENT . $tail ),
+							bin2hex( UtfNormal::NFC( $head . chr( $first ) . chr( $second ) ) . UTF8_REPLACEMENT . $tail ),
 							bin2hex( $clean ),
 							"Valid 2-byte $x + broken tail" );
 					} elseif( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
 						$this->assertEquals(
-							bin2hex( $head . UTF8_REPLACEMENT . UtfNormal::NFC( chr( $second ) . chr( $third ) ) . $tail ),
+							bin2hex( $head . UTF8_REPLACEMENT . UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail ) ),
 							bin2hex( $clean ),
 							"Broken head + valid 2-byte $x" );
 					} elseif( $first > 0xfd && ( ( $second > 0xbf && $third > 0xbf ) || ($second < 0xc0 && $third < 0xc0 ) || ($second > 0xfd ) || ($third > 0xfd) ) ) {
diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php
index b3592908f9..2883342f8e 100644
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -265,11 +265,12 @@ class UtfNormal {
 		# of initializing the decomposition tables by skipping out early.
 		if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
 		
-		UtfNormal::loadData();
-		global $utfCheckNFC, $utfCombiningClass;
-		
 		static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
 		if( !isset( $checkit ) ) {
+			# Load/build some scary lookup tables...
+			UtfNormal::loadData();
+			global $utfCheckNFC, $utfCombiningClass;
+			
 			$utfCheckOrCombining = array_merge( $utfCheckNFC, $utfCombiningClass );
 
 			# Head bytes for sequences which we should do further validity checks
@@ -278,6 +279,8 @@ class UtfNormal {
 						   0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
 						   0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );
 			
+			# Each UTF-8 head byte is followed by a certain
+			# number of tail bytes.
 			$tailBytes = array();
 			for( $n = 0; $n < 256; $n++ ) {
 				if( $n < 0xc0 ) {
@@ -307,13 +310,16 @@ class UtfNormal {
 			'/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
 			$string, $matches );
 		
-		ob_start();
 		$looksNormal = true;
+		$base = 0;
+		$replace = array();
 		foreach( $matches[1] as $str ) {
+			$chunk = strlen( $str );
+			
 			if( $str{0} < "\x80" ) {
 				# ASCII chunk: guaranteed to be valid UTF-8
-				# and in normal form C, so output it quick.
-				echo $str;
+				# and in normal form C, so skip over it.
+				$base += $chunk;
 				continue;
 			}
 			
@@ -324,87 +330,136 @@ class UtfNormal {
 			# Since PHP is not the fastest language on earth, some of
 			# this code is a little ugly with inner loop optimizations.
 			
-			$len = strlen( $str ) + 1;
-			$tail = 0;
 			$head = '';
-			for( $i = 0; --$len; ++$i ) {
-				if( $tail ) {
-					if( ( $c = $str{$i} ) >= "\x80" && $c < "\xc0" ) {
-						$sequence .= $c;
-						if( --$remaining ) {
-							# Keep adding bytes...
-							continue;
-						}
-						
-						# We have come to the end of the sequence...
-						$tail = 0;
-						
-						if( isset( $checkit[$head] ) ) {
-							# Do some more detailed validity checks, for
-							# invalid characters and illegal sequences.
-							if( $head == "\xed" ) {
-								# 0xed is relatively frequent in Korean, which
-								# abuts the surrogate area, so we're doing
-								# this check separately.
-								if( $sequence >= UTF8_SURROGATE_FIRST ) {
-									echo UTF8_REPLACEMENT;
-									continue;
-								}
+			$len = $chunk + 1; # Counting down is faster. I'm *so* sorry.
+			
+			for( $i = -1; --$len; ) {
+				if( $remaining = $tailBytes[$c = $str{++$i}] ) {
+					# UTF-8 head byte!
+					$sequence = $head = $c;
+					do {
+						# Look for the defined number of tail bytes...
+						if( --$len && ( $c = $str{++$i} ) >= "\x80" && $c < "\xc0" ) {
+							# Legal tail bytes are nice.
+							$sequence .= $c;
+						} else {
+							if( 0 == $len ) {
+								# Premature end of string!
+								# Drop a replacement character into output to
+								# represent the invalid UTF-8 sequence.
+								$replace[] = array( UTF8_REPLACEMENT,
+													$base + $i + 1 - strlen( $sequence ),
+													strlen( $sequence ) );
+								$base += $chunk;
+								break 2;
 							} else {
-								$n = ord( $head );
-								if(    ($n  < 0xc2 && $sequence <= UTF8_OVERLONG_A)
-									|| ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B)
-									|| ($n == 0xef && 
-										($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF)
-										|| ($sequence == UTF8_FFFE)
-										|| ($sequence == UTF8_FFFF) )
-									|| ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C)
-									|| ($n >= 0xf0 && $sequence > UTF8_MAX) ) {
-									echo UTF8_REPLACEMENT;
-									continue;
-								}
+								# Illegal tail byte; abandon the sequence.
+								$replace[] = array( UTF8_REPLACEMENT,
+													$base + $i - strlen( $sequence ),
+													strlen( $sequence ) );
+								# Back up and reprocess this byte; it may itself
+								# be a legal ASCII or UTF-8 sequence head.
+								--$i;
+								++$len;
+								continue 2;
 							}
 						}
-						
-						if( isset( $utfCheckOrCombining[$sequence] ) ) {
-							# If it's NO or MAYBE, we'll have to rip
-							# the string apart and put it back together.
-							# That's going to be mighty slow.
-							$looksNormal = false;
+					} while( --$remaining );
+
+					if( isset( $checkit[$head] ) ) {
+						# Do some more detailed validity checks, for
+						# invalid characters and illegal sequences.
+						if( $head == "\xed" ) {
+							# 0xed is relatively frequent in Korean, which
+							# abuts the surrogate area, so we're doing
+							# this check separately to speed things up.
+							
+							if( $sequence >= UTF8_SURROGATE_FIRST ) {
+								# Surrogates are legal only in UTF-16 code.
+								# They are totally forbidden here in UTF-8
+								# utopia.
+								$replace[] = array( UTF8_REPLACEMENT,
+								             $base + $i + 1 - strlen( $sequence ),
+								             strlen( $sequence ) );
+								continue;
+							}
+						} else {
+							# Slower, but rarer checks...
+							$n = ord( $head );
+							if(
+								# "Overlong sequences" are those that are syntactically
+								# correct but use more UTF-8 bytes than are necessary to
+								# encode a character. Naïve string comparisons can be
+								# tricked into failing to see a match for an ASCII
+								# character, for instance, which can be a security hole
+								# if blacklist checks are being used.
+							       ($n  < 0xc2 && $sequence <= UTF8_OVERLONG_A)
+								|| ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B)
+								|| ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C)
+								
+								# U+FFFE and U+FFFF are explicitly forbidden in Unicode.
+								|| ($n == 0xef && 
+									   ($sequence == UTF8_FFFE)
+									|| ($sequence == UTF8_FFFF) )
+								
+								# Unicode has been limited to 21 bits; longer
+								# sequences are not allowed.
+								|| ($n >= 0xf0 && $sequence > UTF8_MAX) ) {
+								
+								$replace[] = array( UTF8_REPLACEMENT,
+								                    $base + $i + 1 - strlen( $sequence ), 
+								                    strlen( $sequence ) );
+								continue;
+							}
 						}
-						
-						# The sequence is legal!
-						echo $sequence;
-						$head = '';
-						continue;
 					}
-					# Not a valid tail byte! DIscard the char we've been building.
-					$tail = false;
-					echo UTF8_REPLACEMENT;
-				}
-				if( $remaining = $tailBytes[$c = $str{$i}] ) {
-					$tail = 1;
-					$sequence = $head = $c;
+					
+					if( isset( $utfCheckOrCombining[$sequence] ) ) {
+						# If it's NO or MAYBE, we'll have to rip
+						# the string apart and put it back together.
+						# That's going to be mighty slow.
+						$looksNormal = false;
+					}
+					
+					# The sequence is legal!
+					$head = '';
 				} elseif( $c < "\x80" ) {
-					echo $c;
+					# ASCII byte.
 				} elseif( $c < "\xc0" ) {
-					# illegal tail bytes or head byte of overlong sequence
+					# Illegal tail bytes
 					if( $head == '' ) {
-						# Don't add if we're continuing a too-long sequence
-						echo UTF8_REPLACEMENT;
+						# Out of the blue!
+						$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
+					} else {
+						# Don't add if we're continuing a broken sequence;
+						# we already put a replacement character when we looked
+						# at the broken sequence.
+						$replace[] = array( '', $base + $i, 1 );
 					}
 				} else {
-					echo UTF8_REPLACEMENT;
+					# Miscellaneous freaks.
+					$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
+				}
+			}
+			$base += $chunk;
+		}
+		if( count( $replace ) ) {
+			# There were illegal UTF-8 sequences we need to fix up.
+			$out = '';
+			$last = 0;
+			foreach( $replace as $rep ) {
+				list( $replacement, $start, $length ) = $rep;
+				if( $last < $start ) {
+					$out .= substr( $string, $last, $start - $last );
 				}
+				$out .= $replacement;
+				$last = $start + $length;
 			}
-			if( $tail ) {
-				# We ended the chunk in the middle of a sequence;
-				# that's so not cool.
-				echo UTF8_REPLACEMENT;
+			if( $last < strlen( $string ) ) {
+				$out .= substr( $string, $last );
 			}
+			$string = $out;
 		}
-		$string = ob_get_contents();
-		ob_end_clean();
 		return $looksNormal;
 	}
 	
-- 
2.20.1