From: Brion Vibber <brion@users.mediawiki.org>
Date: Fri, 5 Nov 2004 04:07:04 +0000 (+0000)
Subject: More incremental optimization on cleanUp():
X-Git-Tag: 1.5.0alpha1~1399
X-Git-Url: https://git.cyclocoop.org/%28%28?a=commitdiff_plain;h=0db79dbed68b1c30065ed19e4a52200efb8b12c7;p=lhc%2Fweb%2Fwiklou.git

More incremental optimization on cleanUp():
* When splitting ASCII vs. non-ASCII chunks, don't split on punctuation and control characters as aggressively; this benefits the Korean test data.
* Use an output buffer and echo; it's _slightly_ faster than string concatenation.
* Separate the surrogate check from the others; many Korean letters fall in the adjacent area with the same head byte, so this gives a small speed boost on Korean text.
---

diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php
index 283fa73b11..73a7d6243c 100644
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -261,6 +261,8 @@ class UtfNormal {
 		preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );
 		
 		# ASCII is always valid NFC!
+		# If we're only ever given plain ASCII, we can avoid the overhead
+		# of initializing the decomposition tables by skipping out early.
 		if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
 		
 		UtfNormal::loadData();
@@ -297,59 +299,81 @@ class UtfNormal {
 		
 		# Chop the text into pure-ASCII and non-ASCII areas;
 		# large ASCII parts can be handled much more quickly.
-		# Don't chop up for little newlines or spaces, though,
+		# Don't chop up Unicode areas for punctuation, though,
 		# that wastes energy.
-		preg_match_all( '/([\x00-\x7f]+|[\x80-\xff][\x0a\x20\x80-\xff]*)/', $string, $matches );
+		preg_match_all(
+			'/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
+			$string, $matches );
 		
-		$out = '';
+		ob_start();
 		$looksNormal = true;
 		foreach( $matches[1] as $str ) {
 			if( $str{0} < "\x80" ) {
-				$out .= $str;
+				# ASCII chunk: guaranteed to be valid UTF-8
+				# and in normal form C, so output it quick.
+				echo $str;
 				continue;
 			}
+			
+			# We'll have to examine the chunk byte by byte to ensure
+			# that it consists of valid UTF-8 sequences, and to see
+			# if any of them might not be normalized.
+			#
+			# Since PHP is not the fastest language on earth, some of
+			# this code is a little ugly with inner loop optimizations.
+			
 			$len = strlen( $str );
 			$tail = false;
 			$head = '';
-			
 			for( $i = 0; $i < $len; $i++ ) {
-				$c = $str{$i};
 				if( $tail ) {
-					if( $c >= "\x80" && $c < "\xc0" ) {
+					if( ( $c = $str{$i} ) >= "\x80" && $c < "\xc0" ) {
 						$sequence .= $c;
 						if( --$remaining ) {
 							# Keep adding bytes...
 							continue;
 						}
-	
+						
+						# We have come to the end of the sequence...
 						if( isset( $checkit[$head] ) ) {
 							# Do some more detailed validity checks, for
 							# invalid characters and illegal sequences.
-							$head = ord( $head );
-							if( ( $head == 0xed && $sequence >= UTF8_SURROGATE_FIRST
-									&& $sequence <= UTF8_SURROGATE_LAST)
-								|| ($head  < 0xc2 && $sequence <= UTF8_OVERLONG_A)
-								|| ($head == 0xe0 && $sequence <= UTF8_OVERLONG_B)
-								|| ($head == 0xef && 
-									($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF)
-									|| ($sequence == UTF8_FFFE)
-									|| ($sequence == UTF8_FFFF) )
-								|| ($head == 0xf0 && $sequence <= UTF8_OVERLONG_C)
-								|| ($head >= 0xf0 && $sequence > UTF8_MAX) ) {
-								$out .= UTF8_REPLACEMENT;
-								$tail = false;
-								continue;
+							if( $head == "\xed" ) {
+								# 0xed is relatively frequent in Korean, which
+								# abuts the surrogate area, so we're doing
+								# this check separately.
+								if( $sequence >= UTF8_SURROGATE_FIRST ) {
+									echo UTF8_REPLACEMENT;
+									$tail = false;
+									continue;
+								}
+							} else {
+								$n = ord( $head );
+								if(    ($n  < 0xc2 && $sequence <= UTF8_OVERLONG_A)
+									|| ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B)
+									|| ($n == 0xef && 
+										($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF)
+										|| ($sequence == UTF8_FFFE)
+										|| ($sequence == UTF8_FFFF) )
+									|| ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C)
+									|| ($n >= 0xf0 && $sequence > UTF8_MAX) ) {
+									echo UTF8_REPLACEMENT;
+									$tail = false;
+									continue;
+								}
 							}
 						}
 						
 						if( isset( $utfCheckNFC[$sequence] ) ||
 							isset( $utfCombiningClass[$sequence] ) ) {
-							# If it's NO or MAYBE, we'll have to do the slow check.
+							# If it's NO or MAYBE, we'll have to rip
+							# the string apart and put it back together.
+							# That's going to be mighty slow.
 							$looksNormal = false;
 						}
 						
 						# The sequence is legal!
-						$out .= $sequence;
+						echo $sequence;
 						$tail = false;
 						$head = '';
 						continue;
@@ -357,29 +381,29 @@ class UtfNormal {
 					# Not a valid tail byte! DIscard the char we've been building.
 					#printf ("Invalid '%x' in tail with %d remaining bytes\n", $n, $remaining );
 					$tail = false;
-					$out .= UTF8_REPLACEMENT;
+					echo UTF8_REPLACEMENT;
 				}
-				if( $remaining = $tailBytes[$c] ) {
+				if( $remaining = $tailBytes[$c = $str{$i}] ) {
 					$tail = true;
-					$sequence = $c;
-					$head = $c;
+					$sequence = $head = $c;
 				} elseif( $c < "\x80" ) {
-					$out .= $c;
+					echo $c;
 				} elseif( $c < "\xc0" ) {
 					# illegal tail bytes or head byte of overlong sequence
 					if( $head == '' ) {
 						# Don't add if we're continuing a too-long sequence
-						$out .= UTF8_REPLACEMENT;
+						echo UTF8_REPLACEMENT;
 					}
 				} else {
-					$out .= UTF8_REPLACEMENT;
+					echo UTF8_REPLACEMENT;
 				}
 			}
 			if( $tail ) {
-				$out .= UTF8_REPLACEMENT;
+				echo UTF8_REPLACEMENT;
 			}
 		}
-		$string = $out;
+		$string = ob_get_contents();
+		ob_end_clean();
 		return $looksNormal;
 	}