From 9ba6a6c74a1b48936045e6b7902ba1bfd7feb32b Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@users.mediawiki.org>
Date: Fri, 5 Nov 2004 00:26:09 +0000
Subject: [PATCH] cleanUp() optimization: split the string into pure ASCII
 chunks and chunks which need to be checked byte by byte. Over 5x speedup for
 German text sample.

---
 includes/normal/UtfNormal.php | 141 +++++++++++++++++-----------------
 1 file changed, 72 insertions(+), 69 deletions(-)

diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php
index 0737823ee1..1a4d0cbb77 100644
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -291,88 +291,91 @@ class UtfNormal {
 			}
 		}
 		
-		$len = strlen( $string );
+		# Chop the text into pure-ASCII and non-ASCII areas;
+		# large ASCII parts can be handled much more quickly.
+		# Don't chop up for little newlines or spaces, though,
+		# that wastes energy.
+		preg_match_all( '/([\x00-\x7f]+|[\x80-\xff][\x0a\x20\x80-\xff]*)/', $string, $matches );
+		
 		$out = '';
-		$tail = false;
 		$looksNormal = true;
-		$head = 0;
-		
-		for( $i = 0; $i < $len; $i++ ) {
-			$c = $string{$i};
-			$n = ord( $c );
-			if( $tail ) {
-				if( $n >= 0x80 && $n < 0xc0 ) {
-					$sequence .= $c;
-					if( --$remaining ) {
-						# Keep adding bytes...
-						continue;
-					}
-
-					if( isset( $checkit[$head] ) ) {
-						# Do some more detailed validity checks, for
-						# invalid characters and illegal sequences.
-						if( ( $head == 0xed && $sequence >= UTF8_SURROGATE_FIRST
-								&& $sequence <= UTF8_SURROGATE_LAST)
-							|| ($head  < 0xc2 && $sequence <= UTF8_OVERLONG_A)
-							|| ($head == 0xe0 && $sequence <= UTF8_OVERLONG_B)
-							|| ($head == 0xef && 
-								($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF)
-								|| ($sequence == UTF8_FFFE)
-								|| ($sequence == UTF8_FFFF) )
-							|| ($head == 0xf0 && $sequence <= UTF8_OVERLONG_C)
-							|| ($head >= 0xf0 && $sequence > UTF8_MAX) ) {
-							$out .= UTF8_REPLACEMENT;
-							$tail = false;
+		foreach( $matches[1] as $str ) {
+			if( $str{0} < "\x80" ) {
+				$out .= $str;
+				continue;
+			}
+			$len = strlen( $str );
+			$tail = false;
+			$head = 0;
+			
+			for( $i = 0; $i < $len; $i++ ) {
+				$c = $str{$i};
+				$n = ord( $c );
+				if( $tail ) {
+					if( $n >= 0x80 && $n < 0xc0 ) {
+						$sequence .= $c;
+						if( --$remaining ) {
+							# Keep adding bytes...
 							continue;
 						}
+	
+						if( isset( $checkit[$head] ) ) {
+							# Do some more detailed validity checks, for
+							# invalid characters and illegal sequences.
+							if( ( $head == 0xed && $sequence >= UTF8_SURROGATE_FIRST
+									&& $sequence <= UTF8_SURROGATE_LAST)
+								|| ($head  < 0xc2 && $sequence <= UTF8_OVERLONG_A)
+								|| ($head == 0xe0 && $sequence <= UTF8_OVERLONG_B)
+								|| ($head == 0xef && 
+									($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF)
+									|| ($sequence == UTF8_FFFE)
+									|| ($sequence == UTF8_FFFF) )
+								|| ($head == 0xf0 && $sequence <= UTF8_OVERLONG_C)
+								|| ($head >= 0xf0 && $sequence > UTF8_MAX) ) {
+								$out .= UTF8_REPLACEMENT;
+								$tail = false;
+								continue;
+							}
+						}
+						
+						if( isset( $utfCheckNFC[$sequence] ) ||
+							isset( $utfCombiningClass[$sequence] ) ) {
+							# If it's NO or MAYBE, we'll have to do the slow check.
+							$looksNormal = false;
+						}
+						
+						# The sequence is legal!
+						$out .= $sequence;
+						$tail = false;
+						$head = 0;
+						continue;
 					}
-					
-					if( isset( $utfCheckNFC[$sequence] ) ||
-						isset( $utfCombiningClass[$sequence] ) ) {
-						# If it's NO or MAYBE, we'll have to do the slow check.
-						$looksNormal = false;
-					}
-					
-					# The sequence is legal!
-					$out .= $sequence;
+					# Not a valid tail byte! Discard the char we've been building.
+					#printf ("Invalid '%x' in tail with %d remaining bytes\n", $n, $remaining );
 					$tail = false;
-					$head = 0;
-					continue;
+					$out .= UTF8_REPLACEMENT;
 				}
-				# Not a valid tail byte! DIscard the char we've been building.
-				#printf ("Invalid '%x' in tail with %d remaining bytes\n", $n, $remaining );
-				$tail = false;
-				$out .= UTF8_REPLACEMENT;
-			}
-			if( $n < 0x80 ) {
-				# Friendly ASCII chars.
-				# We can speed things up a bit for latin-based scripts
-				# where they tend to come in groups:
-				$out .= $c;
-				$i++;
-				while( $i < $len && ( $c = $string{$i} ) < "\x80" ) {
+				if( $n < 0x80 ) {
 					$out .= $c;
-					$i++;
-				}
-				$i--;
-			} elseif( $n < 0xc0 ) {
-				# illegal tail bytes or head byte of overlong sequence
-				if( $head == 0 ) {
-					# Don't add if we're continuing a too-long sequence
+				} elseif( $n < 0xc0 ) {
+					# illegal tail bytes or head byte of overlong sequence
+					if( $head == 0 ) {
+						# Don't add if we're continuing a too-long sequence
+						$out .= UTF8_REPLACEMENT;
+					}
+				} elseif( $n < 0xfe ) {
+					$tail = true;
+					$remaining = $tailBytes[$n];
+					$sequence = $c;
+					$head = $n;
+				} else {
 					$out .= UTF8_REPLACEMENT;
 				}
-			} elseif( $n < 0xfe ) {
-				$tail = true;
-				$remaining = $tailBytes[$n];
-				$sequence = $c;
-				$head = $n;
-			} else {
+			}
+			if( $tail ) {
 				$out .= UTF8_REPLACEMENT;
 			}
 		}
-		if( $tail ) {
-			$out .= UTF8_REPLACEMENT;
-		}
 		$string = $out;
 		return $looksNormal;
 	}
-- 
2.20.1