From: Brion Vibber <brion@users.mediawiki.org>
Date: Fri, 5 Nov 2004 00:47:03 +0000 (+0000)
Subject: cleanUp() optimization: about 1/8 speed boost on unicode-dominant text (Japanese... 
X-Git-Tag: 1.5.0alpha1~1400
X-Git-Url: http://git.cyclocoop.org/fichier?a=commitdiff_plain;h=874f8b48c6e10bdad8c2c3da282bbb5bea2f6f17;p=lhc%2Fweb%2Fwiklou.git

cleanUp() optimization: about 1/8 speed boost on unicode-dominant text (Japanese, Korean test data)
---

diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php
index 1a4d0cbb77..283fa73b11 100644
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -269,14 +269,16 @@ class UtfNormal {
 		static $checkit = null, $tailBytes = null;
 		if( !isset( $checkit ) ) {
 			# Head bytes for sequences which we should do further validity checks
-			$checkit = array_flip(
+			$checkit = array_flip( array_map( 'chr',
 					array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
 						   0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
-						   0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) );
+						   0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );
 			
 			$tailBytes = array();
-			for( $n = 0xc0; $n < 0xfe; $n++ ) {
-				if( $n < 0xe0 ) {
+			for( $n = 0; $n < 256; $n++ ) {
+				if( $n < 0xc0 ) {
+					$remaining = 0;
+				} elseif( $n < 0xe0 ) {
 					$remaining = 1;
 				} elseif( $n < 0xf0 ) {
 					$remaining = 2;
@@ -286,8 +288,10 @@ class UtfNormal {
 					$remaining = 4;
 				} elseif( $n < 0xfe ) {
 					$remaining = 5;
+				} else {
+					$remaining = 0;
 				}
-				$tailBytes[$n] = $remaining;
+				$tailBytes[chr($n)] = $remaining;
 			}
 		}
 		
@@ -306,13 +310,12 @@ class UtfNormal {
 			}
 			$len = strlen( $str );
 			$tail = false;
-			$head = 0;
+			$head = '';
 			
 			for( $i = 0; $i < $len; $i++ ) {
 				$c = $str{$i};
-				$n = ord( $c );
 				if( $tail ) {
-					if( $n >= 0x80 && $n < 0xc0 ) {
+					if( $c >= "\x80" && $c < "\xc0" ) {
 						$sequence .= $c;
 						if( --$remaining ) {
 							# Keep adding bytes...
@@ -322,6 +325,7 @@ class UtfNormal {
 						if( isset( $checkit[$head] ) ) {
 							# Do some more detailed validity checks, for
 							# invalid characters and illegal sequences.
+							$head = ord( $head );
 							if( ( $head == 0xed && $sequence >= UTF8_SURROGATE_FIRST
 									&& $sequence <= UTF8_SURROGATE_LAST)
 								|| ($head  < 0xc2 && $sequence <= UTF8_OVERLONG_A)
@@ -347,7 +351,7 @@ class UtfNormal {
 						# The sequence is legal!
 						$out .= $sequence;
 						$tail = false;
-						$head = 0;
+						$head = '';
 						continue;
 					}
 					# Not a valid tail byte! DIscard the char we've been building.
@@ -355,19 +359,18 @@ class UtfNormal {
 					$tail = false;
 					$out .= UTF8_REPLACEMENT;
 				}
-				if( $n < 0x80 ) {
+				if( $remaining = $tailBytes[$c] ) {
+					$tail = true;
+					$sequence = $c;
+					$head = $c;
+				} elseif( $c < "\x80" ) {
 					$out .= $c;
-				} elseif( $n < 0xc0 ) {
+				} elseif( $c < "\xc0" ) {
 					# illegal tail bytes or head byte of overlong sequence
-					if( $head == 0 ) {
+					if( $head == '' ) {
 						# Don't add if we're continuing a too-long sequence
 						$out .= UTF8_REPLACEMENT;
 					}
-				} elseif( $n < 0xfe ) {
-					$tail = true;
-					$remaining = $tailBytes[$n];
-					$sequence = $c;
-					$head = $n;
 				} else {
 					$out .= UTF8_REPLACEMENT;
 				}