From: Brion Vibber Date: Fri, 5 Nov 2004 04:07:04 +0000 (+0000) Subject: More incremental optimization on cleanUp(): X-Git-Tag: 1.5.0alpha1~1399 X-Git-Url: https://git.cyclocoop.org/%28%28?a=commitdiff_plain;h=0db79dbed68b1c30065ed19e4a52200efb8b12c7;p=lhc%2Fweb%2Fwiklou.git More incremental optimization on cleanUp(): * when splitting ascii vs non-ascii chunks, don't split punctuation and control chars as aggressively; this benefits the Korean test data * use output buffer and echo; it's _slightly_ faster than string concatenation. * Separate the surrogate check from the others; many Korean letters fall in the adjacent area with the same head byte, so this gives a small speed boost on Korean text --- diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php index 283fa73b11..73a7d6243c 100644 --- a/includes/normal/UtfNormal.php +++ b/includes/normal/UtfNormal.php @@ -261,6 +261,8 @@ class UtfNormal { preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string ); # ASCII is always valid NFC! + # If we're only ever given plain ASCII, we can avoid the overhead + # of initializing the decomposition tables by skipping out early. if( !preg_match( '/[\x80-\xff]/', $string ) ) return true; UtfNormal::loadData(); @@ -297,59 +299,81 @@ class UtfNormal { # Chop the text into pure-ASCII and non-ASCII areas; # large ASCII parts can be handled much more quickly. - # Don't chop up for little newlines or spaces, though, + # Don't chop up Unicode areas for punctuation, though, # that wastes energy. - preg_match_all( '/([\x00-\x7f]+|[\x80-\xff][\x0a\x20\x80-\xff]*)/', $string, $matches ); + preg_match_all( + '/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/', + $string, $matches ); - $out = ''; + ob_start(); $looksNormal = true; foreach( $matches[1] as $str ) { if( $str{0} < "\x80" ) { - $out .= $str; + # ASCII chunk: guaranteed to be valid UTF-8 + # and in normal form C, so output it quick. + echo $str; continue; } + + # We'll have to examine the chunk byte by byte to ensure + # that it consists of valid UTF-8 sequences, and to see + # if any of them might not be normalized. + # + # Since PHP is not the fastest language on earth, some of + # this code is a little ugly with inner loop optimizations. + $len = strlen( $str ); $tail = false; $head = ''; - for( $i = 0; $i < $len; $i++ ) { - $c = $str{$i}; if( $tail ) { - if( $c >= "\x80" && $c < "\xc0" ) { + if( ( $c = $str{$i} ) >= "\x80" && $c < "\xc0" ) { $sequence .= $c; if( --$remaining ) { # Keep adding bytes... continue; } - + + # We have come to the end of the sequence... if( isset( $checkit[$head] ) ) { # Do some more detailed validity checks, for # invalid characters and illegal sequences. - $head = ord( $head ); - if( ( $head == 0xed && $sequence >= UTF8_SURROGATE_FIRST - && $sequence <= UTF8_SURROGATE_LAST) - || ($head < 0xc2 && $sequence <= UTF8_OVERLONG_A) - || ($head == 0xe0 && $sequence <= UTF8_OVERLONG_B) - || ($head == 0xef && - ($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF) - || ($sequence == UTF8_FFFE) - || ($sequence == UTF8_FFFF) ) - || ($head == 0xf0 && $sequence <= UTF8_OVERLONG_C) - || ($head >= 0xf0 && $sequence > UTF8_MAX) ) { - $out .= UTF8_REPLACEMENT; - $tail = false; - continue; + if( $head == "\xed" ) { + # 0xed is relatively frequent in Korean, which + # abuts the surrogate area, so we're doing + # this check separately. + if( $sequence >= UTF8_SURROGATE_FIRST ) { + echo UTF8_REPLACEMENT; + $tail = false; + continue; + } + } else { + $n = ord( $head ); + if( ($n < 0xc2 && $sequence <= UTF8_OVERLONG_A) + || ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B) + || ($n == 0xef && + ($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF) + || ($sequence == UTF8_FFFE) + || ($sequence == UTF8_FFFF) ) + || ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C) + || ($n >= 0xf0 && $sequence > UTF8_MAX) ) { + echo UTF8_REPLACEMENT; + $tail = false; + continue; + } } } if( isset( $utfCheckNFC[$sequence] ) || isset( $utfCombiningClass[$sequence] ) ) { - # If it's NO or MAYBE, we'll have to do the slow check. + # If it's NO or MAYBE, we'll have to rip + # the string apart and put it back together. + # That's going to be mighty slow. $looksNormal = false; } # The sequence is legal! - $out .= $sequence; + echo $sequence; $tail = false; $head = ''; continue; @@ -357,29 +381,29 @@ class UtfNormal { # Not a valid tail byte! DIscard the char we've been building. #printf ("Invalid '%x' in tail with %d remaining bytes\n", $n, $remaining ); $tail = false; - $out .= UTF8_REPLACEMENT; + echo UTF8_REPLACEMENT; } - if( $remaining = $tailBytes[$c] ) { + if( $remaining = $tailBytes[$c = $str{$i}] ) { $tail = true; - $sequence = $c; - $head = $c; + $sequence = $head = $c; } elseif( $c < "\x80" ) { - $out .= $c; + echo $c; } elseif( $c < "\xc0" ) { # illegal tail bytes or head byte of overlong sequence if( $head == '' ) { # Don't add if we're continuing a too-long sequence - $out .= UTF8_REPLACEMENT; + echo UTF8_REPLACEMENT; } } else { - $out .= UTF8_REPLACEMENT; + echo UTF8_REPLACEMENT; } } if( $tail ) { - $out .= UTF8_REPLACEMENT; + echo UTF8_REPLACEMENT; } } - $string = $out; + $string = ob_get_contents(); + ob_end_clean(); return $looksNormal; }