From 9ba6a6c74a1b48936045e6b7902ba1bfd7feb32b Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Fri, 5 Nov 2004 00:26:09 +0000 Subject: [PATCH] cleanUp() optimization: split the string into pure ASCII chunks and chunks which need to be checked byte by byte. Over 5x speedup for German text sample. --- includes/normal/UtfNormal.php | 141 +++++++++++++++++----------------- 1 file changed, 72 insertions(+), 69 deletions(-) diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php index 0737823ee1..1a4d0cbb77 100644 --- a/includes/normal/UtfNormal.php +++ b/includes/normal/UtfNormal.php @@ -291,88 +291,91 @@ class UtfNormal { } } - $len = strlen( $string ); + # Chop the text into pure-ASCII and non-ASCII areas; + # large ASCII parts can be handled much more quickly. + # Don't chop up for little newlines or spaces, though, + # that wastes energy. + preg_match_all( '/([\x00-\x7f]+|[\x80-\xff][\x0a\x20\x80-\xff]*)/', $string, $matches ); + $out = ''; - $tail = false; $looksNormal = true; - $head = 0; - - for( $i = 0; $i < $len; $i++ ) { - $c = $string{$i}; - $n = ord( $c ); - if( $tail ) { - if( $n >= 0x80 && $n < 0xc0 ) { - $sequence .= $c; - if( --$remaining ) { - # Keep adding bytes... - continue; - } - - if( isset( $checkit[$head] ) ) { - # Do some more detailed validity checks, for - # invalid characters and illegal sequences. - if( ( $head == 0xed && $sequence >= UTF8_SURROGATE_FIRST - && $sequence <= UTF8_SURROGATE_LAST) - || ($head < 0xc2 && $sequence <= UTF8_OVERLONG_A) - || ($head == 0xe0 && $sequence <= UTF8_OVERLONG_B) - || ($head == 0xef && - ($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF) - || ($sequence == UTF8_FFFE) - || ($sequence == UTF8_FFFF) ) - || ($head == 0xf0 && $sequence <= UTF8_OVERLONG_C) - || ($head >= 0xf0 && $sequence > UTF8_MAX) ) { - $out .= UTF8_REPLACEMENT; - $tail = false; + foreach( $matches[1] as $str ) { + if( $str{0} < "\x80" ) { + $out .= $str; + continue; + } + $len = strlen( $str ); + $tail = false; + $head = 0; + + for( $i = 0; $i < $len; $i++ ) { + $c = $str{$i}; + $n = ord( $c ); + if( $tail ) { + if( $n >= 0x80 && $n < 0xc0 ) { + $sequence .= $c; + if( --$remaining ) { + # Keep adding bytes... continue; } + + if( isset( $checkit[$head] ) ) { + # Do some more detailed validity checks, for + # invalid characters and illegal sequences. + if( ( $head == 0xed && $sequence >= UTF8_SURROGATE_FIRST + && $sequence <= UTF8_SURROGATE_LAST) + || ($head < 0xc2 && $sequence <= UTF8_OVERLONG_A) + || ($head == 0xe0 && $sequence <= UTF8_OVERLONG_B) + || ($head == 0xef && + ($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF) + || ($sequence == UTF8_FFFE) + || ($sequence == UTF8_FFFF) ) + || ($head == 0xf0 && $sequence <= UTF8_OVERLONG_C) + || ($head >= 0xf0 && $sequence > UTF8_MAX) ) { + $out .= UTF8_REPLACEMENT; + $tail = false; + continue; + } + } + + if( isset( $utfCheckNFC[$sequence] ) || + isset( $utfCombiningClass[$sequence] ) ) { + # If it's NO or MAYBE, we'll have to do the slow check. + $looksNormal = false; + } + + # The sequence is legal! + $out .= $sequence; + $tail = false; + $head = 0; + continue; } - - if( isset( $utfCheckNFC[$sequence] ) || - isset( $utfCombiningClass[$sequence] ) ) { - # If it's NO or MAYBE, we'll have to do the slow check. - $looksNormal = false; - } - - # The sequence is legal! - $out .= $sequence; + # Not a valid tail byte! DIscard the char we've been building. + #printf ("Invalid '%x' in tail with %d remaining bytes\n", $n, $remaining ); $tail = false; - $head = 0; - continue; + $out .= UTF8_REPLACEMENT; } - # Not a valid tail byte! DIscard the char we've been building. - #printf ("Invalid '%x' in tail with %d remaining bytes\n", $n, $remaining ); - $tail = false; - $out .= UTF8_REPLACEMENT; - } - if( $n < 0x80 ) { - # Friendly ASCII chars. - # We can speed things up a bit for latin-based scripts - # where they tend to come in groups: - $out .= $c; - $i++; - while( $i < $len && ( $c = $string{$i} ) < "\x80" ) { + if( $n < 0x80 ) { $out .= $c; - $i++; - } - $i--; - } elseif( $n < 0xc0 ) { - # illegal tail bytes or head byte of overlong sequence - if( $head == 0 ) { - # Don't add if we're continuing a too-long sequence + } elseif( $n < 0xc0 ) { + # illegal tail bytes or head byte of overlong sequence + if( $head == 0 ) { + # Don't add if we're continuing a too-long sequence + $out .= UTF8_REPLACEMENT; + } + } elseif( $n < 0xfe ) { + $tail = true; + $remaining = $tailBytes[$n]; + $sequence = $c; + $head = $n; + } else { $out .= UTF8_REPLACEMENT; } - } elseif( $n < 0xfe ) { - $tail = true; - $remaining = $tailBytes[$n]; - $sequence = $c; - $head = $n; - } else { + } + if( $tail ) { $out .= UTF8_REPLACEMENT; } } - if( $tail ) { - $out .= UTF8_REPLACEMENT; - } $string = $out; return $looksNormal; } -- 2.20.1