From 5f530ba1f35519861cd710ca472b4b822f291d35 Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Thu, 4 Nov 2004 11:44:45 +0000 Subject: [PATCH] Optimize inner loop in cleanUp(): boosts performance on non-ASCII text by about 20%. Also, trim the XML-illegal control characters from pure ASCII as well as non-ASCII strings. --- includes/normal/UtfNormal.php | 141 ++++++++++++++++++---------------- 1 file changed, 73 insertions(+), 68 deletions(-) diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php index 5e31294a64..08bc17bc17 100644 --- a/includes/normal/UtfNormal.php +++ b/includes/normal/UtfNormal.php @@ -255,109 +255,114 @@ class UtfNormal { * Returns true if the string is _definitely_ in NFC. * Returns false if not or uncertain. * @param string $string a UTF-8 string, altered on output to be valid UTF-8 safe for XML. - * @return bool */ function quickIsNFCVerify( &$string ) { + # Screen out some characters that eg won't be allowed in XML + preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string ); + # ASCII is always valid NFC! if( !preg_match( '/[\x80-\xff]/', $string ) ) return true; UtfNormal::loadData(); global $utfCheckNFC, $utfCombiningClass; + + static $checkit = null, $tailBytes = null; + if( !isset( $checkit ) ) { + # Head bytes for sequences which we should do further validity checks + $checkit = array_flip( + array( 0xc0, 0xc1, 0xe0, 0xed, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ); + + $tailBytes = array(); + for( $n = 0xc0; $n < 0xfe; $n++ ) { + if( $n < 0xe0 ) { + $remaining = 1; + } elseif( $n < 0xf0 ) { + $remaining = 2; + } elseif( $n < 0xf8 ) { + $remaining = 3; + } elseif( $n < 0xfc ) { + $remaining = 4; + } elseif( $n < 0xfe ) { + $remaining = 5; + } + $tailBytes[$n] = $remaining; + } + } + $len = strlen( $string ); $out = ''; - $state = UTF8_HEAD; + $tail = false; $looksNormal = true; - - $rep = false; $head = 0; + for( $i = 0; $i < $len; $i++ ) { $c = $string{$i}; $n = ord( $c ); - if( $state == UTF8_TAIL ) { + if( $tail ) { if( $n >= 0x80 && $n < 0xc0 ) { $sequence .= $c; - if( --$remaining == 0 ) { - if( $head < 0xc2 || $head == 0xed || $head == 0xe0 || $head > 0xee ) { - if( ( $sequence >= UTF8_SURROGATE_FIRST - && $sequence <= UTF8_SURROGATE_LAST) - || ($head == 0xc0 && $sequence <= UTF8_OVERLONG_A) - || ($head == 0xc1 && $sequence <= UTF8_OVERLONG_A) - || ($head == 0xe0 && $sequence <= UTF8_OVERLONG_B) - || ($head == 0xef && - ($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF) - || ($sequence == UTF8_FFFE) - || ($sequence == UTF8_FFFF) ) - || ($head == 0xf0 && $sequence <= UTF8_OVERLONG_C) - || ($sequence > UTF8_MAX) ) { - $out .= UTF8_REPLACEMENT; - $state = UTF8_HEAD; - continue; - } - } - if( isset( $utfCheckNFC[$sequence] ) || - isset( $utfCombiningClass[$sequence] ) ) { - # If it's NO or MAYBE, we'll have to do the slow check. - $looksNormal = false; + if( --$remaining ) { + # Keep adding bytes... + continue; + } + + if( isset( $checkit[$head] ) ) { + # Do some more detailed validity checks, for + # invalid characters and illegal sequences. + if( ( $head == 0xed && $sequence >= UTF8_SURROGATE_FIRST + && $sequence <= UTF8_SURROGATE_LAST) + || ($head < 0xc2 && $sequence <= UTF8_OVERLONG_A) + || ($head == 0xe0 && $sequence <= UTF8_OVERLONG_B) + || ($head == 0xef && + ($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF) + || ($sequence == UTF8_FFFE) + || ($sequence == UTF8_FFFF) ) + || ($head == 0xf0 && $sequence <= UTF8_OVERLONG_C) + || ($head >= 0xf0 && $sequence > UTF8_MAX) ) { + $out .= UTF8_REPLACEMENT; + $tail = false; + continue; } - $out .= $sequence; - $state = UTF8_HEAD; - $head = 0; } + + if( isset( $utfCheckNFC[$sequence] ) || + isset( $utfCombiningClass[$sequence] ) ) { + # If it's NO or MAYBE, we'll have to do the slow check. + $looksNormal = false; + } + + # The sequence is legal! + $out .= $sequence; + $tail = false; + $head = 0; continue; } # Not a valid tail byte! DIscard the char we've been building. #printf ("Invalid '%x' in tail with %d remaining bytes\n", $n, $remaining ); - $state = UTF8_HEAD; + $tail = false; $out .= UTF8_REPLACEMENT; } - if( $n < 0x20 ) { - if( $n < 0x09 ) { - $out .= UTF8_REPLACEMENT; - } elseif( $n == 0x0a ) { - $out .= $c; - } elseif( $n < 0x0d ) { - $out .= UTF8_REPLACEMENT; - } elseif( $n == 0x0d ) { - # Strip \r silently - } else { - $out .= UTF8_REPLACEMENT; - } - } elseif( $n < 0x80 ) { + if( $n < 0x80 ) { # Friendly ASCII chars. $out .= $c; } elseif( $n < 0xc0 ) { # illegal tail bytes or head byte of overlong sequence - if( $head == 0 ) $out .= UTF8_REPLACEMENT; - } elseif( $n < 0xe0 ) { - $state = UTF8_TAIL; - $remaining = 1; - $sequence = $c; - $head = $n; - } elseif( $n < 0xf0 ) { - $state = UTF8_TAIL; - $remaining = 2; - $sequence = $c; - $head = $n; - } elseif( $n < 0xf8 ) { - $state = UTF8_TAIL; - $remaining = 3; - $sequence = $c; - $head = $n; - } elseif( $n < 0xfc ) { - $state = UTF8_TAIL; - $remaining = 4; - $sequence = $c; - $head = $n; + if( $head == 0 ) { + # Don't add if we're continuing a too-long sequence + $out .= UTF8_REPLACEMENT; + } } elseif( $n < 0xfe ) { - $state = UTF8_TAIL; - $remaining = 5; + $tail = true; + $remaining = $tailBytes[$n]; $sequence = $c; $head = $n; } else { $out .= UTF8_REPLACEMENT; } } - if( $state == UTF8_TAIL ) { + if( $tail ) { $out .= UTF8_REPLACEMENT; } $string = $out; -- 2.20.1