cleanUp() optimization: speed up Japanese, Korean tests by another 15% by rearranging...
author: Brion Vibber <brion@users.mediawiki.org>
Sun, 7 Nov 2004 11:28:00 +0000 (11:28 +0000)
committer: Brion Vibber <brion@users.mediawiki.org>
Sun, 7 Nov 2004 11:28:00 +0000 (11:28 +0000)
Removed restrictions on U+FDD0 and friends; these do seem to be allowed by XML, though they 'recommend' you avoid them.

includes/normal/CleanUpTest.php
includes/normal/UtfNormal.php

index d165731..64f7b63 100644 (file)
@@ -53,8 +53,7 @@ class CleanUpTest extends PHPUnit_TestCase {
                            $i == 0x000a ||
                            $i == 0x000d ||
                            ($i > 0x001f && $i < UNICODE_SURROGATE_FIRST) ||
-                           ($i > UNICODE_SURROGATE_LAST && $i < 0xfdd0 ) ||
-                           ($i > 0xfdef && $i < 0xfffe ) ||
+                           ($i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
                            ($i > 0xffff && $i <= UNICODE_MAX ) ) {
                                if( isset( $utfCanonicalComp[$char] ) || isset( $utfCanonicalDecomp[$char] ) ) {
                                    $comp = UtfNormal::NFC( $char );
@@ -94,11 +93,14 @@ class CleanUpTest extends PHPUnit_TestCase {
                                        bin2hex( $char ), 
                                        bin2hex( $clean ),
                                        "ASCII byte $x should be intact" );
+                               if( $char != $clean ) return;
                        } else {
+                               $norm = $head . UTF8_REPLACEMENT . $tail;
                                $this->assertEquals(
-                                       bin2hex( $head . UTF8_REPLACEMENT . $tail ),
+                                       bin2hex( $norm ),
                                        bin2hex( $clean ),
                                        "Forbidden byte $x should be rejected" );
+                               if( $norm != $clean ) return;
                        }
                }
        }
@@ -119,21 +121,27 @@ class CleanUpTest extends PHPUnit_TestCase {
                                if( $first > 0xc1 &&
                                    $first < 0xe0 &&
                                    $second < 0xc0 ) {
+                                   $norm = UtfNormal::NFC( $char );
                                        $this->assertEquals(
-                                               bin2hex( UtfNormal::NFC( $char ) ), 
+                                               bin2hex( $norm ), 
                                                bin2hex( $clean ),
                                                "Pair $x should be intact" );
+                                   if( $norm != $clean ) return;
                                } elseif( $first > 0xfd || $second > 0xbf ) {
                                        # fe and ff are not legal head bytes -- expect two replacement chars
+                                       $norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
                                        $this->assertEquals(
-                                               bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
+                                               bin2hex( $norm ),
                                                bin2hex( $clean ),
                                                "Forbidden pair $x should be rejected" );
+                                       if( $norm != $clean ) return;
                                } else {
+                                       $norm = $head . UTF8_REPLACEMENT . $tail;
                                        $this->assertEquals(
-                                               bin2hex( $head . UTF8_REPLACEMENT . $tail ),
+                                               bin2hex( $norm ),
                                                bin2hex( $clean ),
                                                "Forbidden pair $x should be rejected" );
+                                       if( $norm != $clean ) return;
                                }
                        }
                }
@@ -141,9 +149,9 @@ class CleanUpTest extends PHPUnit_TestCase {
 
        function testTripleBytes() {
                $this->doTestTripleBytes( '', '' );
-               #$this->doTestTripleBytes( 'x', '' );
-               #$this->doTestTripleBytes( '', 'x' );
-               #$this->doTestTripleBytes( 'x', 'x' );
+               $this->doTestTripleBytes( 'x', '' );
+               $this->doTestTripleBytes( '', 'x' );
+               $this->doTestTripleBytes( 'x', 'x' );
        }
        
        function doTestTripleBytes( $head, $tail ) {
@@ -160,13 +168,13 @@ class CleanUpTest extends PHPUnit_TestCase {
                                                $third < 0xc0 ) {
                                                if( $first == 0xe0 && $second < 0xa0 ) {
                                                        $this->assertEquals(
-                                                               bin2hex( UTF8_REPLACEMENT ), 
+                                                               bin2hex( $head . UTF8_REPLACEMENT . $tail ), 
                                                                bin2hex( $clean ),
                                                                "Overlong triplet $x should be rejected" );
                                                } elseif( $first == 0xed && 
                                                        ( chr( $first ) . chr( $second ) . chr( $third ))  >= UTF8_SURROGATE_FIRST ) {
                                                        $this->assertEquals(
-                                                               bin2hex( UTF8_REPLACEMENT ), 
+                                                               bin2hex( $head . UTF8_REPLACEMENT . $tail ), 
                                                                bin2hex( $clean ),
                                                                "Surrogate triplet $x should be rejected" );
                                                } else {
@@ -177,12 +185,12 @@ class CleanUpTest extends PHPUnit_TestCase {
                                                }
                                        } elseif( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
                                                $this->assertEquals(
-                                                       bin2hex( $head . UtfNormal::NFC( chr( $first ) . chr( $second ) ) . UTF8_REPLACEMENT . $tail ),
+                                                       bin2hex( UtfNormal::NFC( $head . chr( $first ) . chr( $second ) ) . UTF8_REPLACEMENT . $tail ),
                                                        bin2hex( $clean ),
                                                        "Valid 2-byte $x + broken tail" );
                                        } elseif( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
                                                $this->assertEquals(
-                                                       bin2hex( $head . UTF8_REPLACEMENT . UtfNormal::NFC( chr( $second ) . chr( $third ) ) . $tail ),
+                                                       bin2hex( $head . UTF8_REPLACEMENT . UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail ) ),
                                                        bin2hex( $clean ),
                                                        "Broken head + valid 2-byte $x" );
                                        } elseif( $first > 0xfd && ( ( $second > 0xbf && $third > 0xbf ) || ($second < 0xc0 && $third < 0xc0 ) || ($second > 0xfd ) || ($third > 0xfd) ) ) {
index b359290..2883342 100644 (file)
@@ -265,11 +265,12 @@ class UtfNormal {
                # of initializing the decomposition tables by skipping out early.
                if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
                
-               UtfNormal::loadData();
-               global $utfCheckNFC, $utfCombiningClass;
-               
                static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
                if( !isset( $checkit ) ) {
+                       # Load/build some scary lookup tables...
+                       UtfNormal::loadData();
+                       global $utfCheckNFC, $utfCombiningClass;
+                       
                        $utfCheckOrCombining = array_merge( $utfCheckNFC, $utfCombiningClass );
 
                        # Head bytes for sequences which we should do further validity checks
@@ -278,6 +279,8 @@ class UtfNormal {
                                                   0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
                                                   0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );
                        
+                       # Each UTF-8 head byte is followed by a certain
+                       # number of tail bytes.
                        $tailBytes = array();
                        for( $n = 0; $n < 256; $n++ ) {
                                if( $n < 0xc0 ) {
@@ -307,13 +310,16 @@ class UtfNormal {
                        '/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
                        $string, $matches );
                
-               ob_start();
                $looksNormal = true;
+               $base = 0;
+               $replace = array();
                foreach( $matches[1] as $str ) {
+                       $chunk = strlen( $str );
+                       
                        if( $str{0} < "\x80" ) {
                                # ASCII chunk: guaranteed to be valid UTF-8
-                               # and in normal form C, so output it quick.
-                               echo $str;
+                               # and in normal form C, so skip over it.
+                               $base += $chunk;
                                continue;
                        }
                        
@@ -324,87 +330,136 @@ class UtfNormal {
                        # Since PHP is not the fastest language on earth, some of
                        # this code is a little ugly with inner loop optimizations.
                        
-                       $len = strlen( $str ) + 1;
-                       $tail = 0;
                        $head = '';
-                       for( $i = 0; --$len; ++$i ) {
-                               if( $tail ) {
-                                       if( ( $c = $str{$i} ) >= "\x80" && $c < "\xc0" ) {
-                                               $sequence .= $c;
-                                               if( --$remaining ) {
-                                                       # Keep adding bytes...
-                                                       continue;
-                                               }
-                                               
-                                               # We have come to the end of the sequence...
-                                               $tail = 0;
-                                               
-                                               if( isset( $checkit[$head] ) ) {
-                                                       # Do some more detailed validity checks, for
-                                                       # invalid characters and illegal sequences.
-                                                       if( $head == "\xed" ) {
-                                                               # 0xed is relatively frequent in Korean, which
-                                                               # abuts the surrogate area, so we're doing
-                                                               # this check separately.
-                                                               if( $sequence >= UTF8_SURROGATE_FIRST ) {
-                                                                       echo UTF8_REPLACEMENT;
-                                                                       continue;
-                                                               }
+                       $len = $chunk + 1; # Counting down is faster. I'm *so* sorry.
+                       
+                       for( $i = -1; --$len; ) {
+                               if( $remaining = $tailBytes[$c = $str{++$i}] ) {
+                                       # UTF-8 head byte!
+                                       $sequence = $head = $c;
+                                       do {
+                                               # Look for the defined number of tail bytes...
+                                               if( --$len && ( $c = $str{++$i} ) >= "\x80" && $c < "\xc0" ) {
+                                                       # Legal tail bytes are nice.
+                                                       $sequence .= $c;
+                                               } else {
+                                                       if( 0 == $len ) {
+                                                               # Premature end of string!
+                                                               # Drop a replacement character into output to
+                                                               # represent the invalid UTF-8 sequence.
+                                                               $replace[] = array( UTF8_REPLACEMENT,
+                                                                                                       $base + $i + 1 - strlen( $sequence ),
+                                                                                                       strlen( $sequence ) );
+                                                               $base += $chunk;
+                                                               break 2;
                                                        } else {
-                                                               $n = ord( $head );
-                                                               if(    ($n  < 0xc2 && $sequence <= UTF8_OVERLONG_A)
-                                                                       || ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B)
-                                                                       || ($n == 0xef && 
-                                                                               ($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF)
-                                                                               || ($sequence == UTF8_FFFE)
-                                                                               || ($sequence == UTF8_FFFF) )
-                                                                       || ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C)
-                                                                       || ($n >= 0xf0 && $sequence > UTF8_MAX) ) {
-                                                                       echo UTF8_REPLACEMENT;
-                                                                       continue;
-                                                               }
+                                                               # Illegal tail byte; abandon the sequence.
+                                                               $replace[] = array( UTF8_REPLACEMENT,
+                                                                                                       $base + $i - strlen( $sequence ),
+                                                                                                       strlen( $sequence ) );
+                                                               # Back up and reprocess this byte; it may itself
+                                                               # be a legal ASCII or UTF-8 sequence head.
+                                                               --$i;
+                                                               ++$len;
+                                                               continue 2;
                                                        }
                                                }
-                                               
-                                               if( isset( $utfCheckOrCombining[$sequence] ) ) {
-                                                       # If it's NO or MAYBE, we'll have to rip
-                                                       # the string apart and put it back together.
-                                                       # That's going to be mighty slow.
-                                                       $looksNormal = false;
+                                       } while( --$remaining );
+
+                                       if( isset( $checkit[$head] ) ) {
+                                               # Do some more detailed validity checks, for
+                                               # invalid characters and illegal sequences.
+                                               if( $head == "\xed" ) {
+                                                       # 0xed is relatively frequent in Korean, which
+                                                       # abuts the surrogate area, so we're doing
+                                                       # this check separately to speed things up.
+                                                       
+                                                       if( $sequence >= UTF8_SURROGATE_FIRST ) {
+                                                               # Surrogates are legal only in UTF-16 code.
+                                                               # They are totally forbidden here in UTF-8
+                                                               # utopia.
+                                                               $replace[] = array( UTF8_REPLACEMENT,
+                                                                            $base + $i + 1 - strlen( $sequence ),
+                                                                            strlen( $sequence ) );
+                                                               continue;
+                                                       }
+                                               } else {
+                                                       # Slower, but rarer checks...
+                                                       $n = ord( $head );
+                                                       if(
+                                                               # "Overlong sequences" are those that are syntactically
+                                                               # correct but use more UTF-8 bytes than are necessary to
+                                                               # encode a character. Naïve string comparisons can be
+                                                               # tricked into failing to see a match for an ASCII
+                                                               # character, for instance, which can be a security hole
+                                                               # if blacklist checks are being used.
+                                                              ($n  < 0xc2 && $sequence <= UTF8_OVERLONG_A)
+                                                               || ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B)
+                                                               || ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C)
+                                                               
+                                                               # U+FFFE and U+FFFF are explicitly forbidden in Unicode.
+                                                               || ($n == 0xef && 
+                                                                          ($sequence == UTF8_FFFE)
+                                                                       || ($sequence == UTF8_FFFF) )
+                                                               
+                                                               # Unicode has been limited to 21 bits; longer
+                                                               # sequences are not allowed.
+                                                               || ($n >= 0xf0 && $sequence > UTF8_MAX) ) {
+                                                               
+                                                               $replace[] = array( UTF8_REPLACEMENT,
+                                                                                   $base + $i + 1 - strlen( $sequence ), 
+                                                                                   strlen( $sequence ) );
+                                                               continue;
+                                                       }
                                                }
-                                               
-                                               # The sequence is legal!
-                                               echo $sequence;
-                                               $head = '';
-                                               continue;
                                        }
-                                       # Not a valid tail byte! DIscard the char we've been building.
-                                       $tail = false;
-                                       echo UTF8_REPLACEMENT;
-                               }
-                               if( $remaining = $tailBytes[$c = $str{$i}] ) {
-                                       $tail = 1;
-                                       $sequence = $head = $c;
+                                       
+                                       if( isset( $utfCheckOrCombining[$sequence] ) ) {
+                                               # If it's NO or MAYBE, we'll have to rip
+                                               # the string apart and put it back together.
+                                               # That's going to be mighty slow.
+                                               $looksNormal = false;
+                                       }
+                                       
+                                       # The sequence is legal!
+                                       $head = '';
                                } elseif( $c < "\x80" ) {
-                                       echo $c;
+                                       # ASCII byte.
                                } elseif( $c < "\xc0" ) {
-                                       # illegal tail bytes or head byte of overlong sequence
+                                       # Illegal tail bytes
                                        if( $head == '' ) {
-                                               # Don't add if we're continuing a too-long sequence
-                                               echo UTF8_REPLACEMENT;
+                                               # Out of the blue!
+                                               $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
+                                       } else {
+                                               # Don't add if we're continuing a broken sequence;
+                                               # we already put a replacement character when we looked
+                                               # at the broken sequence.
+                                               $replace[] = array( '', $base + $i, 1 );
                                        }
                                } else {
-                                       echo UTF8_REPLACEMENT;
+                                       # Miscellaneous freaks.
+                                       $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
+                               }
+                       }
+                       $base += $chunk;
+               }
+               if( count( $replace ) ) {
+                       # There were illegal UTF-8 sequences we need to fix up.
+                       $out = '';
+                       $last = 0;
+                       foreach( $replace as $rep ) {
+                               list( $replacement, $start, $length ) = $rep;
+                               if( $last < $start ) {
+                                       $out .= substr( $string, $last, $start - $last );
                                }
+                               $out .= $replacement;
+                               $last = $start + $length;
                        }
-                       if( $tail ) {
-                               # We ended the chunk in the middle of a sequence;
-                               # that's so not cool.
-                               echo UTF8_REPLACEMENT;
+                       if( $last < strlen( $string ) ) {
+                               $out .= substr( $string, $last );
                        }
+                       $string = $out;
                }
-               $string = ob_get_contents();
-               ob_end_clean();
                return $looksNormal;
        }