Add UtfNormal::cleanUp() function: strips XML-unsafe characters and illegal UTF-8...
authorBrion Vibber <brion@users.mediawiki.org>
Fri, 3 Sep 2004 05:39:30 +0000 (05:39 +0000)
committerBrion Vibber <brion@users.mediawiki.org>
Fri, 3 Sep 2004 05:39:30 +0000 (05:39 +0000)
includes/normal/Makefile
includes/normal/README
includes/normal/Utf8Test.php [new file with mode: 0644]
includes/normal/UtfNormal.php
includes/normal/UtfNormalBench.php
includes/normal/UtfNormalTest.php

index dbb1c70..0443560 100644 (file)
@@ -9,9 +9,12 @@ all : UtfNormalData.inc
 UtfNormalData.inc : UtfNormalGenerate.php UtfNormalUtil.php UnicodeData.txt CompositionExclusions.txt NormalizationCorrections.txt DerivedNormalizationProps.txt
        $(PHP) UtfNormalGenerate.php
 
-test : UtfNormalTest.php UtfNormalData.inc NormalizationTest.txt
+test : testutf8 UtfNormalTest.php UtfNormalData.inc NormalizationTest.txt
        $(PHP) UtfNormalTest.php
 
+testutf8 : Utf8Test.php UTF-8-test.txt
+       $(PHP) Utf8Test.php
+
 bench : UtfNormalData.inc
        $(PHP) UtfNormalBench.php
 
@@ -37,3 +40,5 @@ DerivedNormalizationProps.txt :
 UnicodeData.txt :
        $(FETCH) $(BASE)/UnicodeData.txt
 
+UTF-8-test.txt :
+       $(FETCH) http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
index 26248b4..3424507 100644 (file)
@@ -1,10 +1,16 @@
-This directory contains some Unicode normalization routines.
+This directory contains some Unicode normalization routines. These routines
+are meant to be reusable in other projects, so I'm not tying them to the
+MediaWiki utility functions.
 
 The main function to care about is UtfNormal::toNFC(); this will convert
 a given UTF-8 string to Normalization Form C if it's not already such.
 The function assumes that the input string is already valid UTF-8; if there
 are corrupt characters this may produce erroneous results.
 
+To also check for illegal characters, use UtfNormal::cleanUp(). This will
+strip illegal UTF-8 sequences and characters that are illegal in XML, and
+if necessary convert to normalization form C.
+
 Performance is kind of stinky in absolute terms, though it should be speedy
 on pure ASCII text. ;) On text that can be determined quickly to already be
 in NFC it's not too awful but it can quickly get uncomfortably slow,
@@ -14,10 +20,10 @@ extra slow).
 
 == Regenerating data tables ==
 
-UtfNormalData.inc is generated from the Unicode Character Database by
-the script UtfNormalGenerate.php. On a *nix system 'make' should fetch the
-necessary files and regenerate it if the scripts have been changed or you
-remove it.
+UtfNormalData.inc and UtfNormalDataK.inc are generated from the Unicode
+Character Database by the script UtfNormalGenerate.php. On a *nix system
+'make' should fetch the necessary files and regenerate them if the scripts
+have been changed or you remove them.
 
 
 == Testing ==
diff --git a/includes/normal/Utf8Test.php b/includes/normal/Utf8Test.php
new file mode 100644 (file)
index 0000000..c3ab249
--- /dev/null
@@ -0,0 +1,145 @@
+<?php
+# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
+# http://www.mediawiki.org/
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or 
+# (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+# http://www.gnu.org/copyleft/gpl.html
+
+# Runs the UTF-8 decoder test at:
+# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
+
+require_once 'UtfNormalUtil.php';
+require_once 'UtfNormal.php';
+mb_internal_encoding( "utf-8" );
+
+#$verbose = true;
+if( php_sapi_name() != 'cli' ) {
+       die( "Run me from the command line please.\n" );
+}
+
+# Open the decoder stress-test file; without it there is nothing to run.
+$in = fopen( "UTF-8-test.txt", "rt" );
+if( !$in ) {
+       print "Couldn't open UTF-8-test.txt -- can't run tests.\n";
+       print "If necessary, manually download this file. It can be obtained at\n";
+       print "http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt\n";
+       exit(-1);
+}
+
+# Skip the preamble. The "Here come the tests:" line ends in a '|' marker;
+# its byte offset is the column every test line is later measured against.
+$columns = 0;
+while( false !== ( $line = fgets( $in ) ) ) {
+       if( preg_match( '/^(Here come the tests:\s*)\|$/', $line, $matches ) ) {
+               $columns = strpos( $line, '|' );
+               break;
+       }
+}
+
+# A zero column means the marker line was never found.
+if( !$columns ) {
+       print "Something seems to be wrong; couldn't extract line length.\n";
+       print "Check that UTF-8-test.txt was downloaded correctly from\n";
+       print "http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt\n";
+       exit(-1);
+}
+
+# print "$columns\n";
+
+# Test numbers that are skipped entirely.
+$ignore = array(
+       # These two lines actually seem to be corrupt
+       '2.1.1', '2.2.1' );
+
+# Test numbers whose expected outcome is inverted in testLine().
+$exceptions = array(
+       # Tests that should mark invalid characters due to using long
+       # sequences beyond what is now considered legal.
+       '2.1.5', '2.1.6', '2.2.4', '2.2.5', '2.2.6', '2.3.5',
+       
+       # Literal 0xffff, which is illegal
+       '2.2.3' );
+
+$longTests = array(
+       # These tests span multiple lines
+       '3.1.9', '3.2.1', '3.2.2', '3.2.3', '3.2.4', '3.2.5',
+       '3.4' );
+
+# These tests are not in proper subsections
+$sectionTests = array( '3.4' );
+
+$section = NULL;
+$test = '';
+$failed = 0;
+$success = 0;
+$total = 0;
+while( false !== ( $line = fgets( $in ) ) ) {
+       # Section heading ("N  title  |"): echo it and move on.
+       if( preg_match( '/^(\d+)\s+(.*?)\s*\|/', $line, $matches ) ) {
+               $section = $matches[1];
+               print $line;
+               continue;
+       }
+       # Numbered test line ("N.N.N ...").
+       if( preg_match( '/^(\d+\.\d+\.\d+)\s*/', $line, $matches ) ) {
+               $test = $matches[1];
+
+               if( in_array( $test, $ignore ) ) {
+                       continue;
+               }
+               if( in_array( $test, $longTests ) ) {
+                       # Discard one line, then test every following line up to the
+                       # blank "   |" separator. Stop at end of file as well, so a
+                       # truncated test file cannot make this loop spin forever.
+                       $line = fgets( $in );
+                       for( $line = fgets( $in ); false !== $line && !preg_match( '/^\s+\|/', $line ); $line = fgets( $in ) ) {
+                               testLine( $test, $line, $total, $success, $failed );
+                       }
+               } else {
+                       testLine( $test, $line, $total, $success, $failed );
+               }
+       }
+}
+
+# Report the outcome through the exit status as well as the message.
+if( $failed ) {
+       echo "\nFailed $failed tests.\n";
+       echo "UTF-8 DECODER TEST FAILED\n";
+       exit (-1);
+}
+
+echo "UTF-8 DECODER TEST SUCCESS!\n";
+exit (0);
+
+
+# Run one line of the test file through UtfNormal::quickIsNFCVerify() and
+# record the verdict in the by-reference counters $total, $success, $failed.
+#
+# A line passes when its text before the '|' marker still measures $columns
+# characters after cleaning, and when it was altered exactly when expected:
+# tests numbered 3 and up are expected to be altered, lower ones are not,
+# and tests listed in $exceptions flip that expectation.
+function testLine( $test, $line, &$total, &$success, &$failed ) {
+       global $columns, $exceptions, $verbose;
+
+       # The verifier cleans its argument in place, so work on a copy.
+       $stripped = $line;
+       UtfNormal::quickisNFCVerify( $stripped );
+
+       # Measure the part before the marker; fall back to a byte count when
+       # the multibyte length comes back as zero.
+       $prefix = substr( $stripped, 0, strpos( $stripped, '|' ) );
+       $len = mb_strlen( $prefix );
+       if( $len == 0 ) {
+               $len = strlen( $prefix );
+       }
+
+       $unchanged = ( $line == $stripped );
+       $expectChange = ( $test >= 3 );
+       if( in_array( $test, $exceptions ) ) {
+               $expectChange = !$expectChange;
+       }
+       $ok = ( $unchanged != $expectChange ) && ( $columns == $len );
+
+       $total++;
+       if( $ok ) {
+               $success++;
+       } else {
+               $failed++;
+       }
+
+       # Show the cleaned line, with its measured length appended, for every
+       # failure (or for every line in verbose mode).
+       if( $verbose || !$ok ) {
+               print str_replace( "\n", "$len\n", $stripped );
+       }
+}
+
+?>
\ No newline at end of file
index 6d30ea3..3ccae67 100644 (file)
@@ -51,6 +51,12 @@ define( 'UNICODE_HANGUL_LEND', UNICODE_HANGUL_LBASE + UNICODE_HANGUL_LCOUNT - 1
 define( 'UNICODE_HANGUL_VEND', UNICODE_HANGUL_VBASE + UNICODE_HANGUL_VCOUNT - 1 );
 define( 'UNICODE_HANGUL_TEND', UNICODE_HANGUL_TBASE + UNICODE_HANGUL_TCOUNT - 1 );
 
+define( 'UNICODE_SURROGATE_FIRST', 0xd800 );
+define( 'UNICODE_SURROGATE_LAST', 0xdfff );
+define( 'UNICODE_MAX', 0x10ffff );
+define( 'UNICODE_REPLACEMENT', 0xfffd );
+
+
 define( 'UTF8_HANGUL_FIRST', codepointToUtf8( UNICODE_HANGUL_FIRST ) );
 define( 'UTF8_HANGUL_LAST', codepointToUtf8( UNICODE_HANGUL_LAST ) );
 
@@ -62,7 +68,37 @@ define( 'UTF8_HANGUL_LEND', codepointToUtf8( UNICODE_HANGUL_LEND ) );
 define( 'UTF8_HANGUL_VEND', codepointToUtf8( UNICODE_HANGUL_VEND ) );
 define( 'UTF8_HANGUL_TEND', codepointToUtf8( UNICODE_HANGUL_TEND ) );
 
+define( 'UTF8_SURROGATE_FIRST', codepointToUtf8( UNICODE_SURROGATE_FIRST ) );
+define( 'UTF8_SURROGATE_LAST', codepointToUtf8( UNICODE_SURROGATE_LAST ) );
+define( 'UTF8_MAX', codepointToUtf8( UNICODE_MAX ) );
+define( 'UTF8_REPLACEMENT', codepointToUtf8( UNICODE_REPLACEMENT ) );
+#define( 'UTF8_REPLACEMENT', '!' );
+
+define( 'UTF8_OVERLONG_A', "\xc1\xbf" );
+define( 'UTF8_OVERLONG_B', "\xe0\x9f\xbf" );
+define( 'UTF8_OVERLONG_C', "\xf0\x8f\xbf\xbf" );
+
+# These two ranges are illegal
+define( 'UTF8_FDD0', codepointToUtf8( 0xfdd0 ) );
+define( 'UTF8_FDEF', codepointToUtf8( 0xfdef ) );
+define( 'UTF8_FFFE', codepointToUtf8( 0xfffe ) );
+define( 'UTF8_FFFF', codepointToUtf8( 0xffff ) );
+
+define( 'UTF8_HEAD', false );
+define( 'UTF8_TAIL', true );
+
+
 class UtfNormal {
+       # One-stop entry point: scrub the string with quickIsNFCVerify(), and
+       # run the full NFC conversion only when that quick check could not
+       # confirm the scrubbed text is already normalized.
+       # Returns the cleaned (and, if needed, normalized) string.
+       function cleanUp( $string ) {
+               # quickIsNFCVerify() rewrites $string in place.
+               $alreadyNormal = UtfNormal::quickIsNFCVerify( $string );
+               if( !$alreadyNormal ) {
+                       $string = UtfNormal::NFC( $string );
+               }
+               return $string;
+       }
+
        # These functions try to skip the conversion if it won't be necessary.
        # An all ASCII string for instance doesn't need conversion.
        function toNFC( $string ) {
@@ -93,10 +129,12 @@ class UtfNormal {
                        return $string;
        }
        
-       
+       # Returns true if the string is _definitely_ in NFC.
+       # Returns false if not or uncertain.
        function quickIsNFC( $string ) {
                # ASCII is always valid NFC!
-               if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
+               # If it's pure ASCII and doesn't contain any XML-forbidden chars, let it through.
+               if( !preg_match( '/[\x00-\x08\x0b\x0c\x0f-\x1f\x80-\xff]/', $string ) ) return true;
                
                global $utfCheckNFC, $utfCombiningClass;
                $len = strlen( $string );
@@ -126,6 +164,105 @@ class UtfNormal {
                }
                return true;
        }
+
+       # As quickIsNFC(), but also *alters the string*: invalid UTF-8 sequences
+       # and disallowed control characters are replaced with UTF8_REPLACEMENT,
+       # and carriage returns are removed.
+       #
+       # $string is taken by reference and overwritten with the cleaned text.
+       # Returns false if any complete multibyte sequence is listed in
+       # $utfCheckNFC or $utfCombiningClass (the slow NFC check is then needed);
+       # returns true otherwise.
+       function quickIsNFCVerify( &$string ) {
+               # Fast path: plain ASCII with nothing to strip is always valid NFC.
+               # Control characters must be tested here too, or ASCII-only input
+               # would bypass the clean-up below.
+               if( !preg_match( '/[\x00-\x08\x0b-\x1f\x80-\xff]/', $string ) ) return true;
+               
+               global $utfCheckNFC, $utfCombiningClass;
+               $len = strlen( $string );
+               $out = '';
+               $state = UTF8_HEAD;
+               $looksNormal = true;
+               
+               $head = 0;
+               $remaining = 0;
+               $sequence = '';
+               for( $i = 0; $i < $len; $i++ ) {
+                       $c = $string[$i];
+                       $n = ord( $c );
+                       if( $state == UTF8_TAIL ) {
+                               if( $n >= 0x80 && $n < 0xc0 ) {
+                                       $sequence .= $c;
+                                       if( --$remaining == 0 ) {
+                                               # Sequence complete: reject surrogates, overlong
+                                               # forms, the illegal ranges and anything past
+                                               # UTF8_MAX.
+                                               if( ($sequence >= UTF8_SURROGATE_FIRST
+                                                               && $sequence <= UTF8_SURROGATE_LAST)
+                                                       || ($head == 0xc0 && $sequence <= UTF8_OVERLONG_A)
+                                                       || ($head == 0xc1 && $sequence <= UTF8_OVERLONG_A)
+                                                       || ($head == 0xe0 && $sequence <= UTF8_OVERLONG_B)
+                                                       || ($head == 0xf0 && $sequence <= UTF8_OVERLONG_C)
+                                                       || ($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF)
+                                                       || ($sequence == UTF8_FFFE)
+                                                       || ($sequence == UTF8_FFFF)
+                                                       || ($sequence > UTF8_MAX) ) {
+                                                       # NOTE(review): $head is left set here, which
+                                                       # makes the stray-tail branch below drop bytes
+                                                       # silently -- confirm this is intended.
+                                                       $out .= UTF8_REPLACEMENT;
+                                                       $state = UTF8_HEAD;
+                                                       continue;
+                                               }
+                                               if( isset( $utfCheckNFC[$sequence] ) ||
+                                                       isset( $utfCombiningClass[$sequence] ) ) {
+                                                       # If it's NO or MAYBE, we'll have to do the slow check.
+                                                       $looksNormal = false;
+                                               }
+                                               $out .= $sequence;
+                                               $state = UTF8_HEAD;
+                                               $head = 0;
+                                       }
+                                       continue;
+                               }
+                               # Not a valid tail byte! Discard the char we've been building,
+                               # then fall through to handle this byte as a fresh head.
+                               #printf ("Invalid '%x' in tail with %d remaining bytes\n", $n, $remaining );
+                               $state = UTF8_HEAD;
+                               $out .= UTF8_REPLACEMENT;
+                       }
+                       if( $n < 0x09 ) {
+                               $out .= UTF8_REPLACEMENT;
+                       } elseif( $n < 0x0b ) {
+                               # Tab and line feed pass through
+                               $out .= $c;
+                       } elseif( $n == 0x0d ) {
+                               # Strip \r
+                       } elseif( $n < 0x20 ) {
+                               $out .= UTF8_REPLACEMENT;
+                       } elseif( $n < 0x80 ) {
+                               $out .= $c;
+                       } elseif( $n < 0xc0 ) {
+                               # illegal tail bytes or head byte of overlong sequence
+                               if( $head == 0 ) $out .= UTF8_REPLACEMENT;
+                       } elseif( $n < 0xe0 ) {
+                               $state = UTF8_TAIL;
+                               $remaining = 1;
+                               $sequence = $c;
+                               $head = $n;
+                       } elseif( $n < 0xf0 ) {
+                               $state = UTF8_TAIL;
+                               $remaining = 2;
+                               $sequence = $c;
+                               $head = $n;
+                       } elseif( $n < 0xf8 ) {
+                               $state = UTF8_TAIL;
+                               $remaining = 3;
+                               $sequence = $c;
+                               $head = $n;
+                       } elseif( $n < 0xfc ) {
+                               $state = UTF8_TAIL;
+                               $remaining = 4;
+                               $sequence = $c;
+                               $head = $n;
+                       } elseif( $n < 0xfe ) {
+                               $state = UTF8_TAIL;
+                               $remaining = 5;
+                               $sequence = $c;
+                               $head = $n;
+                       } else {
+                               # 0xfe and 0xff are not accepted as head bytes
+                               $out .= UTF8_REPLACEMENT;
+                       }
+               }
+               # Input ended in the middle of a multibyte sequence.
+               if( $state == UTF8_TAIL ) {
+                       $out .= UTF8_REPLACEMENT;
+               }
+               $string = $out;
+               return $looksNormal;
+       }
        
        # These take a string and run the normalization on them, without
        # checking for validity or any optimization etc. Input must be
index 44cc932..dcb83cf 100644 (file)
@@ -20,6 +20,8 @@
 require_once 'UtfNormalUtil.php';
 require_once 'UtfNormal.php';
 
+define( 'BENCH_CYCLES', 3 );
+
 if( php_sapi_name() != 'cli' ) {
        die( "Run me from the command line please.\n" );
 }
@@ -41,9 +43,16 @@ function benchmarkTest( &$u, $filename, $desc ) {
        print "Testing $filename ($desc)...\n";
        $data = file_get_contents( $filename );
        $forms = array( 'placebo',
-               'fastDecompose', 'fastCombiningSort', 'fastCompose',
-               'toNFD', 'toNFKD', 'toNFC', 'toNFKC',
-               'NFD', 'NFKD', 'NFC', 'NFKC' );
+               'cleanUp',
+               'toNFC',
+#              'toNFKC',
+#              'toNFD', 'toNFKD',
+               'NFC',
+#              'NFKC',
+#              'NFD', 'NFKD',
+#              'fastDecompose', 'fastCombiningSort', 'fastCompose',
+               'quickIsNFC', 'quickIsNFCVerify',
+               );
        foreach( $forms as $form ) {
                benchmarkForm( $u, $data, $form );
        }
@@ -57,11 +66,14 @@ function benchTime(){
 function benchmarkForm( &$u, &$data, $form ) {
        global $utfCanonicalDecomp;
        $start = benchTime();
-       $out = $u->$form( $data, $utfCanonicalDecomp );
-       $delta = benchTime() - $start;
+       for( $i = 0; $i < BENCH_CYCLES; $i++ ) {
+               $out = $u->$form( $data, $utfCanonicalDecomp );
+       }
+       $delta = (benchTime() - $start) / BENCH_CYCLES;
+       $rate = IntVal( strlen( $data ) / $delta );
        $same = (0 == strcmp( $data, $out ) );
        
-       printf( " %4s %1.4fs (%s)\n", $form, $delta, ($same ? 'no change' : 'changed' ) );
+       printf( " %20s %1.4fs %8d bytes/s (%s)\n", $form, $delta, $rate, ($same ? 'no change' : 'changed' ) );
 }
 
 ?>
\ No newline at end of file
index b585c4e..5a37edf 100644 (file)
@@ -99,6 +99,10 @@ while( false !== ($line = fgets( $in ) ) ) {
        $cols = explode( ';', $line );
        $char = codepointToUtf8( hexdec( $cols[0] ) );
        $desc = $cols[0] . ": " . $cols[1];
+       if( $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST ) {
+               # Surrogates are illegal on their own or in UTF-8, ignore.
+               continue;
+       }
        if( empty( $testedChars[$char] ) ) {
                $total++;
                if( testInvariant( $normalizer, $char, $desc ) ) {