UtfNormalData.inc : UtfNormalGenerate.php UtfNormalUtil.php UnicodeData.txt CompositionExclusions.txt NormalizationCorrections.txt DerivedNormalizationProps.txt
$(PHP) UtfNormalGenerate.php
-test : UtfNormalTest.php UtfNormalData.inc NormalizationTest.txt
+test : testutf8 UtfNormalTest.php UtfNormalData.inc NormalizationTest.txt
$(PHP) UtfNormalTest.php
+testutf8 : Utf8Test.php UTF-8-test.txt
+ $(PHP) Utf8Test.php
+
bench : UtfNormalData.inc
$(PHP) UtfNormalBench.php
UnicodeData.txt :
$(FETCH) $(BASE)/UnicodeData.txt
+UTF-8-test.txt :
+ $(FETCH) http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
-This directory contains some Unicode normalization routines.
+This directory contains some Unicode normalization routines. These routines
+are meant to be reusable in other projects, so I'm not tying them to the
+MediaWiki utility functions.
The main function to care about is UtfNormal::toNFC(); this will convert
a given UTF-8 string to Normalization Form C if it's not already such.
The function assumes that the input string is already valid UTF-8; if there
are corrupt characters this may produce erroneous results.
+To also check for illegal characters, use UtfNormal::cleanUp(). This will
+strip illegal UTF-8 sequences and characters that are illegal in XML, and
+if necessary convert to normalization form C.
+
Performance is kind of stinky in absolute terms, though it should be speedy
on pure ASCII text. ;) On text that can be determined quickly to already be
in NFC it's not too awful but it can quickly get uncomfortably slow,
== Regenerating data tables ==
-UtfNormalData.inc is generated from the Unicode Character Database by
-the script UtfNormalGenerate.php. On a *nix system 'make' should fetch the
-necessary files and regenerate it if the scripts have been changed or you
-remove it.
+UtfNormalData.inc and UtfNormalDataK.inc are generated from the Unicode
+Character Database by the script UtfNormalGenerate.php. On a *nix system
+'make' should fetch the necessary files and regenerate them if the scripts
+have been changed or you remove them.
== Testing ==
--- /dev/null
+<?php
+# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
+# http://www.mediawiki.org/
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+# http://www.gnu.org/copyleft/gpl.html
+
+# Runs the UTF-8 decoder test at:
+# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
+
+require_once 'UtfNormalUtil.php';
+require_once 'UtfNormal.php';
+mb_internal_encoding( "utf-8" );
+
+#$verbose = true;
+if( php_sapi_name() != 'cli' ) {
+ die( "Run me from the command line please.\n" );
+}
+
+# Open the decoder stress-test file; without it there is nothing to run.
+$in = fopen( "UTF-8-test.txt", "rt" );
+if( !$in ) {
+    print "Couldn't open UTF-8-test.txt -- can't run tests.\n";
+    print "If necessary, manually download this file. It can be obtained at\n";
+    # Terminate the final line so the shell prompt is not glued to the URL.
+    print "http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt\n";
+    exit(-1);
+}
+
+# Scan forward to the "Here come the tests:" header. The byte offset of the
+# '|' that closes that line is the column every test line is expected to
+# reach; testLine() compares each cleaned line against it.
+$columns = 0;
+while( false !== ( $line = fgets( $in ) ) ) {
+    # \s* before the end anchor tolerates a stray "\r" when the file was
+    # saved with CRLF line endings.
+    if( preg_match( '/^(Here come the tests:\s*)\|\s*$/', $line, $matches ) ) {
+        $columns = strpos( $line, '|' );
+        break;
+    }
+}
+
+if( !$columns ) {
+    print "Something seems to be wrong; couldn't extract line length.\n";
+    print "Check that UTF-8-test.txt was downloaded correctly from\n";
+    print "http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt\n";
+    exit(-1);
+}
+
+# print "$columns\n";
+
+# Test ids that the main loop skips outright; they never reach testLine().
+$ignore = array(
+ # These two lines actually seem to be corrupt
+ '2.1.1', '2.2.1' );
+
+# Test ids whose pass/fail expectation is inverted: testLine() XORs
+# membership in this list into its result.
+$exceptions = array(
+ # Tests that should mark invalid characters due to using long
+ # sequences beyond what is now considered legal.
+ '2.1.5', '2.1.6', '2.2.4', '2.2.5', '2.2.6', '2.3.5',
+
+ # Literal 0xffff, which is illegal
+ '2.2.3' );
+
+# Test ids whose data follows on later lines; the main loop feeds those
+# lines to testLine() one by one instead of the heading line itself.
+# NOTE(review): the main loop only looks up ids matching \d+\.\d+\.\d+,
+# so the two-part id '3.4' can never be found here -- confirm intent.
+$longTests = array(
+ # These tests span multiple lines
+ '3.1.9', '3.2.1', '3.2.2', '3.2.3', '3.2.4', '3.2.5',
+ '3.4' );
+
+# These tests are not in proper subsections
+# NOTE(review): $sectionTests is not read anywhere in this script.
+$sectionTests = array( '3.4' );
+
+# $section is assigned by the main loop when a section heading is seen but
+# is not read anywhere in this script.
+$section = NULL;
+# Id of the test currently being processed.
+$test = '';
+# Running tallies; testLine() updates them through reference parameters.
+$failed = 0;
+$success = 0;
+$total = 0;
+# Walk the remainder of the file. Section headings ("3  Malformed ... |")
+# are echoed; numbered test lines ("3.1.1 ...") are handed to testLine().
+while( false !== ( $line = fgets( $in ) ) ) {
+    if( preg_match( '/^(\d+)\s+(.*?)\s*\|/', $line, $matches ) ) {
+        $section = $matches[1];
+        print $line;
+        continue;
+    }
+    if( preg_match( '/^(\d+\.\d+\.\d+)\s*/', $line, $matches ) ) {
+        $test = $matches[1];
+
+        if( in_array( $test, $ignore ) ) {
+            continue;
+        }
+        if( in_array( $test, $longTests ) ) {
+            # Multi-line test: discard the line after the heading, then
+            # test every following line until one that holds nothing but
+            # whitespace before the '|' marker.
+            fgets( $in );
+            $line = fgets( $in );
+            # Stop at end of file as well; otherwise a truncated test file
+            # would make fgets() return false forever and never terminate.
+            while( false !== $line && !preg_match( '/^\s+\|/', $line ) ) {
+                testLine( $test, $line, $total, $success, $failed );
+                $line = fgets( $in );
+            }
+        } else {
+            testLine( $test, $line, $total, $success, $failed );
+        }
+    }
+}
+fclose( $in );
+
+# Print the overall verdict and leave with a matching exit status.
+if( !$failed ) {
+    echo "UTF-8 DECODER TEST SUCCESS!\n";
+    exit (0);
+}
+
+echo "\nFailed $failed tests.\n";
+echo "UTF-8 DECODER TEST FAILED\n";
+exit (-1);
+
+
+# Run one line of the test file through UtfNormal::quickIsNFCVerify() and
+# record whether the outcome matches expectations.
+#
+# $test     id of the test the line belongs to (e.g. '3.1.9')
+# $line     raw line as read from the test file
+# $total    (by reference) incremented for every line checked
+# $success  (by reference) incremented when the line passes
+# $failed   (by reference) incremented when the line fails
+#
+# A line passes when "left untouched by the clean-up" differs from "belongs
+# to section 3 or later", flipped again for ids listed in $exceptions, and
+# the text before '|' in the cleaned line is exactly $columns long.
+# Failing lines (or all lines when $verbose is set) are printed with the
+# measured length appended.
+function testLine( $test, $line, &$total, &$success, &$failed ) {
+    global $columns, $exceptions, $verbose;
+
+    $stripped = $line;
+    UtfNormal::quickisNFCVerify( $stripped );
+
+    # Measure the part before the '|' marker in characters; fall back to a
+    # byte count when the character count comes out as zero.
+    $beforeBar = substr( $stripped, 0, strpos( $stripped, '|' ) );
+    $len = mb_strlen( $beforeBar );
+    if( $len == 0 ) {
+        $len = strlen( $beforeBar );
+    }
+
+    $same = ( $line == $stripped );
+    $ok = $same ^ ($test >= 3 );
+    $ok ^= in_array( $test, $exceptions );
+    $ok &= ($columns == $len);
+
+    $total++;
+    if( !$ok ) {
+        $failed++;
+    } else {
+        $success++;
+    }
+
+    if( $verbose || !$ok ) {
+        print str_replace( "\n", "$len\n", $stripped );
+    }
+}
+
+?>
\ No newline at end of file
define( 'UNICODE_HANGUL_VEND', UNICODE_HANGUL_VBASE + UNICODE_HANGUL_VCOUNT - 1 );
define( 'UNICODE_HANGUL_TEND', UNICODE_HANGUL_TBASE + UNICODE_HANGUL_TCOUNT - 1 );
+# Code point limits used when validating input: the surrogate range, the
+# highest code point, and the replacement character.
+define( 'UNICODE_SURROGATE_FIRST', 0xd800 );
+define( 'UNICODE_SURROGATE_LAST', 0xdfff );
+define( 'UNICODE_MAX', 0x10ffff );
+define( 'UNICODE_REPLACEMENT', 0xfffd );
+
+
define( 'UTF8_HANGUL_FIRST', codepointToUtf8( UNICODE_HANGUL_FIRST ) );
define( 'UTF8_HANGUL_LAST', codepointToUtf8( UNICODE_HANGUL_LAST ) );
define( 'UTF8_HANGUL_VEND', codepointToUtf8( UNICODE_HANGUL_VEND ) );
define( 'UTF8_HANGUL_TEND', codepointToUtf8( UNICODE_HANGUL_TEND ) );
+# The same limits as UTF-8 byte strings, so that quickIsNFCVerify() can
+# compare a decoded byte sequence against them directly.
+define( 'UTF8_SURROGATE_FIRST', codepointToUtf8( UNICODE_SURROGATE_FIRST ) );
+define( 'UTF8_SURROGATE_LAST', codepointToUtf8( UNICODE_SURROGATE_LAST ) );
+define( 'UTF8_MAX', codepointToUtf8( UNICODE_MAX ) );
+define( 'UTF8_REPLACEMENT', codepointToUtf8( UNICODE_REPLACEMENT ) );
+# Alternative replacement value, left commented out.
+#define( 'UTF8_REPLACEMENT', '!' );
+
+# Upper bounds for overlong encodings: quickIsNFCVerify() rejects a
+# sequence whose head byte is 0xc0/0xc1, 0xe0 or 0xf0 when it compares
+# less than or equal to the corresponding value below.
+define( 'UTF8_OVERLONG_A', "\xc1\xbf" );
+define( 'UTF8_OVERLONG_B', "\xe0\x9f\xbf" );
+define( 'UTF8_OVERLONG_C', "\xf0\x8f\xbf\xbf" );
+
+# These two ranges are illegal
+define( 'UTF8_FDD0', codepointToUtf8( 0xfdd0 ) );
+define( 'UTF8_FDEF', codepointToUtf8( 0xfdef ) );
+define( 'UTF8_FFFE', codepointToUtf8( 0xfffe ) );
+define( 'UTF8_FFFF', codepointToUtf8( 0xffff ) );
+
+# Decoder states used by quickIsNFCVerify(): expecting the first byte of a
+# character (HEAD) or a continuation byte (TAIL).
+define( 'UTF8_HEAD', false );
+define( 'UTF8_TAIL', true );
+
+
class UtfNormal {
+ # One-stop entry point: scrub invalid UTF-8 sequences out of $string and
+ # return it in normal form C. The full normalization pass is skipped
+ # whenever the quick check already reports the cleaned string as NFC.
+ function cleanUp( $string ) {
+     # quickIsNFCVerify() rewrites $string in place while it checks it.
+     $alreadyNormal = UtfNormal::quickIsNFCVerify( $string );
+     return $alreadyNormal ? $string : UtfNormal::NFC( $string );
+ }
+
# These functions try to skip the conversion if it won't be necessary.
# An all ASCII string for instance doesn't need conversion.
function toNFC( $string ) {
return $string;
}
-
+ # Returns true if the string is _definitely_ in NFC.
+ # Returns false if not or uncertain.
function quickIsNFC( $string ) {
# ASCII is always valid NFC!
- if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
+ # If it's pure ASCII and doesn't contain any XML-forbidden chars, let it through.
+ if( !preg_match( '/[\x00-\x08\x0b\x0c\x0f-\x1f\x80-\xff]/', $string ) ) return true;
global $utfCheckNFC, $utfCombiningClass;
$len = strlen( $string );
}
return true;
}
+
+ # Like quickIsNFC(), but also *alters the string in place*: invalid UTF-8
+ # sequences and control characters other than tab and line feed are
+ # replaced by UTF8_REPLACEMENT, and carriage returns are removed.
+ #
+ # $string  (by reference) UTF-8 text to check; rewritten with the
+ #          cleaned-up bytes.
+ # Returns true if the cleaned string is definitely in NFC, false if a
+ # character listed in $utfCheckNFC or $utfCombiningClass was seen and the
+ # slow check is needed.
+ function quickIsNFCVerify( &$string ) {
+     if( !preg_match( '/[\x80-\xff]/', $string ) ) {
+         # Pure ASCII is always valid NFC, but it can still carry control
+         # characters that the byte loop below strips or replaces. Apply
+         # the same clean-up here so both paths agree, while keeping the
+         # common all-ASCII case cheap.
+         if( preg_match( '/[\x00-\x08\x0b-\x1f]/', $string ) ) {
+             $string = str_replace( "\r", '', $string );
+             $string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
+                 UTF8_REPLACEMENT, $string );
+         }
+         return true;
+     }
+
+     global $utfCheckNFC, $utfCombiningClass;
+     $len = strlen( $string );
+     $out = '';
+     $state = UTF8_HEAD;
+     $looksNormal = true;
+
+     # $head remembers the lead byte of the sequence being (or last) built.
+     # It is cleared only after a *valid* sequence completes; while it is
+     # non-zero, stray continuation bytes are dropped without emitting a
+     # replacement character.
+     # NOTE(review): this means stray tail bytes that follow a rejected
+     # sequence are swallowed silently -- confirm that is intended.
+     $head = 0;
+     $remaining = 0;
+     $sequence = '';
+     for( $i = 0; $i < $len; $i++ ) {
+         $c = $string[$i];
+         $n = ord( $c );
+         if( $state == UTF8_TAIL ) {
+             if( $n >= 0x80 && $n < 0xc0 ) {
+                 $sequence .= $c;
+                 if( --$remaining == 0 ) {
+                     # Sequence complete. Reject surrogates, overlong
+                     # forms, the illegal ranges and anything past
+                     # UTF8_MAX; comparisons are byte-wise on the encoded
+                     # strings.
+                     if( ($sequence >= UTF8_SURROGATE_FIRST
+                             && $sequence <= UTF8_SURROGATE_LAST)
+                         || ($head == 0xc0 && $sequence <= UTF8_OVERLONG_A)
+                         || ($head == 0xc1 && $sequence <= UTF8_OVERLONG_A)
+                         || ($head == 0xe0 && $sequence <= UTF8_OVERLONG_B)
+                         || ($head == 0xf0 && $sequence <= UTF8_OVERLONG_C)
+                         || ($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF)
+                         || ($sequence == UTF8_FFFE)
+                         || ($sequence == UTF8_FFFF)
+                         || ($sequence > UTF8_MAX) ) {
+                         $out .= UTF8_REPLACEMENT;
+                         $state = UTF8_HEAD;
+                         continue;
+                     }
+                     if( isset( $utfCheckNFC[$sequence] ) ||
+                         isset( $utfCombiningClass[$sequence] ) ) {
+                         # If it's NO or MAYBE, we'll have to do the slow check.
+                         $looksNormal = false;
+                     }
+                     $out .= $sequence;
+                     $state = UTF8_HEAD;
+                     $head = 0;
+                 }
+                 continue;
+             }
+             # Not a valid tail byte! Discard the char we've been building
+             # and fall through so this byte is handled as a fresh head.
+             #printf ("Invalid '%x' in tail with %d remaining bytes\n", $n, $remaining );
+             $state = UTF8_HEAD;
+             $out .= UTF8_REPLACEMENT;
+         }
+         if( $n < 0x09 ) {
+             $out .= UTF8_REPLACEMENT;
+         } elseif( $n < 0x0b ) {
+             # Tab and line feed pass through.
+             $out .= $c;
+         } elseif( $n == 0x0d ) {
+             # Strip \r (0x0d). Form feed (0x0c) is not a carriage return
+             # and is replaced by the branch below.
+         } elseif( $n < 0x20 ) {
+             $out .= UTF8_REPLACEMENT;
+         } elseif( $n < 0x80 ) {
+             $out .= $c;
+         } elseif( $n < 0xc0 ) {
+             # illegal tail bytes or head byte of overlong sequence
+             if( $head == 0 ) $out .= UTF8_REPLACEMENT;
+         } elseif( $n < 0xe0 ) {
+             $state = UTF8_TAIL;
+             $remaining = 1;
+             $sequence = $c;
+             $head = $n;
+         } elseif( $n < 0xf0 ) {
+             $state = UTF8_TAIL;
+             $remaining = 2;
+             $sequence = $c;
+             $head = $n;
+         } elseif( $n < 0xf8 ) {
+             $state = UTF8_TAIL;
+             $remaining = 3;
+             $sequence = $c;
+             $head = $n;
+         } elseif( $n < 0xfc ) {
+             # 5- and 6-byte forms are still consumed as a unit so that the
+             # whole sequence collapses to one replacement character (they
+             # always compare greater than UTF8_MAX).
+             $state = UTF8_TAIL;
+             $remaining = 4;
+             $sequence = $c;
+             $head = $n;
+         } elseif( $n < 0xfe ) {
+             $state = UTF8_TAIL;
+             $remaining = 5;
+             $sequence = $c;
+             $head = $n;
+         } else {
+             # 0xfe and 0xff never occur in UTF-8.
+             $out .= UTF8_REPLACEMENT;
+         }
+     }
+     if( $state == UTF8_TAIL ) {
+         # Input ended in the middle of a multi-byte sequence.
+         $out .= UTF8_REPLACEMENT;
+     }
+     $string = $out;
+     return $looksNormal;
+ }
# These take a string and run the normalization on them, without
# checking for validity or any optimization etc. Input must be
require_once 'UtfNormalUtil.php';
require_once 'UtfNormal.php';
+define( 'BENCH_CYCLES', 3 );
+
if( php_sapi_name() != 'cli' ) {
die( "Run me from the command line please.\n" );
}
print "Testing $filename ($desc)...\n";
$data = file_get_contents( $filename );
$forms = array( 'placebo',
- 'fastDecompose', 'fastCombiningSort', 'fastCompose',
- 'toNFD', 'toNFKD', 'toNFC', 'toNFKC',
- 'NFD', 'NFKD', 'NFC', 'NFKC' );
+ 'cleanUp',
+ 'toNFC',
+# 'toNFKC',
+# 'toNFD', 'toNFKD',
+ 'NFC',
+# 'NFKC',
+# 'NFD', 'NFKD',
+# 'fastDecompose', 'fastCombiningSort', 'fastCompose',
+ 'quickIsNFC', 'quickIsNFCVerify',
+ );
foreach( $forms as $form ) {
benchmarkForm( $u, $data, $form );
}
function benchmarkForm( &$u, &$data, $form ) {
global $utfCanonicalDecomp;
$start = benchTime();
- $out = $u->$form( $data, $utfCanonicalDecomp );
- $delta = benchTime() - $start;
+ # Calls $u->$form() BENCH_CYCLES times on $data and prints the average
+ # time per call, the throughput in bytes/s, and whether the last result
+ # equals the input.
+ for( $i = 0; $i < BENCH_CYCLES; $i++ ) {
+ # Hand each cycle its own copy: forms that take their argument by
+ # reference (quickIsNFCVerify) rewrite it, which would otherwise change
+ # the caller's $data and make later cycles measure different input.
+ $input = $data;
+ $out = $u->$form( $input, $utfCanonicalDecomp );
+ }
+ $delta = (benchTime() - $start) / BENCH_CYCLES;
+ # A zero delta (input too small for the timer) would divide by zero.
+ $rate = ( $delta > 0 ) ? IntVal( strlen( $data ) / $delta ) : 0;
$same = (0 == strcmp( $data, $out ) );
- printf( " %4s %1.4fs (%s)\n", $form, $delta, ($same ? 'no change' : 'changed' ) );
+ printf( " %20s %1.4fs %8d bytes/s (%s)\n", $form, $delta, $rate, ($same ? 'no change' : 'changed' ) );
}
?>
\ No newline at end of file
$cols = explode( ';', $line );
$char = codepointToUtf8( hexdec( $cols[0] ) );
$desc = $cols[0] . ": " . $cols[1];
+ if( $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST ) {
+ # Surrogates are illegal on their own or in UTF-8, ignore.
+ continue;
+ }
if( empty( $testedChars[$char] ) ) {
$total++;
if( testInvariant( $normalizer, $char, $desc ) ) {