From: Brion Vibber <brion@users.mediawiki.org>
Date: Fri, 3 Sep 2004 05:39:30 +0000 (+0000)
Subject: Add UtfNormal::cleanUp() function: strips XML-unsafe characters and illegal UTF-8... 
X-Git-Tag: 1.5.0alpha1~2153
X-Git-Url: https://git.cyclocoop.org/%7B%24www_url%7Dadmin/compta/banques/?a=commitdiff_plain;h=ed46bd50fe1f70131bfa77ca9eb79a6ba539c490;p=lhc%2Fweb%2Fwiklou.git

Add UtfNormal::cleanUp() function: strips XML-unsafe characters and illegal UTF-8 sequences, then normalizes to form C.
---

diff --git a/includes/normal/Makefile b/includes/normal/Makefile
index dbb1c70c06..04435604b7 100644
--- a/includes/normal/Makefile
+++ b/includes/normal/Makefile
@@ -9,9 +9,12 @@ all : UtfNormalData.inc
 UtfNormalData.inc : UtfNormalGenerate.php UtfNormalUtil.php UnicodeData.txt CompositionExclusions.txt NormalizationCorrections.txt DerivedNormalizationProps.txt
 	$(PHP) UtfNormalGenerate.php
 
-test : UtfNormalTest.php UtfNormalData.inc NormalizationTest.txt
+test : testutf8 UtfNormalTest.php UtfNormalData.inc NormalizationTest.txt
 	$(PHP) UtfNormalTest.php
 
+testutf8 : Utf8Test.php UTF-8-test.txt
+	$(PHP) Utf8Test.php
+
 bench : UtfNormalData.inc
 	$(PHP) UtfNormalBench.php
 
@@ -37,3 +40,5 @@ DerivedNormalizationProps.txt :
 UnicodeData.txt :
 	$(FETCH) $(BASE)/UnicodeData.txt
 
+UTF-8-test.txt :
+	$(FETCH) http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
diff --git a/includes/normal/README b/includes/normal/README
index 26248b428e..3424507d6f 100644
--- a/includes/normal/README
+++ b/includes/normal/README
@@ -1,10 +1,16 @@
-This directory contains some Unicode normalization routines.
+This directory contains some Unicode normalization routines. These routines
+are meant to be reusable in other projects, so I'm not tying them to the
+MediaWiki utility functions.
 
 The main function to care about is UtfNormal::toNFC(); this will convert
 a given UTF-8 string to Normalization Form C if it's not already such.
 The function assumes that the input string is already valid UTF-8; if there
 are corrupt characters this may produce erroneous results.
 
+To also check for illegal characters, use UtfNormal::cleanUp(). This will
+strip illegal UTF-8 sequences and characters that are illegal in XML, and
+if necessary convert to normalization form C.
+
 Performance is kind of stinky in absolute terms, though it should be speedy
 on pure ASCII text. ;) On text that can be determined quickly to already be
 in NFC it's not too awful but it can quickly get uncomfortably slow,
@@ -14,10 +20,10 @@ extra slow).
 
 == Regenerating data tables ==
 
-UtfNormalData.inc is generated from the Unicode Character Database by
-the script UtfNormalGenerate.php. On a *nix system 'make' should fetch the
-necessary files and regenerate it if the scripts have been changed or you
-remove it.
+UtfNormalData.inc and UtfNormalDataK.inc are generated from the Unicode
+Character Database by the script UtfNormalGenerate.php. On a *nix system
+'make' should fetch the necessary files and regenerate them if the scripts
+have been changed or you remove them.
 
 
 == Testing ==
diff --git a/includes/normal/Utf8Test.php b/includes/normal/Utf8Test.php
new file mode 100644
index 0000000000..c3ab2498e1
--- /dev/null
+++ b/includes/normal/Utf8Test.php
@@ -0,0 +1,145 @@
+<?php
+# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
+# http://www.mediawiki.org/
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or 
+# (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+# http://www.gnu.org/copyleft/gpl.html
+
+# Runs the UTF-8 decoder test at:
+# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
+
+require_once 'UtfNormalUtil.php';
+require_once 'UtfNormal.php';
+mb_internal_encoding( "utf-8" );
+
+#$verbose = true;
+if( php_sapi_name() != 'cli' ) {
+	die( "Run me from the command line please.\n" );
+}
+
+$in = fopen( "UTF-8-test.txt", "rt" );
+if( !$in ) {
+	print "Couldn't open UTF-8-test.txt -- can't run tests.\n";
+	print "If necessary, manually download this file. It can be obtained at\n";
+	print "http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt";
+	exit(-1);
+}
+
+$columns = 0;
+while( false !== ( $line = fgets( $in ) ) ) {
+	if( preg_match( '/^(Here come the tests:\s*)\|$/', $line, $matches ) ) {
+		$columns = strpos( $line, '|' );
+		break;
+	}
+}
+
+if( !$columns ) {
+	print "Something seems to be wrong; couldn't extract line length.\n";
+	print "Check that UTF-8-test.txt was downloaded correctly from\n";
+	print "http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt";
+	exit(-1);
+}
+
+# print "$columns\n";
+
+$ignore = array(
+	# These two lines actually seem to be corrupt
+	'2.1.1', '2.2.1' );
+
+$exceptions = array(
+	# Tests that should mark invalid characters due to using long
+	# sequences beyond what is now considered legal.
+	'2.1.5', '2.1.6', '2.2.4', '2.2.5', '2.2.6', '2.3.5',
+	
+	# Literal 0xffff, which is illegal
+	'2.2.3' );
+
+$longTests = array(
+	# These tests span multiple lines
+	'3.1.9', '3.2.1', '3.2.2', '3.2.3', '3.2.4', '3.2.5',
+	'3.4' );
+
+# These tests are not in proper subsections
+$sectionTests = array( '3.4' );
+
+$section = NULL;
+$test = '';
+$failed = 0;
+$success = 0;
+$total = 0;
+# Scan the test body: echo section headings and check each numbered test.
+while( false !== ( $line = fgets( $in ) ) ) {
+	if( preg_match( '/^(\d+)\s+(.*?)\s*\|/', $line, $matches ) ) {
+		$section = $matches[1];
+		print $line;
+		continue;
+	}
+	if( preg_match( '/^(\d+\.\d+\.\d+)\s*/', $line, $matches ) ) {
+		$test = $matches[1];
+		if( in_array( $test, $ignore ) ) continue;
+		if( in_array( $test, $longTests ) ) {
+			# Drop the line after the heading; data runs to the next blank '|' row.
+			$line = fgets( $in );
+			# Also stop at end of file so a truncated download cannot loop forever.
+			for( $line = fgets( $in ); false !== $line && !preg_match( '/^\s+\|/', $line ); $line = fgets( $in ) ) {
+				testLine( $test, $line, $total, $success, $failed );
+			}
+		} else {
+			testLine( $test, $line, $total, $success, $failed );
+		}
+	}
+}
+
+if( $failed ) {
+	echo "\nFailed $failed tests.\n";
+	echo "UTF-8 DECODER TEST FAILED\n";
+	exit (-1);
+}
+
+echo "UTF-8 DECODER TEST SUCCESS!\n";
+exit (0);
+
+# Runs one line of UTF-8-test.txt through the decoder and tallies the outcome.
+function testLine( $test, $line, &$total, &$success, &$failed ) {
+	$stripped = $line;
+	UtfNormal::quickisNFCVerify( $stripped );
+	# $stripped is passed by reference and now holds the cleaned-up text.
+	$same = ( $line == $stripped );
+	$len = mb_strlen( substr( $stripped, 0, strpos( $stripped, '|' ) ) );
+	if( $len == 0 ) {
+		$len = strlen( substr( $stripped, 0, strpos( $stripped, '|' ) ) );
+	}
+	# Tests numbered below 3 must come back unchanged; 3 and up must be altered.
+	global $columns;
+	$ok = $same ^ ($test >= 3 );
+	# Tests listed in $exceptions have the opposite expectation.
+	global $exceptions;
+	$ok ^= in_array( $test, $exceptions );
+	# The '|' marker must still sit at the column measured from the file header.
+	$ok &= ($columns == $len);
+	
+	$total++;
+	if( $ok ) {
+		$success++;
+	} else {
+		$failed++;
+	}
+	global $verbose;
+	if( $verbose || !$ok ) {
+		print str_replace( "\n", "$len\n", $stripped );
+	}
+}
+
+?>
\ No newline at end of file
diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php
index 6d30ea3510..3ccae670d0 100644
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -51,6 +51,12 @@ define( 'UNICODE_HANGUL_LEND', UNICODE_HANGUL_LBASE + UNICODE_HANGUL_LCOUNT - 1
 define( 'UNICODE_HANGUL_VEND', UNICODE_HANGUL_VBASE + UNICODE_HANGUL_VCOUNT - 1 );
 define( 'UNICODE_HANGUL_TEND', UNICODE_HANGUL_TBASE + UNICODE_HANGUL_TCOUNT - 1 );
 
+define( 'UNICODE_SURROGATE_FIRST', 0xd800 );
+define( 'UNICODE_SURROGATE_LAST', 0xdfff );
+define( 'UNICODE_MAX', 0x10ffff );
+define( 'UNICODE_REPLACEMENT', 0xfffd );
+
+
 define( 'UTF8_HANGUL_FIRST', codepointToUtf8( UNICODE_HANGUL_FIRST ) );
 define( 'UTF8_HANGUL_LAST', codepointToUtf8( UNICODE_HANGUL_LAST ) );
 
@@ -62,7 +68,37 @@ define( 'UTF8_HANGUL_LEND', codepointToUtf8( UNICODE_HANGUL_LEND ) );
 define( 'UTF8_HANGUL_VEND', codepointToUtf8( UNICODE_HANGUL_VEND ) );
 define( 'UTF8_HANGUL_TEND', codepointToUtf8( UNICODE_HANGUL_TEND ) );
 
+define( 'UTF8_SURROGATE_FIRST', codepointToUtf8( UNICODE_SURROGATE_FIRST ) );
+define( 'UTF8_SURROGATE_LAST', codepointToUtf8( UNICODE_SURROGATE_LAST ) );
+define( 'UTF8_MAX', codepointToUtf8( UNICODE_MAX ) );
+define( 'UTF8_REPLACEMENT', codepointToUtf8( UNICODE_REPLACEMENT ) );
+#define( 'UTF8_REPLACEMENT', '!' );
+
+define( 'UTF8_OVERLONG_A', "\xc1\xbf" );
+define( 'UTF8_OVERLONG_B', "\xe0\x9f\xbf" );
+define( 'UTF8_OVERLONG_C', "\xf0\x8f\xbf\xbf" );
+
+# These two ranges are illegal
+define( 'UTF8_FDD0', codepointToUtf8( 0xfdd0 ) );
+define( 'UTF8_FDEF', codepointToUtf8( 0xfdef ) );
+define( 'UTF8_FFFE', codepointToUtf8( 0xfffe ) );
+define( 'UTF8_FFFF', codepointToUtf8( 0xffff ) );
+
+define( 'UTF8_HEAD', false );
+define( 'UTF8_TAIL', true );
+
+
 class UtfNormal {
+	# One-stop cleanup: strips invalid UTF-8 sequences from $string and returns
+	# the result in normal form C. Cheapest on pure ASCII input, next cheapest
+	# on text the quick check can already vouch for as normalized.
+	function cleanUp( $string ) {
+		# The verify pass scrubs $string in place; its result says whether the
+		# full (slow) composition pass can be skipped.
+		$alreadyNormal = UtfNormal::quickIsNFCVerify( $string );
+		return $alreadyNormal ? $string : UtfNormal::NFC( $string );
+	}
+
 	# These functions try to skip the conversion if it won't be necessary.
 	# An all ASCII string for instance doesn't need conversion.
 	function toNFC( $string ) {
@@ -93,10 +129,12 @@ class UtfNormal {
 			return $string;
 	}
 	
-	
+	# Returns true if the string is _definitely_ in NFC.
+	# Returns false if not or uncertain.
 	function quickIsNFC( $string ) {
 		# ASCII is always valid NFC!
-		if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
+		# If it's pure ASCII and doesn't contain any XML-forbidden chars, let it through.
+		if( !preg_match( '/[\x00-\x08\x0b\x0c\x0f-\x1f\x80-\xff]/', $string ) ) return true;
 		
 		global $utfCheckNFC, $utfCombiningClass;
 		$len = strlen( $string );
@@ -126,6 +164,105 @@ class UtfNormal {
 		}
 		return true;
 	}
+
+	# Like quickIsNFC(), but also rewrites $string in place: invalid UTF-8 and
+	# control characters below 0x20 other than tab/LF become U+FFFD, \r is dropped.
+	# Returns false if any character is in $utfCheckNFC or $utfCombiningClass.
+	function quickIsNFCVerify( &$string ) {
+		# Leave early only when no byte could be altered by the loop below.
+		if( !preg_match( '/[\x00-\x08\x0b-\x1f\x80-\xff]/', $string ) ) return true;
+		
+		global $utfCheckNFC, $utfCombiningClass;
+		$len = strlen( $string );
+		$out = '';
+		$state = UTF8_HEAD;
+		$looksNormal = true;
+		# Lead byte of the sequence in progress (or last rejected); 0 when none.
+		$head = 0;
+		for( $i = 0; $i < $len; $i++ ) {
+			$c = $string[$i];
+			$n = ord( $c );
+			if( $state == UTF8_TAIL ) {
+				if( $n >= 0x80 && $n < 0xc0 ) {
+					$sequence .= $c;
+					if( --$remaining == 0 ) {
+						# Reject surrogates, overlong forms, noncharacters and values past U+10FFFF.
+						if( ($sequence >= UTF8_SURROGATE_FIRST
+								&& $sequence <= UTF8_SURROGATE_LAST)
+							|| ($head == 0xc0 && $sequence <= UTF8_OVERLONG_A)
+							|| ($head == 0xc1 && $sequence <= UTF8_OVERLONG_A)
+							|| ($head == 0xe0 && $sequence <= UTF8_OVERLONG_B)
+							|| ($head == 0xf0 && $sequence <= UTF8_OVERLONG_C)
+							|| ($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF)
+							|| ($sequence == UTF8_FFFE)
+							|| ($sequence == UTF8_FFFF)
+							|| ($sequence > UTF8_MAX) ) {
+							$out .= UTF8_REPLACEMENT;
+							$state = UTF8_HEAD;
+							continue;
+						}
+						if( isset( $utfCheckNFC[$sequence] ) ||
+							isset( $utfCombiningClass[$sequence] ) ) {
+							# If it's NO or MAYBE, we'll have to do the slow check.
+							$looksNormal = false;
+						}
+						$out .= $sequence;
+						$state = UTF8_HEAD;
+						$head = 0;
+					}
+					continue;
+				}
+				# Not a valid tail byte! Discard the char we've been building.
+				$state = UTF8_HEAD;
+				$out .= UTF8_REPLACEMENT;
+			}
+			if( $n < 0x09 ) {
+				$out .= UTF8_REPLACEMENT;
+			} elseif( $n < 0x0b ) {
+				$out .= $c;
+			} elseif( $n == 0x0d ) {
+				# Strip \r
+			} elseif( $n < 0x20 ) {
+				$out .= UTF8_REPLACEMENT;
+			} elseif( $n < 0x80 ) {
+				$out .= $c;
+			} elseif( $n < 0xc0 ) {
+				# Stray continuation byte: flag it unless it trails an already rejected sequence.
+				if( $head == 0 ) $out .= UTF8_REPLACEMENT;
+			} elseif( $n < 0xe0 ) {
+				$state = UTF8_TAIL;
+				$remaining = 1;
+				$sequence = $c;
+				$head = $n;
+			} elseif( $n < 0xf0 ) {
+				$state = UTF8_TAIL;
+				$remaining = 2;
+				$sequence = $c;
+				$head = $n;
+			} elseif( $n < 0xf8 ) {
+				$state = UTF8_TAIL;
+				$remaining = 3;
+				$sequence = $c;
+				$head = $n;
+			} elseif( $n < 0xfc ) {
+				$state = UTF8_TAIL;
+				$remaining = 4;
+				$sequence = $c;
+				$head = $n;
+			} elseif( $n < 0xfe ) {
+				$state = UTF8_TAIL;
+				$remaining = 5;
+				$sequence = $c;
+				$head = $n;
+			} else {
+				$out .= UTF8_REPLACEMENT;
+			}
+		}
+		if( $state == UTF8_TAIL ) {
+			$out .= UTF8_REPLACEMENT;
+		}
+		$string = $out;
+		return $looksNormal;
+	}
 	
 	# These take a string and run the normalization on them, without
 	# checking for validity or any optimization etc. Input must be
diff --git a/includes/normal/UtfNormalBench.php b/includes/normal/UtfNormalBench.php
index 44cc9324de..dcb83cf503 100644
--- a/includes/normal/UtfNormalBench.php
+++ b/includes/normal/UtfNormalBench.php
@@ -20,6 +20,8 @@
 require_once 'UtfNormalUtil.php';
 require_once 'UtfNormal.php';
 
+define( 'BENCH_CYCLES', 3 );
+
 if( php_sapi_name() != 'cli' ) {
 	die( "Run me from the command line please.\n" );
 }
@@ -41,9 +43,16 @@ function benchmarkTest( &$u, $filename, $desc ) {
 	print "Testing $filename ($desc)...\n";
 	$data = file_get_contents( $filename );
 	$forms = array( 'placebo',
-		'fastDecompose', 'fastCombiningSort', 'fastCompose',
-		'toNFD', 'toNFKD', 'toNFC', 'toNFKC',
-		'NFD', 'NFKD', 'NFC', 'NFKC' );
+		'cleanUp',
+		'toNFC',
+#		'toNFKC',
+#		'toNFD', 'toNFKD',
+		'NFC',
+#		'NFKC',
+#		'NFD', 'NFKD',
+#		'fastDecompose', 'fastCombiningSort', 'fastCompose',
+		'quickIsNFC', 'quickIsNFCVerify',
+		);
 	foreach( $forms as $form ) {
 		benchmarkForm( $u, $data, $form );
 	}
@@ -57,11 +66,14 @@ function benchTime(){
 function benchmarkForm( &$u, &$data, $form ) {
 	global $utfCanonicalDecomp;
 	$start = benchTime();
-	$out = $u->$form( $data, $utfCanonicalDecomp );
-	$delta = benchTime() - $start;
+	for( $i = 0; $i < BENCH_CYCLES; $i++ ) {
+		$out = $u->$form( $data, $utfCanonicalDecomp );
+	}
+	$delta = (benchTime() - $start) / BENCH_CYCLES;
+	$rate = IntVal( strlen( $data ) / $delta );
 	$same = (0 == strcmp( $data, $out ) );
 	
-	printf( " %4s %1.4fs (%s)\n", $form, $delta, ($same ? 'no change' : 'changed' ) );
+	printf( " %20s %1.4fs %8d bytes/s (%s)\n", $form, $delta, $rate, ($same ? 'no change' : 'changed' ) );
 }
 
 ?>
\ No newline at end of file
diff --git a/includes/normal/UtfNormalTest.php b/includes/normal/UtfNormalTest.php
index b585c4ec23..5a37edfd33 100644
--- a/includes/normal/UtfNormalTest.php
+++ b/includes/normal/UtfNormalTest.php
@@ -99,6 +99,10 @@ while( false !== ($line = fgets( $in ) ) ) {
 	$cols = explode( ';', $line );
 	$char = codepointToUtf8( hexdec( $cols[0] ) );
 	$desc = $cols[0] . ": " . $cols[1];
+	if( $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST ) {
+		# Surrogates are illegal on their own or in UTF-8, ignore.
+		continue;
+	}
 	if( empty( $testedChars[$char] ) ) {
 		$total++;
 		if( testInvariant( $normalizer, $char, $desc ) ) {