Add support for using ICU to perform normalization, which is much much faster than...
author Brion Vibber <brion@users.mediawiki.org>
Thu, 7 Oct 2004 05:59:10 +0000 (05:59 +0000)
committer Brion Vibber <brion@users.mediawiki.org>
Thu, 7 Oct 2004 05:59:10 +0000 (05:59 +0000)
Still need to add support for cleanup/verification.

includes/normal/UtfNormal.php
includes/normal/UtfNormalBench.php
includes/normal/UtfNormalTest.php

index 3ea8ef7..110793a 100644 (file)
@@ -92,6 +92,20 @@ define( 'UTF8_FFFF', codepointToUtf8( 0xffff ) );
 define( 'UTF8_HEAD', false );
 define( 'UTF8_TAIL', true );
 
+
+/**
+ * Normalization mode constants passed as the second argument to the ICU wrapper, utf8_normalize(); numbering follows ICU's UNormalizationMode enum.
+ */
+define( 'UNORM_NONE', 1 ); # no decomposition or composition
+define( 'UNORM_NFD',  2 ); # canonical decomposition
+define( 'UNORM_NFKD', 3 ); # compatibility decomposition
+define( 'UNORM_NFC',  4 ); # canonical decomposition, then canonical composition
+define( 'UNORM_DEFAULT', UNORM_NFC ); # alias: NFC is the default mode
+define( 'UNORM_NFKC', 5 ); # compatibility decomposition, then canonical composition
+define( 'UNORM_FCD',  6 ); # ICU's "fast C or D" form
+
+define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) ); # true only if utf8_normalize() is already defined when this file is included
+
 /**
  *
  * @package MediaWiki
@@ -123,7 +137,9 @@ class UtfNormal {
         * @return string a UTF-8 string in normal form C
         */
        function toNFC( $string ) {
-               if( UtfNormal::quickIsNFC( $string ) )
+               if( NORMALIZE_ICU )
+                       return utf8_normalize( $string, UNORM_NFC );
+               elseif( UtfNormal::quickIsNFC( $string ) )
                        return $string;
                else
                        return UtfNormal::NFC( $string );
@@ -137,7 +153,9 @@ class UtfNormal {
         * @return string a UTF-8 string in normal form D
         */
        function toNFD( $string ) {
-               if( preg_match( '/[\x80-\xff]/', $string ) )
+               if( NORMALIZE_ICU )
+                       return utf8_normalize( $string, UNORM_NFD );
+               elseif( preg_match( '/[\x80-\xff]/', $string ) )
                        return UtfNormal::NFD( $string );
                else
                        return $string;
@@ -152,7 +170,9 @@ class UtfNormal {
         * @return string a UTF-8 string in normal form KC
         */
        function toNFKC( $string ) {
-               if( preg_match( '/[\x80-\xff]/', $string ) )
+               if( NORMALIZE_ICU )
+                       return utf8_normalize( $string, UNORM_NFKC );
+               elseif( preg_match( '/[\x80-\xff]/', $string ) )
                        return UtfNormal::NFKC( $string );
                else
                        return $string;
@@ -167,7 +187,9 @@ class UtfNormal {
         * @return string a UTF-8 string in normal form KD
         */
        function toNFKD( $string ) {
-               if( preg_match( '/[\x80-\xff]/', $string ) )
+               if( NORMALIZE_ICU )
+                       return utf8_normalize( $string, UNORM_NFKD );
+               elseif( preg_match( '/[\x80-\xff]/', $string ) )
                        return UtfNormal::NFKD( $string );
                else
                        return $string;
index 2e1740c..d42d592 100644 (file)
  */
 
 /** */
+if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) { # opt in by passing --icu on the command line
+       dl( 'php_utfnormal.so' ); # load before the require_once lines below; presumably provides utf8_normalize(), which UtfNormal.php probes at include time
+}
+
 require_once 'UtfNormalUtil.php';
 require_once 'UtfNormal.php';
 
index 6360a7c..16992be 100644 (file)
@@ -44,6 +44,10 @@ if( defined( 'PRETTY_UTF8' ) ) {
        }       
 }
 
+if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) { # opt in by passing --icu on the command line
+       dl( 'php_utfnormal.so' ); # load before the require_once lines below; presumably provides utf8_normalize(), which UtfNormal.php probes at include time
+}
+
 require_once 'UtfNormalUtil.php';
 require_once 'UtfNormal.php';
 
@@ -106,7 +110,8 @@ while( false !== ($line = fgets( $in ) ) ) {
        $cols = explode( ';', $line );
        $char = codepointToUtf8( hexdec( $cols[0] ) );
        $desc = $cols[0] . ": " . $cols[1];
-       if( $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST ) {
+       if( $char === "\x00" || $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST ) {
+               # Can't check U+0000 (NUL) with the ICU plugin, as null bytes fail in C land.
                # Surrogates are illegal on their own or in UTF-8, ignore.
                continue;
        }