Add support for using ICU to perform normalization, which is much much faster than...

author Brion Vibber <brion@users.mediawiki.org>

Thu, 7 Oct 2004 05:59:10 +0000 (05:59 +0000)

committer Brion Vibber <brion@users.mediawiki.org>

Thu, 7 Oct 2004 05:59:10 +0000 (05:59 +0000)
author Brion Vibber <brion@users.mediawiki.org>
Thu, 7 Oct 2004 05:59:10 +0000 (05:59 +0000)
committer Brion Vibber <brion@users.mediawiki.org>
Thu, 7 Oct 2004 05:59:10 +0000 (05:59 +0000)
diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php

index 3ea8ef7..110793a 100644 (file)
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -92,6 +92,20 @@ define( 'UTF8_FFFF', codepointToUtf8( 0xffff ) );
  define( 'UTF8_HEAD', false );
  define( 'UTF8_TAIL', true );
  
+
+/**
+ * For using the ICU wrapper
+ */
+define( 'UNORM_NONE', 1 );
+define( 'UNORM_NFD',  2 );
+define( 'UNORM_NFKD', 3 );
+define( 'UNORM_NFC',  4 );
+define( 'UNORM_DEFAULT', UNORM_NFC );
+define( 'UNORM_NFKC', 5 );
+define( 'UNORM_FCD',  6 );
+
+define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
+
  /**
   *
   * @package MediaWiki
@@ -123,7 +137,9 @@ class UtfNormal {
          * @return string a UTF-8 string in normal form C
          */
         function toNFC( $string ) {
-               if( UtfNormal::quickIsNFC( $string ) )
+               if( NORMALIZE_ICU )
+                       return utf8_normalize( $string, UNORM_NFC );
+               elseif( UtfNormal::quickIsNFC( $string ) )
                         return $string;
                 else
                         return UtfNormal::NFC( $string );
@@ -137,7 +153,9 @@ class UtfNormal {
          * @return string a UTF-8 string in normal form D
          */
         function toNFD( $string ) {
-               if( preg_match( '/[\x80-\xff]/', $string ) )
+               if( NORMALIZE_ICU )
+                       return utf8_normalize( $string, UNORM_NFD );
+               elseif( preg_match( '/[\x80-\xff]/', $string ) )
                         return UtfNormal::NFD( $string );
                 else
                         return $string;
@@ -152,7 +170,9 @@ class UtfNormal {
          * @return string a UTF-8 string in normal form KC
          */
         function toNFKC( $string ) {
-               if( preg_match( '/[\x80-\xff]/', $string ) )
+               if( NORMALIZE_ICU )
+                       return utf8_normalize( $string, UNORM_NFKC );
+               elseif( preg_match( '/[\x80-\xff]/', $string ) )
                         return UtfNormal::NFKC( $string );
                 else
                         return $string;
@@ -167,7 +187,9 @@ class UtfNormal {
          * @return string a UTF-8 string in normal form KD
          */
         function toNFKD( $string ) {
-               if( preg_match( '/[\x80-\xff]/', $string ) )
+               if( NORMALIZE_ICU )
+                       return utf8_normalize( $string, UNORM_NFKD );
+               elseif( preg_match( '/[\x80-\xff]/', $string ) )
                         return UtfNormal::NFKD( $string );
                 else
                         return $string;
diff --git a/includes/normal/UtfNormalBench.php b/includes/normal/UtfNormalBench.php

index 2e1740c..d42d592 100644 (file)
--- a/includes/normal/UtfNormalBench.php
+++ b/includes/normal/UtfNormalBench.php
@@ -23,6 +23,10 @@
   */
  
  /** */
+if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
+       dl( 'php_utfnormal.so' );
+}
+
  require_once 'UtfNormalUtil.php';
  require_once 'UtfNormal.php';
  
diff --git a/includes/normal/UtfNormalTest.php b/includes/normal/UtfNormalTest.php

index 6360a7c..16992be 100644 (file)
--- a/includes/normal/UtfNormalTest.php
+++ b/includes/normal/UtfNormalTest.php
@@ -44,6 +44,10 @@ if( defined( 'PRETTY_UTF8' ) ) {
         }       
  }
  
+if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
+       dl( 'php_utfnormal.so' );
+}
+
  require_once 'UtfNormalUtil.php';
  require_once 'UtfNormal.php';
  
@@ -106,7 +110,8 @@ while( false !== ($line = fgets( $in ) ) ) {
         $cols = explode( ';', $line );
         $char = codepointToUtf8( hexdec( $cols[0] ) );
         $desc = $cols[0] . ": " . $cols[1];
-       if( $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST ) {
+       if( $char === "\x00" || $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST ) {
+               # Can't check NULL with the ICU plugin, as null bytes fail in C land.
                 # Surrogates are illegal on their own or in UTF-8, ignore.
                 continue;
         }
author	Brion Vibber <brion@users.mediawiki.org>
	Thu, 7 Oct 2004 05:59:10 +0000 (05:59 +0000)
committer	Brion Vibber <brion@users.mediawiki.org>
	Thu, 7 Oct 2004 05:59:10 +0000 (05:59 +0000)
includes/normal/UtfNormal.php		patch | blob | history
includes/normal/UtfNormalBench.php		patch | blob | history
includes/normal/UtfNormalTest.php		patch | blob | history