define( 'UTF8_HEAD', false );
define( 'UTF8_TAIL', true );
+
+/**
+ * Normalization mode constants passed to utf8_normalize() when using the ICU wrapper
+ */
+define( 'UNORM_NONE', 1 );
+define( 'UNORM_NFD', 2 );
+define( 'UNORM_NFKD', 3 );
+define( 'UNORM_NFC', 4 );
+define( 'UNORM_DEFAULT', UNORM_NFC );
+define( 'UNORM_NFKC', 5 );
+define( 'UNORM_FCD', 6 );
+
+define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
+
/**
*
* @package MediaWiki
* @return string a UTF-8 string in normal form C
*/
function toNFC( $string ) {
- if( UtfNormal::quickIsNFC( $string ) )
+ if( NORMALIZE_ICU )
+ return utf8_normalize( $string, UNORM_NFC );
+ elseif( UtfNormal::quickIsNFC( $string ) )
return $string;
else
return UtfNormal::NFC( $string );
* @return string a UTF-8 string in normal form D
*/
function toNFD( $string ) {
- if( preg_match( '/[\x80-\xff]/', $string ) )
+ if( NORMALIZE_ICU )
+ return utf8_normalize( $string, UNORM_NFD );
+ elseif( preg_match( '/[\x80-\xff]/', $string ) )
return UtfNormal::NFD( $string );
else
return $string;
* @return string a UTF-8 string in normal form KC
*/
function toNFKC( $string ) {
- if( preg_match( '/[\x80-\xff]/', $string ) )
+ if( NORMALIZE_ICU )
+ return utf8_normalize( $string, UNORM_NFKC );
+ elseif( preg_match( '/[\x80-\xff]/', $string ) )
return UtfNormal::NFKC( $string );
else
return $string;
* @return string a UTF-8 string in normal form KD
*/
function toNFKD( $string ) {
- if( preg_match( '/[\x80-\xff]/', $string ) )
+ if( NORMALIZE_ICU )
+ return utf8_normalize( $string, UNORM_NFKD );
+ elseif( preg_match( '/[\x80-\xff]/', $string ) )
return UtfNormal::NFKD( $string );
else
return $string;
}
}
+if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
+ dl( 'php_utfnormal.so' );
+}
+
require_once 'UtfNormalUtil.php';
require_once 'UtfNormal.php';
$cols = explode( ';', $line );
$char = codepointToUtf8( hexdec( $cols[0] ) );
$desc = $cols[0] . ": " . $cols[1];
- if( $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST ) {
+ if( $char === "\x00" || $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST ) {
+ # Can't check U+0000 (NUL) with the ICU plugin, as null bytes fail in C land.
# Surrogates are illegal on their own or in UTF-8, ignore.
continue;
}