includes/normal/UtfNormal.php

   1 <?php
   2 # Copyright (C) 2004 Brion Vibber <brion@pobox.com>
   3 # http://www.mediawiki.org/
   4 #
   5 # This program is free software; you can redistribute it and/or modify
   6 # it under the terms of the GNU General Public License as published by
   7 # the Free Software Foundation; either version 2 of the License, or
   8 # (at your option) any later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License along
  16 # with this program; if not, write to the Free Software Foundation, Inc.,
  17 # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  18 # http://www.gnu.org/copyleft/gpl.html
  19
  20 /**
  21  * Unicode normalization routines for working with UTF-8 strings.
  22  * Currently assumes that input strings are valid UTF-8!
  23  *
  24  * Not as fast as I'd like, but should be usable for most purposes.
  25  * UtfNormal::toNFC() will bail early if given ASCII text or text
   26  * it can quickly determine is already normalized.
  27  *
   28  * All functions can be called statically.
  29  *
  30  * See description of forms at http://www.unicode.org/reports/tr15/
  31  *
  32  * @package MediaWiki
  33  */
  34
  35 /** */
  36 require_once 'UtfNormalUtil.php';
  37 require_once 'UtfNormalData.inc';
  38
  39 # Load compatibility decompositions on demand if they are needed.
  40 global $utfCompatibilityDecomp;
  41 $utfCompatibilityDecomp = NULL;
  42
  43 define( 'UNICODE_HANGUL_FIRST', 0xac00 );
  44 define( 'UNICODE_HANGUL_LAST',  0xd7a3 );
  45
  46 define( 'UNICODE_HANGUL_LBASE', 0x1100 );
  47 define( 'UNICODE_HANGUL_VBASE', 0x1161 );
  48 define( 'UNICODE_HANGUL_TBASE', 0x11a7 );
  49
  50 define( 'UNICODE_HANGUL_LCOUNT', 19 );
  51 define( 'UNICODE_HANGUL_VCOUNT', 21 );
  52 define( 'UNICODE_HANGUL_TCOUNT', 28 );
  53 define( 'UNICODE_HANGUL_NCOUNT', UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT );
  54
  55 define( 'UNICODE_HANGUL_LEND', UNICODE_HANGUL_LBASE + UNICODE_HANGUL_LCOUNT - 1 );
  56 define( 'UNICODE_HANGUL_VEND', UNICODE_HANGUL_VBASE + UNICODE_HANGUL_VCOUNT - 1 );
  57 define( 'UNICODE_HANGUL_TEND', UNICODE_HANGUL_TBASE + UNICODE_HANGUL_TCOUNT - 1 );
  58
  59 define( 'UNICODE_SURROGATE_FIRST', 0xd800 );
  60 define( 'UNICODE_SURROGATE_LAST', 0xdfff );
  61 define( 'UNICODE_MAX', 0x10ffff );
  62 define( 'UNICODE_REPLACEMENT', 0xfffd );
  63
  64
  65 define( 'UTF8_HANGUL_FIRST', codepointToUtf8( UNICODE_HANGUL_FIRST ) );
  66 define( 'UTF8_HANGUL_LAST', codepointToUtf8( UNICODE_HANGUL_LAST ) );
  67
  68 define( 'UTF8_HANGUL_LBASE', codepointToUtf8( UNICODE_HANGUL_LBASE ) );
  69 define( 'UTF8_HANGUL_VBASE', codepointToUtf8( UNICODE_HANGUL_VBASE ) );
  70 define( 'UTF8_HANGUL_TBASE', codepointToUtf8( UNICODE_HANGUL_TBASE ) );
  71
  72 define( 'UTF8_HANGUL_LEND', codepointToUtf8( UNICODE_HANGUL_LEND ) );
  73 define( 'UTF8_HANGUL_VEND', codepointToUtf8( UNICODE_HANGUL_VEND ) );
  74 define( 'UTF8_HANGUL_TEND', codepointToUtf8( UNICODE_HANGUL_TEND ) );
  75
  76 define( 'UTF8_SURROGATE_FIRST', codepointToUtf8( UNICODE_SURROGATE_FIRST ) );
  77 define( 'UTF8_SURROGATE_LAST', codepointToUtf8( UNICODE_SURROGATE_LAST ) );
  78 define( 'UTF8_MAX', codepointToUtf8( UNICODE_MAX ) );
  79 define( 'UTF8_REPLACEMENT', codepointToUtf8( UNICODE_REPLACEMENT ) );
  80 #define( 'UTF8_REPLACEMENT', '!' );
  81
  82 define( 'UTF8_OVERLONG_A', "\xc1\xbf" );
  83 define( 'UTF8_OVERLONG_B', "\xe0\x9f\xbf" );
  84 define( 'UTF8_OVERLONG_C', "\xf0\x8f\xbf\xbf" );
  85
  86 # These two ranges are illegal
  87 define( 'UTF8_FDD0', codepointToUtf8( 0xfdd0 ) );
  88 define( 'UTF8_FDEF', codepointToUtf8( 0xfdef ) );
  89 define( 'UTF8_FFFE', codepointToUtf8( 0xfffe ) );
  90 define( 'UTF8_FFFF', codepointToUtf8( 0xffff ) );
  91
  92 define( 'UTF8_HEAD', false );
  93 define( 'UTF8_TAIL', true );
  94
  95 /**
  96  *
  97  * @package MediaWiki
  98  */
  99 class UtfNormal {
 100         # The ultimate convenience function! Clean up invalid UTF-8 sequences,
 101         # and convert to normal form C. Faster on pure ASCII strings, or
 102         # secondarily on strings which are already definitely normalized.
 103         function cleanUp( $string ) {
 104                 if( UtfNormal::quickIsNFCVerify( $string ) )
 105                         return $string;
 106                 else
 107                         return UtfNormal::NFC( $string );
 108         }
 109
 110         # These functions try to skip the conversion if it won't be necessary.
 111         # An all ASCII string for instance doesn't need conversion.
 112         function toNFC( $string ) {
 113                 if( UtfNormal::quickIsNFC( $string ) )
 114                         return $string;
 115                 else
 116                         return UtfNormal::NFC( $string );
 117         }
 118
 119         function toNFD( $string ) {
 120                 if( preg_match( '/[\x80-\xff]/', $string ) )
 121                         return UtfNormal::NFD( $string );
 122                 else
 123                         return $string;
 124         }
 125
 126         function toNFKC( $string ) {
 127                 if( preg_match( '/[\x80-\xff]/', $string ) )
 128                         return UtfNormal::NFKC( $string );
 129                 else
 130                         return $string;
 131         }
 132
 133         function toNFKD( $string ) {
 134                 if( preg_match( '/[\x80-\xff]/', $string ) )
 135                         return UtfNormal::NFKD( $string );
 136                 else
 137                         return $string;
 138         }
 139
 140         # Returns true if the string is _definitely_ in NFC.
 141         # Returns false if not or uncertain.
 142         function quickIsNFC( $string ) {
 143                 # ASCII is always valid NFC!
 144                 # If it's pure ASCII and doesn't contain any XML-forbidden chars, let it through.
 145                 if( !preg_match( '/[\x00-\x08\x0b\x0c\x0f-\x1f\x80-\xff]/', $string ) ) return true;
 146
 147                 global $utfCheckNFC, $utfCombiningClass;
 148                 $len = strlen( $string );
 149                 for( $i = 0; $i < $len; $i++ ) {
 150                         $c = $string{$i};
 151                         $n = ord( $c );
 152                         if( $n < 0x80 ) {
 153                                 continue;
 154                         } elseif( $n >= 0xf0 ) {
 155                                 $c = substr( $string, $i, 4 );
 156                                 $i += 3;
 157                         } elseif( $n >= 0xe0 ) {
 158                                 $c = substr( $string, $i, 3 );
 159                                 $i += 2;
 160                         } elseif( $n >= 0xc0 ) {
 161                                 $c = substr( $string, $i, 2 );
 162                                 $i++;
 163                         }
 164                         if( isset( $utfCheckNFC[$c] ) ) {
 165                                 # If it's NO or MAYBE, bail and do the slow check.
 166                                 return false;
 167                         }
 168                         if( isset( $utfCombiningClass[$c] ) ) {
 169                                 # Combining character? We might have to do sorting, at least.
 170                                 return false;
 171                         }
 172                 }
 173                 return true;
 174         }
 175
 176         # As above, but also *alter the string* to strip invalid UTF-8 sequences.
 177         function quickIsNFCVerify( &$string ) {
 178                 # ASCII is always valid NFC!
 179                 if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
 180
 181                 global $utfCheckNFC, $utfCombiningClass;
 182                 $len = strlen( $string );
 183                 $out = '';
 184                 $state = UTF8_HEAD;
 185                 $looksNormal = true;
 186
 187                 $rep = false;
 188                 $head = 0;
 189                 for( $i = 0; $i < $len; $i++ ) {
 190                         $c = $string{$i};
 191                         $n = ord( $c );
 192                         if( $state == UTF8_TAIL ) {
 193                                 if( $n >= 0x80 && $n < 0xc0 ) {
 194                                         $sequence .= $c;
 195                                         if( --$remaining == 0 ) {
 196                                                 if( ($sequence >= UTF8_SURROGATE_FIRST
 197                                                                 && $sequence <= UTF8_SURROGATE_LAST)
 198                                                         || ($head == 0xc0 && $sequence <= UTF8_OVERLONG_A)
 199                                                         || ($head == 0xc1 && $sequence <= UTF8_OVERLONG_A)
 200                                                         || ($head == 0xe0 && $sequence <= UTF8_OVERLONG_B)
 201                                                         || ($head == 0xf0 && $sequence <= UTF8_OVERLONG_C)
 202                                                         || ($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF)
 203                                                         || ($sequence == UTF8_FFFE)
 204                                                         || ($sequence == UTF8_FFFF)
 205                                                         || ($sequence > UTF8_MAX) ) {
 206                                                         $out .= UTF8_REPLACEMENT;
 207                                                         $state = UTF8_HEAD;
 208                                                         continue;
 209                                                 }
 210                                                 if( isset( $utfCheckNFC[$sequence] ) ||
 211                                                         isset( $utfCombiningClass[$sequence] ) ) {
 212                                                         # If it's NO or MAYBE, we'll have to do the slow check.
 213                                                         $looksNormal = false;
 214                                                 }
 215                                                 $out .= $sequence;
 216                                                 $state = UTF8_HEAD;
 217                                                 $head = 0;
 218                                         }
 219                                         continue;
 220                                 }
 221                                 # Not a valid tail byte! DIscard the char we've been building.
 222                                 #printf ("Invalid '%x' in tail with %d remaining bytes\n", $n, $remaining );
 223                                 $state = UTF8_HEAD;
 224                                 $out .= UTF8_REPLACEMENT;
 225                         }
 226                         if( $n < 0x09 ) {
 227                                 $out .= UTF8_REPLACEMENT;
 228                         } elseif( $n == 0x0a ) {
 229                                 $out .= $c;
 230                         } elseif( $n < 0x0d ) {
 231                                 $out .= UTF8_REPLACEMENT;
 232                         } elseif( $n == 0x0d ) {
 233                                 # Strip \r silently
 234                         } elseif( $n < 0x20 ) {
 235                                 $out .= UTF8_REPLACEMENT;
 236                         } elseif( $n < 0x80 ) {
 237                                 $out .= $c;
 238                         } elseif( $n < 0xc0 ) {
 239                                 # illegal tail bytes or head byte of overlong sequence
 240                                 if( $head == 0 ) $out .= UTF8_REPLACEMENT;
 241                         } elseif( $n < 0xe0 ) {
 242                                 $state = UTF8_TAIL;
 243                                 $remaining = 1;
 244                                 $sequence = $c;
 245                                 $head = $n;
 246                         } elseif( $n < 0xf0 ) {
 247                                 $state = UTF8_TAIL;
 248                                 $remaining = 2;
 249                                 $sequence = $c;
 250                                 $head = $n;
 251                         } elseif( $n < 0xf8 ) {
 252                                 $state = UTF8_TAIL;
 253                                 $remaining = 3;
 254                                 $sequence = $c;
 255                                 $head = $n;
 256                         } elseif( $n < 0xfc ) {
 257                                 $state = UTF8_TAIL;
 258                                 $remaining = 4;
 259                                 $sequence = $c;
 260                                 $head = $n;
 261                         } elseif( $n < 0xfe ) {
 262                                 $state = UTF8_TAIL;
 263                                 $remaining = 5;
 264                                 $sequence = $c;
 265                                 $head = $n;
 266                         } else {
 267                                 $out .= UTF8_REPLACEMENT;
 268                         }
 269                 }
 270                 if( $state == UTF8_TAIL ) {
 271                         $out .= UTF8_REPLACEMENT;
 272                 }
 273                 $string = $out;
 274                 return $looksNormal;
 275         }
 276
 277         # These take a string and run the normalization on them, without
 278         # checking for validity or any optimization etc. Input must be
 279         # VALID UTF-8!
 280         function NFC( $string ) {
 281                 return $out = UtfNormal::fastCompose( UtfNormal::NFD( $string ) );
 282         }
 283
 284         function NFD( $string ) {
 285                 global $utfCanonicalDecomp;
 286                 return UtfNormal::fastCombiningSort(
 287                         UtfNormal::fastDecompose( $string, $utfCanonicalDecomp ) );
 288         }
 289
 290         function NFKC( $string ) {
 291                 return UtfNormal::fastCompose( UtfNormal::NFKD( $string ) );
 292         }
 293
 294         function NFKD( $string ) {
 295                 global $utfCompatibilityDecomp;
 296                 if( !isset( $utfCompatibilityDecomp ) ) {
 297                         require_once( 'UtfNormalDataK.inc' );
 298                 }
 299                 return UtfNormal::fastCombiningSort(
 300                         UtfNormal::fastDecompose( $string, $utfCompatibilityDecomp ) );
 301         }
 302
 303
 304         /* Private */
 305         # Perform decomposition of a UTF-8 string into either D or KD form
 306         # (depending on which decomposition map is passed to us).
 307         # Input is assumed to be *valid* UTF-8. Invalid code will break.
 308         function fastDecompose( &$string, &$map ) {
 309                 $len = strlen( $string );
 310                 $out = '';
 311                 for( $i = 0; $i < $len; $i++ ) {
 312                         $c = $string{$i};
 313                         $n = ord( $c );
 314                         if( $n < 0x80 ) {
 315                                 # ASCII chars never decompose
 316                                 # THEY ARE IMMORTAL
 317                                 $out .= $c;
 318                                 continue;
 319                         } elseif( $n >= 0xf0 ) {
 320                                 $c = substr( $string, $i, 4 );
 321                                 $i += 3;
 322                         } elseif( $n >= 0xe0 ) {
 323                                 $c = substr( $string, $i, 3 );
 324                                 $i += 2;
 325                         } elseif( $n >= 0xc0 ) {
 326                                 $c = substr( $string, $i, 2 );
 327                                 $i++;
 328                         }
 329                         if( isset( $map[$c] ) ) {
 330                                 $out .= $map[$c];
 331                         } else {
 332                                 if( $c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST ) {
 333                                         $out .= UtfNormal::decomposeHangul( $c );
 334                                 } else {
 335                                         $out .= $c;
 336                                 }
 337                         }
 338                 }
 339                 return $out;
 340         }
 341
 342         function decomposeHangul( $c ) {
 343                 $codepoint = utf8ToCodepoint( $c );
 344                 $index = $codepoint - UNICODE_HANGUL_FIRST;
 345                 $l = IntVal( $index / UNICODE_HANGUL_NCOUNT );
 346                 $v = IntVal( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
 347                 $t = $index % UNICODE_HANGUL_TCOUNT;
 348                 $out = codepointToUtf8( $l + UNICODE_HANGUL_LBASE );
 349                 $out .= codepointToUtf8( $v + UNICODE_HANGUL_VBASE );
 350                 if( $t ) $out .= codepointToUtf8( $t + UNICODE_HANGUL_TBASE );
 351                 return $out;
 352         }
 353
 354         # Sorts combining characters into canonical order. This is the
 355         # final step in creating decomposed normal forms D and KD.
 356         function fastCombiningSort( $string ) {
 357                 global $utfCombiningClass;
 358                 $replacedCount = 1;
 359                 while( $replacedCount > 0 ) {
 360                         $replacedCount = 0;
 361                         $len = strlen( $string );
 362                         $out = '';
 363                         $lastClass = -1;
 364                         $lastChar = '';
 365                         for( $i = 0; $i < $len; $i++ ) {
 366                                 $c = $string{$i};
 367                                 $n = ord( $c );
 368                                 if( $n >= 0xf0 ) {
 369                                         $c = substr( $string, $i, 4 );
 370                                         $i += 3;
 371                                 } elseif( $n >= 0xe0 ) {
 372                                         $c = substr( $string, $i, 3 );
 373                                         $i += 2;
 374                                 } elseif( $n >= 0xc0 ) {
 375                                         $c = substr( $string, $i, 2 );
 376                                         $i++;
 377                                 }
 378                                 $class = isset( $utfCombiningClass[$c] ) ? $utfCombiningClass[$c] : 0;
 379                                 if( $lastClass == -1 ) {
 380                                         # First one
 381                                         $lastChar = $c;
 382                                         $lastClass = $class;
 383                                 } elseif( $lastClass > $class && $class > 0 ) {
 384                                         # Swap -- put this one on the stack
 385                                         $out .= $c;
 386                                         $replacedCount++;
 387                                 } else {
 388                                         $out .= $lastChar;
 389                                         $lastChar = $c;
 390                                         $lastClass = $class;
 391                                 }
 392                         }
 393                         $out .= $lastChar;
 394                         $string = $out;
 395                 }
 396                 return $string;
 397         }
 398
 399         # Produces canonically composed sequences, i.e. normal form C or KC.
 400         # Input must be valid UTF-8 in sorted normal form D or KD.
 401         function fastCompose( $string ) {
 402                 global $utfCanonicalComp, $utfCombiningClass;
 403                 $len = strlen( $string );
 404                 $out = '';
 405                 $lastClass = -1;
 406                 $startChar = '';
 407                 $combining = '';
 408                 for( $i = 0; $i < $len; $i++ ) {
 409                         $c = $string{$i};
 410                         $n = ord( $c );
 411                         if( $n >= 0xf0 ) {
 412                                 $c = substr( $string, $i, 4 );
 413                                 $i += 3;
 414                         } elseif( $n >= 0xe0 ) {
 415                                 $c = substr( $string, $i, 3 );
 416                                 $i += 2;
 417                         } elseif( $n >= 0xc0 ) {
 418                                 $c = substr( $string, $i, 2 );
 419                                 $i++;
 420                         }
 421                         $class = isset( $utfCombiningClass[$c] ) ? $utfCombiningClass[$c] : 0;
 422                         $pair = $startChar . $c;
 423                         if( empty( $utfCombiningClass[$c] ) ) {
 424                                 # New start char
 425                                 if( $lastClass == 0 && isset( $utfCanonicalComp[$pair] ) ) {
 426                                         $startChar = $utfCanonicalComp[$pair];
 427                                 } elseif( $lastClass == 0 &&
 428                                           $c >= UTF8_HANGUL_VBASE &&
 429                                           $c <= UTF8_HANGUL_VEND &&
 430                                           $startChar >= UTF8_HANGUL_LBASE &&
 431                                           $startChar <= UTF8_HANGUL_LEND ) {
 432                                         $lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
 433                                         $vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
 434                                         $hangulPoint = UNICODE_HANGUL_FIRST +
 435                                                 UNICODE_HANGUL_TCOUNT *
 436                                                 (UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);
 437                                         $startChar = codepointToUtf8( $hangulPoint );
 438                                 } elseif( $lastClass == 0 &&
 439                                           $c >= UTF8_HANGUL_TBASE &&
 440                                           $c <= UTF8_HANGUL_TEND &&
 441                                           $startChar >= UTF8_HANGUL_FIRST &&
 442                                           $startChar <= UTF8_HANGUL_LAST ) {
 443                                         $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
 444                                         $hangulPoint = utf8ToCodepoint( $startChar ) + $tIndex;
 445                                         $startChar = codepointToUtf8( $hangulPoint );
 446                                 } else {
 447                                         $out .= $startChar;
 448                                         $out .= $combining;
 449                                         $startChar = $c;
 450                                         $combining = '';
 451                                 }
 452                         } else {
 453                                 # A combining char; see what we can do with it
 454                                 if( !empty( $startChar ) &&
 455                                         $lastClass < $class &&
 456                                         $class > 0 &&
 457                                         isset( $utfCanonicalComp[$pair] ) ) {
 458                                         $startChar = $utfCanonicalComp[$pair];
 459                                         $class = 0;
 460                                 } else {
 461                                         $combining .= $c;
 462                                 }
 463                         }
 464                         $lastClass = $class;
 465                 }
 466                 $out .= $startChar . $combining;
 467                 return $out;
 468         }
 469
 470         # This is just used for the benchmark, comparing how long it takes to
 471         # interate through a string without really doing anything of substance.
 472         function placebo( $string ) {
 473                 $len = strlen( $string );
 474                 $out = '';
 475                 for( $i = 0; $i < $len; $i++ ) {
 476                         $out .= $string{$i};
 477                 }
 478                 return $out;
 479         }
 480 }
 481
 482 ?>