includes/normal/UtfNormal.php

   1 <?php
   2 # Copyright (C) 2004 Brion Vibber <brion@pobox.com>
   3 # http://www.mediawiki.org/
   4 #
   5 # This program is free software; you can redistribute it and/or modify
   6 # it under the terms of the GNU General Public License as published by
   7 # the Free Software Foundation; either version 2 of the License, or
   8 # (at your option) any later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License along
  16 # with this program; if not, write to the Free Software Foundation, Inc.,
  17 # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  18 # http://www.gnu.org/copyleft/gpl.html
  19
  20 # Unicode normalization routines for working with UTF-8 strings.
  21 # Currently assumes that input strings are valid UTF-8!
  22 #
  23 # Not as fast as I'd like, but should be usable for most purposes.
  24 # UtfNormal::toNFC() will bail early if given ASCII text or text
  25 # it can quickly determine is already normalized.
  26 #
  27 # All functions can be called static.
  28 #
  29 # See description of forms at http://www.unicode.org/reports/tr15/
  30
  31 require_once 'UtfNormalUtil.php';
  32 require_once 'UtfNormalData.inc';
  33
  34 # Compatibility decompositions are loaded on demand by UtfNormal::NFKD();
  35 global $utfCompatibilityDecomp; # declared global so the assignment below reaches global scope even if this file is included from inside a function
  36 $utfCompatibilityDecomp = NULL; # stays NULL until UtfNormal::NFKD() pulls in UtfNormalDataK.inc
  37
  38 define( 'UNICODE_HANGUL_FIRST', 0xac00 ); # lowest code point handed to UtfNormal::decomposeHangul()
  39 define( 'UNICODE_HANGUL_LAST',  0xd7a3 ); # highest code point handed to UtfNormal::decomposeHangul()
  40
  41 define( 'UNICODE_HANGUL_LBASE', 0x1100 ); # base added to the L index in decomposeHangul()
  42 define( 'UNICODE_HANGUL_VBASE', 0x1161 ); # base added to the V index in decomposeHangul()
  43 define( 'UNICODE_HANGUL_TBASE', 0x11a7 ); # base added to the T index; a T index of 0 emits no trailing character
  44
  45 define( 'UNICODE_HANGUL_LCOUNT', 19 );
  46 define( 'UNICODE_HANGUL_VCOUNT', 21 );
  47 define( 'UNICODE_HANGUL_TCOUNT', 28 );
  48 define( 'UNICODE_HANGUL_NCOUNT', UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT ); # number of syllables per L index (V count times T count)
  49
  50 define( 'UNICODE_HANGUL_LEND', UNICODE_HANGUL_LBASE + UNICODE_HANGUL_LCOUNT - 1 ); # last code point of the L range (inclusive)
  51 define( 'UNICODE_HANGUL_VEND', UNICODE_HANGUL_VBASE + UNICODE_HANGUL_VCOUNT - 1 ); # last code point of the V range (inclusive)
  52 define( 'UNICODE_HANGUL_TEND', UNICODE_HANGUL_TBASE + UNICODE_HANGUL_TCOUNT - 1 ); # last code point of the T range (inclusive)
  53
  54 define( 'UNICODE_SURROGATE_FIRST', 0xd800 );
  55 define( 'UNICODE_SURROGATE_LAST', 0xdfff );
  56 define( 'UNICODE_MAX', 0x10ffff );
  57 define( 'UNICODE_REPLACEMENT', 0xfffd );
  58
  59
  60 define( 'UTF8_HANGUL_FIRST', codepointToUtf8( UNICODE_HANGUL_FIRST ) ); # UTF8_* constants hold the encoded form of the UNICODE_* code points, for direct string comparison
  61 define( 'UTF8_HANGUL_LAST', codepointToUtf8( UNICODE_HANGUL_LAST ) );
  62
  63 define( 'UTF8_HANGUL_LBASE', codepointToUtf8( UNICODE_HANGUL_LBASE ) );
  64 define( 'UTF8_HANGUL_VBASE', codepointToUtf8( UNICODE_HANGUL_VBASE ) );
  65 define( 'UTF8_HANGUL_TBASE', codepointToUtf8( UNICODE_HANGUL_TBASE ) );
  66
  67 define( 'UTF8_HANGUL_LEND', codepointToUtf8( UNICODE_HANGUL_LEND ) );
  68 define( 'UTF8_HANGUL_VEND', codepointToUtf8( UNICODE_HANGUL_VEND ) );
  69 define( 'UTF8_HANGUL_TEND', codepointToUtf8( UNICODE_HANGUL_TEND ) );
  70
  71 define( 'UTF8_SURROGATE_FIRST', codepointToUtf8( UNICODE_SURROGATE_FIRST ) ); # sequences in the surrogate range are replaced by quickIsNFCVerify()
  72 define( 'UTF8_SURROGATE_LAST', codepointToUtf8( UNICODE_SURROGATE_LAST ) );
  73 define( 'UTF8_MAX', codepointToUtf8( UNICODE_MAX ) ); # sequences comparing greater than this are replaced by quickIsNFCVerify()
  74 define( 'UTF8_REPLACEMENT', codepointToUtf8( UNICODE_REPLACEMENT ) ); # emitted by quickIsNFCVerify() in place of rejected input
  75 #define( 'UTF8_REPLACEMENT', '!' );
  76
  77 define( 'UTF8_OVERLONG_A', "\xc1\xbf" ); # upper bound checked for sequences with head byte 0xc0 or 0xc1
  78 define( 'UTF8_OVERLONG_B', "\xe0\x9f\xbf" ); # upper bound checked for sequences with head byte 0xe0
  79 define( 'UTF8_OVERLONG_C', "\xf0\x8f\xbf\xbf" ); # upper bound checked for sequences with head byte 0xf0
  80
  81 # U+FDD0..U+FDEF and U+FFFE/U+FFFF are rejected by quickIsNFCVerify()
  82 define( 'UTF8_FDD0', codepointToUtf8( 0xfdd0 ) );
  83 define( 'UTF8_FDEF', codepointToUtf8( 0xfdef ) );
  84 define( 'UTF8_FFFE', codepointToUtf8( 0xfffe ) );
  85 define( 'UTF8_FFFF', codepointToUtf8( 0xffff ) );
  86
  87 define( 'UTF8_HEAD', false ); # quickIsNFCVerify() parser state: expecting a head (or single) byte
  88 define( 'UTF8_TAIL', true ); # quickIsNFCVerify() parser state: expecting a continuation byte
  89
  90
  91 class UtfNormal {
  92         # The ultimate convenience function! Clean up invalid UTF-8 sequences,
  93         # and convert to normal form C. Faster on pure ASCII strings, or
  94         # secondarily on strings which are already definitely normalized.
  95         function cleanUp( $string ) {
  96                 if( UtfNormal::quickIsNFCVerify( $string ) )
  97                         return $string;
  98                 else
  99                         return UtfNormal::NFC( $string );
 100         }
 101
 102         # These functions try to skip the conversion if it won't be necessary.
 103         # An all ASCII string for instance doesn't need conversion.
 104         function toNFC( $string ) {
 105                 if( UtfNormal::quickIsNFC( $string ) )
 106                         return $string;
 107                 else
 108                         return UtfNormal::NFC( $string );
 109         }
 110
 111         function toNFD( $string ) {
 112                 if( preg_match( '/[\x80-\xff]/', $string ) )
 113                         return UtfNormal::NFD( $string );
 114                 else
 115                         return $string;
 116         }
 117
 118         function toNFKC( $string ) {
 119                 if( preg_match( '/[\x80-\xff]/', $string ) )
 120                         return UtfNormal::NFKC( $string );
 121                 else
 122                         return $string;
 123         }
 124
 125         function toNFKD( $string ) {
 126                 if( preg_match( '/[\x80-\xff]/', $string ) )
 127                         return UtfNormal::NFKD( $string );
 128                 else
 129                         return $string;
 130         }
 131
 132         # Returns true if the string is _definitely_ in NFC.
 133         # Returns false if not or uncertain.
 134         function quickIsNFC( $string ) {
 135                 # ASCII is always valid NFC!
 136                 # If it's pure ASCII and doesn't contain any XML-forbidden chars, let it through.
 137                 if( !preg_match( '/[\x00-\x08\x0b\x0c\x0f-\x1f\x80-\xff]/', $string ) ) return true;
 138
 139                 global $utfCheckNFC, $utfCombiningClass;
 140                 $len = strlen( $string );
 141                 for( $i = 0; $i < $len; $i++ ) {
 142                         $c = $string{$i};
 143                         $n = ord( $c );
 144                         if( $n < 0x80 ) {
 145                                 continue;
 146                         } elseif( $n >= 0xf0 ) {
 147                                 $c = substr( $string, $i, 4 );
 148                                 $i += 3;
 149                         } elseif( $n >= 0xe0 ) {
 150                                 $c = substr( $string, $i, 3 );
 151                                 $i += 2;
 152                         } elseif( $n >= 0xc0 ) {
 153                                 $c = substr( $string, $i, 2 );
 154                                 $i++;
 155                         }
 156                         if( isset( $utfCheckNFC[$c] ) ) {
 157                                 # If it's NO or MAYBE, bail and do the slow check.
 158                                 return false;
 159                         }
 160                         if( isset( $utfCombiningClass[$c] ) ) {
 161                                 # Combining character? We might have to do sorting, at least.
 162                                 return false;
 163                         }
 164                 }
 165                 return true;
 166         }
 167
 168         # As above, but also *alter the string* to strip invalid UTF-8 sequences.
 169         function quickIsNFCVerify( &$string ) {
 170                 # ASCII is always valid NFC!
 171                 if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
 172
 173                 global $utfCheckNFC, $utfCombiningClass;
 174                 $len = strlen( $string );
 175                 $out = '';
 176                 $state = UTF8_HEAD;
 177                 $looksNormal = true;
 178
 179                 $rep = false;
 180                 $head = 0;
 181                 for( $i = 0; $i < $len; $i++ ) {
 182                         $c = $string{$i};
 183                         $n = ord( $c );
 184                         if( $state == UTF8_TAIL ) {
 185                                 if( $n >= 0x80 && $n < 0xc0 ) {
 186                                         $sequence .= $c;
 187                                         if( --$remaining == 0 ) {
 188                                                 if( ($sequence >= UTF8_SURROGATE_FIRST
 189                                                                 && $sequence <= UTF8_SURROGATE_LAST)
 190                                                         || ($head == 0xc0 && $sequence <= UTF8_OVERLONG_A)
 191                                                         || ($head == 0xc1 && $sequence <= UTF8_OVERLONG_A)
 192                                                         || ($head == 0xe0 && $sequence <= UTF8_OVERLONG_B)
 193                                                         || ($head == 0xf0 && $sequence <= UTF8_OVERLONG_C)
 194                                                         || ($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF)
 195                                                         || ($sequence == UTF8_FFFE)
 196                                                         || ($sequence == UTF8_FFFF)
 197                                                         || ($sequence > UTF8_MAX) ) {
 198                                                         $out .= UTF8_REPLACEMENT;
 199                                                         $state = UTF8_HEAD;
 200                                                         continue;
 201                                                 }
 202                                                 if( isset( $utfCheckNFC[$sequence] ) ||
 203                                                         isset( $utfCombiningClass[$sequence] ) ) {
 204                                                         # If it's NO or MAYBE, we'll have to do the slow check.
 205                                                         $looksNormal = false;
 206                                                 }
 207                                                 $out .= $sequence;
 208                                                 $state = UTF8_HEAD;
 209                                                 $head = 0;
 210                                         }
 211                                         continue;
 212                                 }
 213                                 # Not a valid tail byte! DIscard the char we've been building.
 214                                 #printf ("Invalid '%x' in tail with %d remaining bytes\n", $n, $remaining );
 215                                 $state = UTF8_HEAD;
 216                                 $out .= UTF8_REPLACEMENT;
 217                         }
 218                         if( $n < 0x09 ) {
 219                                 $out .= UTF8_REPLACEMENT;
 220                         } elseif( $n == 0x0a ) {
 221                                 $out .= $c;
 222                         } elseif( $n < 0x0d ) {
 223                                 $out .= UTF8_REPLACEMENT;
 224                         } elseif( $n == 0x0d ) {
 225                                 # Strip \r silently
 226                         } elseif( $n < 0x20 ) {
 227                                 $out .= UTF8_REPLACEMENT;
 228                         } elseif( $n < 0x80 ) {
 229                                 $out .= $c;
 230                         } elseif( $n < 0xc0 ) {
 231                                 # illegal tail bytes or head byte of overlong sequence
 232                                 if( $head == 0 ) $out .= UTF8_REPLACEMENT;
 233                         } elseif( $n < 0xe0 ) {
 234                                 $state = UTF8_TAIL;
 235                                 $remaining = 1;
 236                                 $sequence = $c;
 237                                 $head = $n;
 238                         } elseif( $n < 0xf0 ) {
 239                                 $state = UTF8_TAIL;
 240                                 $remaining = 2;
 241                                 $sequence = $c;
 242                                 $head = $n;
 243                         } elseif( $n < 0xf8 ) {
 244                                 $state = UTF8_TAIL;
 245                                 $remaining = 3;
 246                                 $sequence = $c;
 247                                 $head = $n;
 248                         } elseif( $n < 0xfc ) {
 249                                 $state = UTF8_TAIL;
 250                                 $remaining = 4;
 251                                 $sequence = $c;
 252                                 $head = $n;
 253                         } elseif( $n < 0xfe ) {
 254                                 $state = UTF8_TAIL;
 255                                 $remaining = 5;
 256                                 $sequence = $c;
 257                                 $head = $n;
 258                         } else {
 259                                 $out .= UTF8_REPLACEMENT;
 260                         }
 261                 }
 262                 if( $state == UTF8_TAIL ) {
 263                         $out .= UTF8_REPLACEMENT;
 264                 }
 265                 $string = $out;
 266                 return $looksNormal;
 267         }
 268
 269         # These take a string and run the normalization on them, without
 270         # checking for validity or any optimization etc. Input must be
 271         # VALID UTF-8!
 272         function NFC( $string ) {
 273                 return $out = UtfNormal::fastCompose( UtfNormal::NFD( $string ) );
 274         }
 275
 276         function NFD( $string ) {
 277                 global $utfCanonicalDecomp;
 278                 return UtfNormal::fastCombiningSort(
 279                         UtfNormal::fastDecompose( $string, $utfCanonicalDecomp ) );
 280         }
 281
 282         function NFKC( $string ) {
 283                 return UtfNormal::fastCompose( UtfNormal::NFKD( $string ) );
 284         }
 285
 286         function NFKD( $string ) {
 287                 global $utfCompatibilityDecomp;
 288                 if( !isset( $utfCompatibilityDecomp ) ) {
 289                         require_once( 'UtfNormalDataK.inc' );
 290                 }
 291                 return UtfNormal::fastCombiningSort(
 292                         UtfNormal::fastDecompose( $string, $utfCompatibilityDecomp ) );
 293         }
 294
 295
 296         /* Private */
 297         # Perform decomposition of a UTF-8 string into either D or KD form
 298         # (depending on which decomposition map is passed to us).
 299         # Input is assumed to be *valid* UTF-8. Invalid code will break.
 300         function fastDecompose( &$string, &$map ) {
 301                 $len = strlen( $string );
 302                 $out = '';
 303                 for( $i = 0; $i < $len; $i++ ) {
 304                         $c = $string{$i};
 305                         $n = ord( $c );
 306                         if( $n < 0x80 ) {
 307                                 # ASCII chars never decompose
 308                                 # THEY ARE IMMORTAL
 309                                 $out .= $c;
 310                                 continue;
 311                         } elseif( $n >= 0xf0 ) {
 312                                 $c = substr( $string, $i, 4 );
 313                                 $i += 3;
 314                         } elseif( $n >= 0xe0 ) {
 315                                 $c = substr( $string, $i, 3 );
 316                                 $i += 2;
 317                         } elseif( $n >= 0xc0 ) {
 318                                 $c = substr( $string, $i, 2 );
 319                                 $i++;
 320                         }
 321                         if( isset( $map[$c] ) ) {
 322                                 $out .= $map[$c];
 323                         } else {
 324                                 if( $c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST ) {
 325                                         $out .= UtfNormal::decomposeHangul( $c );
 326                                 } else {
 327                                         $out .= $c;
 328                                 }
 329                         }
 330                 }
 331                 return $out;
 332         }
 333
 334         function decomposeHangul( $c ) {
 335                 $codepoint = utf8ToCodepoint( $c );
 336                 $index = $codepoint - UNICODE_HANGUL_FIRST;
 337                 $l = IntVal( $index / UNICODE_HANGUL_NCOUNT );
 338                 $v = IntVal( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
 339                 $t = $index % UNICODE_HANGUL_TCOUNT;
 340                 $out = codepointToUtf8( $l + UNICODE_HANGUL_LBASE );
 341                 $out .= codepointToUtf8( $v + UNICODE_HANGUL_VBASE );
 342                 if( $t ) $out .= codepointToUtf8( $t + UNICODE_HANGUL_TBASE );
 343                 return $out;
 344         }
 345
 346         # Sorts combining characters into canonical order. This is the
 347         # final step in creating decomposed normal forms D and KD.
 348         function fastCombiningSort( $string ) {
 349                 global $utfCombiningClass;
 350                 $replacedCount = 1;
 351                 while( $replacedCount > 0 ) {
 352                         $replacedCount = 0;
 353                         $len = strlen( $string );
 354                         $out = '';
 355                         $lastClass = -1;
 356                         $lastChar = '';
 357                         for( $i = 0; $i < $len; $i++ ) {
 358                                 $c = $string{$i};
 359                                 $n = ord( $c );
 360                                 if( $n >= 0xf0 ) {
 361                                         $c = substr( $string, $i, 4 );
 362                                         $i += 3;
 363                                 } elseif( $n >= 0xe0 ) {
 364                                         $c = substr( $string, $i, 3 );
 365                                         $i += 2;
 366                                 } elseif( $n >= 0xc0 ) {
 367                                         $c = substr( $string, $i, 2 );
 368                                         $i++;
 369                                 }
 370                                 $class = isset( $utfCombiningClass[$c] ) ? $utfCombiningClass[$c] : 0;
 371                                 if( $lastClass == -1 ) {
 372                                         # First one
 373                                         $lastChar = $c;
 374                                         $lastClass = $class;
 375                                 } elseif( $lastClass > $class && $class > 0 ) {
 376                                         # Swap -- put this one on the stack
 377                                         $out .= $c;
 378                                         $replacedCount++;
 379                                 } else {
 380                                         $out .= $lastChar;
 381                                         $lastChar = $c;
 382                                         $lastClass = $class;
 383                                 }
 384                         }
 385                         $out .= $lastChar;
 386                         $string = $out;
 387                 }
 388                 return $string;
 389         }
 390
 391         # Produces canonically composed sequences, i.e. normal form C or KC.
 392         # Input must be valid UTF-8 in sorted normal form D or KD.
 393         function fastCompose( $string ) {
 394                 global $utfCanonicalComp, $utfCombiningClass;
 395                 $len = strlen( $string );
 396                 $out = '';
 397                 $lastClass = -1;
 398                 $startChar = '';
 399                 $combining = '';
 400                 for( $i = 0; $i < $len; $i++ ) {
 401                         $c = $string{$i};
 402                         $n = ord( $c );
 403                         if( $n >= 0xf0 ) {
 404                                 $c = substr( $string, $i, 4 );
 405                                 $i += 3;
 406                         } elseif( $n >= 0xe0 ) {
 407                                 $c = substr( $string, $i, 3 );
 408                                 $i += 2;
 409                         } elseif( $n >= 0xc0 ) {
 410                                 $c = substr( $string, $i, 2 );
 411                                 $i++;
 412                         }
 413                         $class = isset( $utfCombiningClass[$c] ) ? $utfCombiningClass[$c] : 0;
 414                         $pair = $startChar . $c;
 415                         if( empty( $utfCombiningClass[$c] ) ) {
 416                                 # New start char
 417                                 if( $lastClass == 0 && isset( $utfCanonicalComp[$pair] ) ) {
 418                                         $startChar = $utfCanonicalComp[$pair];
 419                                 } elseif( $lastClass == 0 &&
 420                                           $c >= UTF8_HANGUL_VBASE &&
 421                                           $c <= UTF8_HANGUL_VEND &&
 422                                           $startChar >= UTF8_HANGUL_LBASE &&
 423                                           $startChar <= UTF8_HANGUL_LEND ) {
 424                                         $lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
 425                                         $vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
 426                                         $hangulPoint = UNICODE_HANGUL_FIRST +
 427                                                 UNICODE_HANGUL_TCOUNT *
 428                                                 (UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);
 429                                         $startChar = codepointToUtf8( $hangulPoint );
 430                                 } elseif( $lastClass == 0 &&
 431                                           $c >= UTF8_HANGUL_TBASE &&
 432                                           $c <= UTF8_HANGUL_TEND &&
 433                                           $startChar >= UTF8_HANGUL_FIRST &&
 434                                           $startChar <= UTF8_HANGUL_LAST ) {
 435                                         $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
 436                                         $hangulPoint = utf8ToCodepoint( $startChar ) + $tIndex;
 437                                         $startChar = codepointToUtf8( $hangulPoint );
 438                                 } else {
 439                                         $out .= $startChar;
 440                                         $out .= $combining;
 441                                         $startChar = $c;
 442                                         $combining = '';
 443                                 }
 444                         } else {
 445                                 # A combining char; see what we can do with it
 446                                 if( !empty( $startChar ) &&
 447                                         $lastClass < $class &&
 448                                         $class > 0 &&
 449                                         isset( $utfCanonicalComp[$pair] ) ) {
 450                                         $startChar = $utfCanonicalComp[$pair];
 451                                         $class = 0;
 452                                 } else {
 453                                         $combining .= $c;
 454                                 }
 455                         }
 456                         $lastClass = $class;
 457                 }
 458                 $out .= $startChar . $combining;
 459                 return $out;
 460         }
 461
 462         # This is just used for the benchmark, comparing how long it takes to
 463         # interate through a string without really doing anything of substance.
 464         function placebo( $string ) {
 465                 $len = strlen( $string );
 466                 $out = '';
 467                 for( $i = 0; $i < $len; $i++ ) {
 468                         $out .= $string{$i};
 469                 }
 470                 return $out;
 471         }
 472 }
 473
 474 ?>