includes/normal/UtfNormal.php

   1 <?php
   2 # Copyright (C) 2004 Brion Vibber <brion@pobox.com>
   3 # http://www.mediawiki.org/
   4 #
   5 # This program is free software; you can redistribute it and/or modify
   6 # it under the terms of the GNU General Public License as published by
   7 # the Free Software Foundation; either version 2 of the License, or
   8 # (at your option) any later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License along
  16 # with this program; if not, write to the Free Software Foundation, Inc.,
  17 # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  18 # http://www.gnu.org/copyleft/gpl.html
  19
  20 /**
  21  * Unicode normalization routines for working with UTF-8 strings.
  22  * Currently assumes that input strings are valid UTF-8!
  23  *
  24  * Not as fast as I'd like, but should be usable for most purposes.
  25  * UtfNormal::toNFC() will bail early if given ASCII text or text
 * it can quickly determine is already normalized.
  27  *
  28  * All functions can be called static.
  29  *
  30  * See description of forms at http://www.unicode.org/reports/tr15/
  31  *
  32  * @package MediaWiki
  33  */
  34
  35 /** */
  36 require_once 'UtfNormalUtil.php';
  37
  38 global $utfCombiningClass, $utfCanonicalComp, $utfCanonicalDecomp;
  39 $utfCombiningClass = NULL;
  40 $utfCanonicalComp = NULL;
  41 $utfCanonicalDecomp = NULL;
  42
  43 # Load compatibility decompositions on demand if they are needed.
  44 global $utfCompatibilityDecomp;
  45 $utfCompatibilityDecomp = NULL;
  46
# Code point range of the precomposed Hangul syllables.
define( 'UNICODE_HANGUL_FIRST', 0xac00 );
define( 'UNICODE_HANGUL_LAST',  0xd7a3 );

# Base code points of the three jamo groups used by the Hangul arithmetic in
# UtfNormal::decomposeHangul() and UtfNormal::fastCompose():
# L = leading consonant, V = vowel, T = trailing consonant.
# A T index of 0 means "no trailing jamo" (see decomposeHangul()), so the
# first jamo that is actually emitted for T is TBASE + 1.
define( 'UNICODE_HANGUL_LBASE', 0x1100 );
define( 'UNICODE_HANGUL_VBASE', 0x1161 );
define( 'UNICODE_HANGUL_TBASE', 0x11a7 );

# Number of entries in each jamo group; NCOUNT is the number of syllables
# that share one leading consonant (VCOUNT * TCOUNT).
define( 'UNICODE_HANGUL_LCOUNT', 19 );
define( 'UNICODE_HANGUL_VCOUNT', 21 );
define( 'UNICODE_HANGUL_TCOUNT', 28 );
define( 'UNICODE_HANGUL_NCOUNT', UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT );

# Last code point of each jamo group (inclusive).
define( 'UNICODE_HANGUL_LEND', UNICODE_HANGUL_LBASE + UNICODE_HANGUL_LCOUNT - 1 );
define( 'UNICODE_HANGUL_VEND', UNICODE_HANGUL_VBASE + UNICODE_HANGUL_VCOUNT - 1 );
define( 'UNICODE_HANGUL_TEND', UNICODE_HANGUL_TBASE + UNICODE_HANGUL_TCOUNT - 1 );

# Surrogate range, highest code point, and the replacement character that
# UtfNormal::quickIsNFCVerify() substitutes for rejected input.
define( 'UNICODE_SURROGATE_FIRST', 0xd800 );
define( 'UNICODE_SURROGATE_LAST', 0xdfff );
define( 'UNICODE_MAX', 0x10ffff );
define( 'UNICODE_REPLACEMENT', 0xfffd );


# UTF-8 encoded forms of the code points above. The normalization routines
# compare raw UTF-8 byte strings against these instead of decoding every
# character to a code point first.
define( 'UTF8_HANGUL_FIRST', codepointToUtf8( UNICODE_HANGUL_FIRST ) );
define( 'UTF8_HANGUL_LAST', codepointToUtf8( UNICODE_HANGUL_LAST ) );

define( 'UTF8_HANGUL_LBASE', codepointToUtf8( UNICODE_HANGUL_LBASE ) );
define( 'UTF8_HANGUL_VBASE', codepointToUtf8( UNICODE_HANGUL_VBASE ) );
define( 'UTF8_HANGUL_TBASE', codepointToUtf8( UNICODE_HANGUL_TBASE ) );

define( 'UTF8_HANGUL_LEND', codepointToUtf8( UNICODE_HANGUL_LEND ) );
define( 'UTF8_HANGUL_VEND', codepointToUtf8( UNICODE_HANGUL_VEND ) );
define( 'UTF8_HANGUL_TEND', codepointToUtf8( UNICODE_HANGUL_TEND ) );

define( 'UTF8_SURROGATE_FIRST', codepointToUtf8( UNICODE_SURROGATE_FIRST ) );
define( 'UTF8_SURROGATE_LAST', codepointToUtf8( UNICODE_SURROGATE_LAST ) );
define( 'UTF8_MAX', codepointToUtf8( UNICODE_MAX ) );
define( 'UTF8_REPLACEMENT', codepointToUtf8( UNICODE_REPLACEMENT ) );
# Alternative, more visible replacement; left disabled.
#define( 'UTF8_REPLACEMENT', '!' );

# Upper bounds used by UtfNormal::quickIsNFCVerify() to reject overlong
# encodings: a sequence whose head byte is 0xc0/0xc1, 0xe0 or 0xf0 and that
# compares <= the matching constant is replaced.
define( 'UTF8_OVERLONG_A', "\xc1\xbf" );
define( 'UTF8_OVERLONG_B', "\xe0\x9f\xbf" );
define( 'UTF8_OVERLONG_C', "\xf0\x8f\xbf\xbf" );

# Code points rejected by UtfNormal::quickIsNFCVerify(): everything in the
# range U+FDD0..U+FDEF, plus U+FFFE and U+FFFF.
define( 'UTF8_FDD0', codepointToUtf8( 0xfdd0 ) );
define( 'UTF8_FDEF', codepointToUtf8( 0xfdef ) );
define( 'UTF8_FFFE', codepointToUtf8( 0xfffe ) );
define( 'UTF8_FFFF', codepointToUtf8( 0xffff ) );

# Parser states of UtfNormal::quickIsNFCVerify(): expecting a head byte, or
# in the middle of a multi-byte sequence and expecting tail bytes.
define( 'UTF8_HEAD', false );
define( 'UTF8_TAIL', true );
  98
  99
/**
 * For using the ICU wrapper.
 *
 * These mode values are passed as the second argument of utf8_normalize()
 * by the UtfNormal::toNF*() methods.
 */
define( 'UNORM_NONE', 1 );
define( 'UNORM_NFD',  2 );
define( 'UNORM_NFKD', 3 );
define( 'UNORM_NFC',  4 );
define( 'UNORM_DEFAULT', UNORM_NFC );
define( 'UNORM_NFKC', 5 );
define( 'UNORM_FCD',  6 );

# True when a utf8_normalize() function is available; the toNF*() methods
# then delegate to it instead of running the pure-PHP implementation.
define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
 112
 113 /**
 114  *
 115  * @package MediaWiki
 116  */
 117 class UtfNormal {
 118         /**
 119          * The ultimate convenience function! Clean up invalid UTF-8 sequences,
 120          * and convert to normal form C, canonical composition.
 121          *
 122          * Fast return for pure ASCII strings; some lesser optimizations for
 123          * strings containing only known-good characters. Not as fast as toNFC().
 124          *
 125          * @param string $string a UTF-8 string
 126          * @return string a clean, shiny, normalized UTF-8 string
 127          */
 128         function cleanUp( $string ) {
 129                 if( UtfNormal::quickIsNFCVerify( $string ) )
 130                         return $string;
 131                 else
 132                         return UtfNormal::NFC( $string );
 133         }
 134
 135         /**
 136          * Convert a UTF-8 string to normal form C, canonical composition.
 137          * Fast return for pure ASCII strings; some lesser optimizations for
 138          * strings containing only known-good characters.
 139          *
 140          * @param string $string a valid UTF-8 string. Input is not validated.
 141          * @return string a UTF-8 string in normal form C
 142          */
 143         function toNFC( $string ) {
 144                 if( NORMALIZE_ICU )
 145                         return utf8_normalize( $string, UNORM_NFC );
 146                 elseif( UtfNormal::quickIsNFC( $string ) )
 147                         return $string;
 148                 else
 149                         return UtfNormal::NFC( $string );
 150         }
 151
 152         /**
 153          * Convert a UTF-8 string to normal form D, canonical decomposition.
 154          * Fast return for pure ASCII strings.
 155          *
 156          * @param string $string a valid UTF-8 string. Input is not validated.
 157          * @return string a UTF-8 string in normal form D
 158          */
 159         function toNFD( $string ) {
 160                 if( NORMALIZE_ICU )
 161                         return utf8_normalize( $string, UNORM_NFD );
 162                 elseif( preg_match( '/[\x80-\xff]/', $string ) )
 163                         return UtfNormal::NFD( $string );
 164                 else
 165                         return $string;
 166         }
 167
 168         /**
 169          * Convert a UTF-8 string to normal form KC, compatibility composition.
 170          * This may cause irreversible information loss, use judiciously.
 171          * Fast return for pure ASCII strings.
 172          *
 173          * @param string $string a valid UTF-8 string. Input is not validated.
 174          * @return string a UTF-8 string in normal form KC
 175          */
 176         function toNFKC( $string ) {
 177                 if( NORMALIZE_ICU )
 178                         return utf8_normalize( $string, UNORM_NFKC );
 179                 elseif( preg_match( '/[\x80-\xff]/', $string ) )
 180                         return UtfNormal::NFKC( $string );
 181                 else
 182                         return $string;
 183         }
 184
 185         /**
 186          * Convert a UTF-8 string to normal form KD, compatibility decomposition.
 187          * This may cause irreversible information loss, use judiciously.
 188          * Fast return for pure ASCII strings.
 189          *
 190          * @param string $string a valid UTF-8 string. Input is not validated.
 191          * @return string a UTF-8 string in normal form KD
 192          */
 193         function toNFKD( $string ) {
 194                 if( NORMALIZE_ICU )
 195                         return utf8_normalize( $string, UNORM_NFKD );
 196                 elseif( preg_match( '/[\x80-\xff]/', $string ) )
 197                         return UtfNormal::NFKD( $string );
 198                 else
 199                         return $string;
 200         }
 201
 202         /**
 203          * Load the basic composition data if necessary
 204          * @access private
 205          */
 206         function loadData() {
 207                 global $utfCombiningClass, $utfCanonicalComp, $utfCanonicalDecomp;
 208                 if( !isset( $utfCombiningClass ) ) {
 209                         require_once( 'UtfNormalData.inc' );
 210                 }
 211         }
 212
 213         /**
 214          * Returns true if the string is _definitely_ in NFC.
 215          * Returns false if not or uncertain.
 216          * @param string $string a valid UTF-8 string. Input is not validated.
 217          * @return bool
 218          */
 219         function quickIsNFC( $string ) {
 220                 # ASCII is always valid NFC!
 221                 # If it's pure ASCII, let it through.
 222                 if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
 223
 224                 UtfNormal::loadData();
 225                 global $utfCheckNFC, $utfCombiningClass;
 226                 $len = strlen( $string );
 227                 for( $i = 0; $i < $len; $i++ ) {
 228                         $c = $string{$i};
 229                         $n = ord( $c );
 230                         if( $n < 0x80 ) {
 231                                 continue;
 232                         } elseif( $n >= 0xf0 ) {
 233                                 $c = substr( $string, $i, 4 );
 234                                 $i += 3;
 235                         } elseif( $n >= 0xe0 ) {
 236                                 $c = substr( $string, $i, 3 );
 237                                 $i += 2;
 238                         } elseif( $n >= 0xc0 ) {
 239                                 $c = substr( $string, $i, 2 );
 240                                 $i++;
 241                         }
 242                         if( isset( $utfCheckNFC[$c] ) ) {
 243                                 # If it's NO or MAYBE, bail and do the slow check.
 244                                 return false;
 245                         }
 246                         if( isset( $utfCombiningClass[$c] ) ) {
 247                                 # Combining character? We might have to do sorting, at least.
 248                                 return false;
 249                         }
 250                 }
 251                 return true;
 252         }
 253
 254         /**
 255          * Returns true if the string is _definitely_ in NFC.
 256          * Returns false if not or uncertain.
 257          * @param string $string a UTF-8 string, altered on output to be valid UTF-8 safe for XML.
 258          * @return bool
 259          */
 260         function quickIsNFCVerify( &$string ) {
 261                 # ASCII is always valid NFC!
 262                 if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
 263
 264                 UtfNormal::loadData();
 265                 global $utfCheckNFC, $utfCombiningClass;
 266                 $len = strlen( $string );
 267                 $out = '';
 268                 $state = UTF8_HEAD;
 269                 $looksNormal = true;
 270
 271                 $rep = false;
 272                 $head = 0;
 273                 for( $i = 0; $i < $len; $i++ ) {
 274                         $c = $string{$i};
 275                         $n = ord( $c );
 276                         if( $state == UTF8_TAIL ) {
 277                                 if( $n >= 0x80 && $n < 0xc0 ) {
 278                                         $sequence .= $c;
 279                                         if( --$remaining == 0 ) {
 280                                                 if( ($sequence >= UTF8_SURROGATE_FIRST
 281                                                                 && $sequence <= UTF8_SURROGATE_LAST)
 282                                                         || ($head == 0xc0 && $sequence <= UTF8_OVERLONG_A)
 283                                                         || ($head == 0xc1 && $sequence <= UTF8_OVERLONG_A)
 284                                                         || ($head == 0xe0 && $sequence <= UTF8_OVERLONG_B)
 285                                                         || ($head == 0xf0 && $sequence <= UTF8_OVERLONG_C)
 286                                                         || ($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF)
 287                                                         || ($sequence == UTF8_FFFE)
 288                                                         || ($sequence == UTF8_FFFF)
 289                                                         || ($sequence > UTF8_MAX) ) {
 290                                                         $out .= UTF8_REPLACEMENT;
 291                                                         $state = UTF8_HEAD;
 292                                                         continue;
 293                                                 }
 294                                                 if( isset( $utfCheckNFC[$sequence] ) ||
 295                                                         isset( $utfCombiningClass[$sequence] ) ) {
 296                                                         # If it's NO or MAYBE, we'll have to do the slow check.
 297                                                         $looksNormal = false;
 298                                                 }
 299                                                 $out .= $sequence;
 300                                                 $state = UTF8_HEAD;
 301                                                 $head = 0;
 302                                         }
 303                                         continue;
 304                                 }
 305                                 # Not a valid tail byte! DIscard the char we've been building.
 306                                 #printf ("Invalid '%x' in tail with %d remaining bytes\n", $n, $remaining );
 307                                 $state = UTF8_HEAD;
 308                                 $out .= UTF8_REPLACEMENT;
 309                         }
 310                         if( $n < 0x09 ) {
 311                                 $out .= UTF8_REPLACEMENT;
 312                         } elseif( $n == 0x0a ) {
 313                                 $out .= $c;
 314                         } elseif( $n < 0x0d ) {
 315                                 $out .= UTF8_REPLACEMENT;
 316                         } elseif( $n == 0x0d ) {
 317                                 # Strip \r silently
 318                         } elseif( $n < 0x20 ) {
 319                                 $out .= UTF8_REPLACEMENT;
 320                         } elseif( $n < 0x80 ) {
 321                                 $out .= $c;
 322                         } elseif( $n < 0xc0 ) {
 323                                 # illegal tail bytes or head byte of overlong sequence
 324                                 if( $head == 0 ) $out .= UTF8_REPLACEMENT;
 325                         } elseif( $n < 0xe0 ) {
 326                                 $state = UTF8_TAIL;
 327                                 $remaining = 1;
 328                                 $sequence = $c;
 329                                 $head = $n;
 330                         } elseif( $n < 0xf0 ) {
 331                                 $state = UTF8_TAIL;
 332                                 $remaining = 2;
 333                                 $sequence = $c;
 334                                 $head = $n;
 335                         } elseif( $n < 0xf8 ) {
 336                                 $state = UTF8_TAIL;
 337                                 $remaining = 3;
 338                                 $sequence = $c;
 339                                 $head = $n;
 340                         } elseif( $n < 0xfc ) {
 341                                 $state = UTF8_TAIL;
 342                                 $remaining = 4;
 343                                 $sequence = $c;
 344                                 $head = $n;
 345                         } elseif( $n < 0xfe ) {
 346                                 $state = UTF8_TAIL;
 347                                 $remaining = 5;
 348                                 $sequence = $c;
 349                                 $head = $n;
 350                         } else {
 351                                 $out .= UTF8_REPLACEMENT;
 352                         }
 353                 }
 354                 if( $state == UTF8_TAIL ) {
 355                         $out .= UTF8_REPLACEMENT;
 356                 }
 357                 $string = $out;
 358                 return $looksNormal;
 359         }
 360
 361         # These take a string and run the normalization on them, without
 362         # checking for validity or any optimization etc. Input must be
 363         # VALID UTF-8!
 364         /**
 365          * @param string $string
 366          * @return string
 367          * @access private
 368          */
 369         function NFC( $string ) {
 370                 return $out = UtfNormal::fastCompose( UtfNormal::NFD( $string ) );
 371         }
 372
 373         /**
 374          * @param string $string
 375          * @return string
 376          * @access private
 377          */
 378         function NFD( $string ) {
 379                 UtfNormal::loadData();
 380                 global $utfCanonicalDecomp;
 381                 return UtfNormal::fastCombiningSort(
 382                         UtfNormal::fastDecompose( $string, $utfCanonicalDecomp ) );
 383         }
 384
 385         /**
 386          * @param string $string
 387          * @return string
 388          * @access private
 389          */
 390         function NFKC( $string ) {
 391                 return UtfNormal::fastCompose( UtfNormal::NFKD( $string ) );
 392         }
 393
 394         /**
 395          * @param string $string
 396          * @return string
 397          * @access private
 398          */
 399         function NFKD( $string ) {
 400                 global $utfCompatibilityDecomp;
 401                 if( !isset( $utfCompatibilityDecomp ) ) {
 402                         require_once( 'UtfNormalDataK.inc' );
 403                 }
 404                 return UtfNormal::fastCombiningSort(
 405                         UtfNormal::fastDecompose( $string, $utfCompatibilityDecomp ) );
 406         }
 407
 408
 409         /**
 410          * Perform decomposition of a UTF-8 string into either D or KD form
 411          * (depending on which decomposition map is passed to us).
 412          * Input is assumed to be *valid* UTF-8. Invalid code will break.
 413          * @access private
 414          * @param string &$string Valid UTF-8 string
 415          * @param array &$map hash of expanded decomposition map
 416          * @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
 417          */
 418         function fastDecompose( &$string, &$map ) {
 419                 UtfNormal::loadData();
 420                 $len = strlen( $string );
 421                 $out = '';
 422                 for( $i = 0; $i < $len; $i++ ) {
 423                         $c = $string{$i};
 424                         $n = ord( $c );
 425                         if( $n < 0x80 ) {
 426                                 # ASCII chars never decompose
 427                                 # THEY ARE IMMORTAL
 428                                 $out .= $c;
 429                                 continue;
 430                         } elseif( $n >= 0xf0 ) {
 431                                 $c = substr( $string, $i, 4 );
 432                                 $i += 3;
 433                         } elseif( $n >= 0xe0 ) {
 434                                 $c = substr( $string, $i, 3 );
 435                                 $i += 2;
 436                         } elseif( $n >= 0xc0 ) {
 437                                 $c = substr( $string, $i, 2 );
 438                                 $i++;
 439                         }
 440                         if( isset( $map[$c] ) ) {
 441                                 $out .= $map[$c];
 442                         } else {
 443                                 if( $c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST ) {
 444                                         $out .= UtfNormal::decomposeHangul( $c );
 445                                 } else {
 446                                         $out .= $c;
 447                                 }
 448                         }
 449                 }
 450                 return $out;
 451         }
 452
 453         /**
 454          * Decompose a Hangul syllable character into its constituent jamo.
 455          * @access private
 456          * @param int $c Unicode code point of the character
 457          * @return string a UTF-8 string containing a sequence of jamo
 458          */
 459         function decomposeHangul( $c ) {
 460                 $codepoint = utf8ToCodepoint( $c );
 461                 $index = $codepoint - UNICODE_HANGUL_FIRST;
 462                 $l = IntVal( $index / UNICODE_HANGUL_NCOUNT );
 463                 $v = IntVal( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
 464                 $t = $index % UNICODE_HANGUL_TCOUNT;
 465                 $out = codepointToUtf8( $l + UNICODE_HANGUL_LBASE );
 466                 $out .= codepointToUtf8( $v + UNICODE_HANGUL_VBASE );
 467                 if( $t ) $out .= codepointToUtf8( $t + UNICODE_HANGUL_TBASE );
 468                 return $out;
 469         }
 470
 471         /**
 472          * Sorts combining characters into canonical order. This is the
 473          * final step in creating decomposed normal forms D and KD.
 474          * @access private
 475          * @param string $string a valid, decomposed UTF-8 string. Input is not validated.
 476          * @return string a UTF-8 string with combining characters sorted in canonical order
 477          */
 478         function fastCombiningSort( $string ) {
 479                 UtfNormal::loadData();
 480                 global $utfCombiningClass;
 481                 $replacedCount = 1;
 482                 while( $replacedCount > 0 ) {
 483                         $replacedCount = 0;
 484                         $len = strlen( $string );
 485                         $out = '';
 486                         $lastClass = -1;
 487                         $lastChar = '';
 488                         for( $i = 0; $i < $len; $i++ ) {
 489                                 $c = $string{$i};
 490                                 $n = ord( $c );
 491                                 if( $n >= 0xf0 ) {
 492                                         $c = substr( $string, $i, 4 );
 493                                         $i += 3;
 494                                 } elseif( $n >= 0xe0 ) {
 495                                         $c = substr( $string, $i, 3 );
 496                                         $i += 2;
 497                                 } elseif( $n >= 0xc0 ) {
 498                                         $c = substr( $string, $i, 2 );
 499                                         $i++;
 500                                 }
 501                                 $class = isset( $utfCombiningClass[$c] ) ? $utfCombiningClass[$c] : 0;
 502                                 if( $lastClass == -1 ) {
 503                                         # First one
 504                                         $lastChar = $c;
 505                                         $lastClass = $class;
 506                                 } elseif( $lastClass > $class && $class > 0 ) {
 507                                         # Swap -- put this one on the stack
 508                                         $out .= $c;
 509                                         $replacedCount++;
 510                                 } else {
 511                                         $out .= $lastChar;
 512                                         $lastChar = $c;
 513                                         $lastClass = $class;
 514                                 }
 515                         }
 516                         $out .= $lastChar;
 517                         $string = $out;
 518                 }
 519                 return $string;
 520         }
 521
 522         /**
 523          * Produces canonically composed sequences, i.e. normal form C or KC.
 524          *
 525          * @access private
 526          * @param string $string a valid UTF-8 string in sorted normal form D or KD. Input is not validated.
 527          * @return string a UTF-8 string with canonical precomposed characters used where possible
 528          */
 529         function fastCompose( $string ) {
 530                 UtfNormal::loadData();
 531                 global $utfCanonicalComp, $utfCombiningClass;
 532                 $len = strlen( $string );
 533                 $out = '';
 534                 $lastClass = -1;
 535                 $startChar = '';
 536                 $combining = '';
 537                 for( $i = 0; $i < $len; $i++ ) {
 538                         $c = $string{$i};
 539                         $n = ord( $c );
 540                         if( $n >= 0xf0 ) {
 541                                 $c = substr( $string, $i, 4 );
 542                                 $i += 3;
 543                         } elseif( $n >= 0xe0 ) {
 544                                 $c = substr( $string, $i, 3 );
 545                                 $i += 2;
 546                         } elseif( $n >= 0xc0 ) {
 547                                 $c = substr( $string, $i, 2 );
 548                                 $i++;
 549                         }
 550                         $class = isset( $utfCombiningClass[$c] ) ? $utfCombiningClass[$c] : 0;
 551                         $pair = $startChar . $c;
 552                         if( empty( $utfCombiningClass[$c] ) ) {
 553                                 # New start char
 554                                 if( $lastClass == 0 && isset( $utfCanonicalComp[$pair] ) ) {
 555                                         $startChar = $utfCanonicalComp[$pair];
 556                                 } elseif( $lastClass == 0 &&
 557                                           $c >= UTF8_HANGUL_VBASE &&
 558                                           $c <= UTF8_HANGUL_VEND &&
 559                                           $startChar >= UTF8_HANGUL_LBASE &&
 560                                           $startChar <= UTF8_HANGUL_LEND ) {
 561                                         $lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
 562                                         $vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
 563                                         $hangulPoint = UNICODE_HANGUL_FIRST +
 564                                                 UNICODE_HANGUL_TCOUNT *
 565                                                 (UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);
 566                                         $startChar = codepointToUtf8( $hangulPoint );
 567                                 } elseif( $lastClass == 0 &&
 568                                           $c >= UTF8_HANGUL_TBASE &&
 569                                           $c <= UTF8_HANGUL_TEND &&
 570                                           $startChar >= UTF8_HANGUL_FIRST &&
 571                                           $startChar <= UTF8_HANGUL_LAST ) {
 572                                         $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
 573                                         $hangulPoint = utf8ToCodepoint( $startChar ) + $tIndex;
 574                                         $startChar = codepointToUtf8( $hangulPoint );
 575                                 } else {
 576                                         $out .= $startChar;
 577                                         $out .= $combining;
 578                                         $startChar = $c;
 579                                         $combining = '';
 580                                 }
 581                         } else {
 582                                 # A combining char; see what we can do with it
 583                                 if( !empty( $startChar ) &&
 584                                         $lastClass < $class &&
 585                                         $class > 0 &&
 586                                         isset( $utfCanonicalComp[$pair] ) ) {
 587                                         $startChar = $utfCanonicalComp[$pair];
 588                                         $class = 0;
 589                                 } else {
 590                                         $combining .= $c;
 591                                 }
 592                         }
 593                         $lastClass = $class;
 594                 }
 595                 $out .= $startChar . $combining;
 596                 return $out;
 597         }
 598
 599         /**
 600          * This is just used for the benchmark, comparing how long it takes to
 601          * interate through a string without really doing anything of substance.
 602          * @param string $string
 603          * @return string
 604          */
 605         function placebo( $string ) {
 606                 $len = strlen( $string );
 607                 $out = '';
 608                 for( $i = 0; $i < $len; $i++ ) {
 609                         $out .= $string{$i};
 610                 }
 611                 return $out;
 612         }
 613 }
 614
 615 ?>