includes/normal/UtfNormal.php

   1 <?php
   2 /**
   3 * Unicode normalization routines
   4 *
   5 * Copyright (C) 2006 Ludovic ARNAUD <ludovic.arnaud@gmail.com>
   6 *
   7 * This program is free software; you can redistribute it and/or modify
   8 * it under the terms of the GNU General Public License as published by
   9 * the Free Software Foundation; either version 2 of the License, or
  10 * (at your option) any later version.
  11 *
  12 * This program is distributed in the hope that it will be useful,
  13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 * GNU General Public License for more details.
  16 *
  17 * You should have received a copy of the GNU General Public License along
  18 * with this program; if not, write to the Free Software Foundation, Inc.,
  19 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  20 * http://www.gnu.org/copyleft/gpl.html
  21 *
  22 * @author       Ludovic ARNAUD <ludovic.arnaud@gmail.com>
  23 * @license      http://www.gnu.org/licenses/gpl.txt
  24 * @package      UtfNormal
  25 */
  26
  27 require_once ('UtfNormalDefines.php');
  28
  29 if( function_exists( 'utf8_normalize' ) ) {
  30
  31 ////////////////////////////////////////////////////////////////////////////////
  32 //              Wrapper for the utfnormal extension, ICU wrapper              //
  33 ////////////////////////////////////////////////////////////////////////////////
  34
  35 /**
  36 * UtfNormal class for the utfnormal extension
  37 *
  38 * @ignore
  39 */
  40 class UtfNormal {
  41         function cleanUp( $str ) {
  42                 /**
  43                 * The string below is the list of all autorized characters, sorted by
  44                 * frequency in latin text
  45                 */
  46                 $pos = strspn(
  47                         $str,
  48                         "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D"
  49                 );
  50
  51                 if( !isset( $str[$pos] ) ) {
  52                         /**
  53                         * ASCII strings with no special chars return immediately
  54                         */
  55                         return $str;
  56                 }
  57
  58                 /**
  59                 * Check if there is potentially a 0xFFFE or 0xFFFF char (UTF sequence
  60                 * 0xEFBFBE or 0xEFBFBF) and replace them
  61                 *
  62                 * Note: we start searching at position $pos
  63                 */
  64                 if( is_int( strpos( $str, "\xEF\xBF", $pos ) ) ) {
  65                         $str = str_replace(
  66                                 array( "\xEF\xBF\xBE", "\xEF\xBF\xBF" ),
  67                                 array( UTF8_REPLACEMENT, UTF8_REPLACEMENT ),
  68                                 $str
  69                         );
  70                 }
  71
  72                 /**
  73                 * Replace any byte in the range 0x00..0x1F, except for \r, \n and \t
  74                 *
  75                 * We replace those characters with a 0xFF byte, which is illegal in
  76                 * UTF-8 and will in turn be replaced with a Unicode replacement char
  77                 */
  78                 $str = strtr(
  79                         $str,
  80                         "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
  81                         "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
  82                 );
  83
  84                 /**
  85                 * As per the original implementation, "the UnicodeString constructor fails
  86                 * if the string ends with a head byte". Therefore, if the string ends with
  87                 * a leading byte we replace it with 0xFF, which is illegal too and will be
  88                 * replaced with a Unicode replacement character
  89                 */
  90                 if( substr( $str, -1 ) >= "\xC0" ) {
  91                         $str[strlen($str) - 1] = "\xFF";
  92                 }
  93
  94                 return utf8_normalize( $str, UNORM_NFC );
  95         }
  96
  97         function toNFC( $str ) {
  98                 return utf8_normalize( $str, UNORM_NFC );
  99         }
 100
 101         function toNFKC( $str ) {
 102                 return utf8_normalize( $str, UNORM_NFKC );
 103         }
 104
 105         function toNFD( $str ) {
 106                 return utf8_normalize( $str, UNORM_NFD );
 107         }
 108
 109         function toNFKD( $str ) {
 110                 return utf8_normalize( $str, UNORM_NFKD );
 111         }
 112 }
 113
 114 ////////////////////////////////////////////////////////////////////////////////
 115 //                           End of the ICU wrapper                           //
 116 ////////////////////////////////////////////////////////////////////////////////
 117
 118
 119 } else {
 120
 121
 122 ////////////////////////////////////////////////////////////////////////////////
  123 //     This block will NOT be loaded if the utfnormal extension is loaded     //
 124 ////////////////////////////////////////////////////////////////////////////////
 125
 126 /**
 127 * Unset global variables
 128 */
 129 unset( $GLOBALS['utfJamoIndex'], $GLOBALS['utfJamoType'], $GLOBALS['utfCheckNFC'], $GLOBALS['utfCombiningClass'], $GLOBALS['utfCanonicalComp'], $GLOBALS['utfCanonicalDecomp'], $GLOBALS['utfCheckNFKC'], $GLOBALS['utfCompatibilityDecomp'] );
 130
 131 /**
 132 * NFC_QC and NFKC_QC values
 133 */
 134 define( 'UNICODE_QC_MAYBE', 0 );
 135 define( 'UNICODE_QC_NO', 1 );
 136
 137 /**
 138 * Contains all the ASCII characters appearing in UTF-8, sorted by frequency
 139 */
 140 define( 'UTF8_ASCII_RANGE', "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F" );
 141
 142 /**
 143 * Contains all the tail bytes that can appear in the composition of a UTF-8 char
 144 */
 145 define( 'UTF8_TRAILING_BYTES', "\xA9\xA0\xA8\x80\xAA\x99\xA7\xBB\xAB\x89\x94\x82\xB4\xA2\xAE\x83\xB0\xB9\xB8\x93\xAF\xBC\xB3\x81\xA4\xB2\x9C\xA1\xB5\xBE\xBD\xBA\x98\xAD\xB1\x84\x95\xA6\xB6\x88\x8D\x90\xB7\xBF\x92\x85\xA5\x97\x8C\x86\xA3\x8E\x9F\x8F\x87\x91\x9D\xAC\x9E\x8B\x96\x9B\x8A\x9A" );
 146
 147 /**
 148 * Unicode normalization routines
 149 *
 150 * A copy of reports of bugs related to this class can be sent to the author directly
 151 *
 152 * @package UtfNormal
 153 */
 154 class UtfNormal {
 155         /**
 156         * Validate, cleanup and normalize a string
 157         *
 158         * The ultimate convenience function! Clean up invalid UTF-8 sequences,
 159         * and convert to Normal Form C, canonical composition.
 160         *
 161         * @param        string  $str    The dirty string
 162         * @return       string                  The same string, all shiny and cleaned-up
 163         */
 164         function cleanup( $str ) {
 165                 /**
 166                 * The string below is the list of all autorized characters, sorted by
 167                 * frequency in latin text
 168                 */
 169                 $pos = strspn( $str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D" );
 170                 $len = strlen( $str );
 171
 172                 if( $pos == $len ) {
 173                         /**
 174                         * ASCII strings with no special chars return immediately
 175                         */
 176                         return $str;
 177                 }
 178
 179                 /**
 180                 * Note: we do not check for $GLOBALS['utfCanonicalDecomp']. It is assumed
 181                 * they are always loaded together
 182                 */
 183                 if( !isset( $GLOBALS['utfCheckNFC'] ) ) {
 184                         include( 'UtfNormalData.inc' );
 185                 }
 186
 187                 /**
 188                 * Replace any byte in the range 0x00..0x1F, except for \r, \n and \t
 189                 *
 190                 * We replace those characters with a 0xFF byte, which is illegal in
 191                 * UTF-8 and will in turn be replaced with a UTF replacement char
 192                 */
 193                 return UtfNormal::recompose(
 194                         strtr(
 195                                 $str,
 196                                 "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
 197                                 "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
 198                         ),
 199                         $pos, $len, $GLOBALS['utfCheckNFC'], $GLOBALS['utfCanonicalDecomp']
 200                 );
 201         }
 202
 203         /**
 204         * Validate and normalize a UTF string to NFC
 205         *
 206         * @param        string  $str    Unchecked UTF string
 207         * @return       string                  The string, validated and in normal form
 208         */
 209         function toNFC( $str ) {
 210                 $pos = strspn( $str, UTF8_ASCII_RANGE );
 211                 $len = strlen( $str );
 212
 213                 if( $pos == $len ) {
 214                         /**
 215                         * ASCII strings return immediately
 216                         */
 217                         return $str;
 218                 }
 219
 220                 if( !isset( $GLOBALS['utfCheckNFC'] ) ) {
 221                         include( 'UtfNormalData.inc' );
 222                 }
 223
 224                 return UtfNormal::recompose( $str, $pos, $len, $GLOBALS['utfCheckNFC'], $GLOBALS['utfCanonicalDecomp'] );
 225         }
 226
 227         /**
 228         * Validate and normalize a UTF string to NFKC
 229         *
 230         * @param        string  $str    Unchecked UTF string
 231         * @return       string                  The string, validated and in normal form
 232         */
 233         function toNFKC( $str ) {
 234                 $pos = strspn( $str, UTF8_ASCII_RANGE );
 235                 $len = strlen( $str );
 236
 237                 if( $pos == $len ) {
 238                         /**
 239                         * ASCII strings return immediately
 240                         */
 241                         return $str;
 242                 }
 243
 244                 if( !isset( $GLOBALS['utfCheckNFKC'] ) ) {
 245                         include( 'UtfNormalDataK.inc' );
 246                 }
 247                 if( !isset( $GLOBALS['utfCanonicalComp'] ) ) {
 248                         include( 'UtfNormalData.inc' );
 249                 }
 250
 251                 return UtfNormal::recompose( $str, $pos, $len, $GLOBALS['utfCheckNFKC'], $GLOBALS['utfCompatibilityDecomp'] );
 252         }
 253
 254         /**
 255         * Validate and normalize a UTF string to NFD
 256         *
 257         * @param        string  $str    Unchecked UTF string
 258         * @return       string                  The string, validated and in normal form
 259         */
 260         function toNFD( $str ) {
 261                 $pos = strspn( $str, UTF8_ASCII_RANGE );
 262                 $len = strlen( $str );
 263
 264                 if( $pos == $len ) {
 265                         /**
 266                         * ASCII strings return immediately
 267                         */
 268                         return $str;
 269                 }
 270
 271                 if( !isset( $GLOBALS['utfCanonicalDecomp'] ) ) {
 272                         include( 'UtfNormalData.inc' );
 273                 }
 274
 275                 return UtfNormal::decompose( $str, $pos, $len, $GLOBALS['utfCanonicalDecomp'] );
 276         }
 277
 278         /**
 279         * Validate and normalize a UTF string to NFKD
 280         *
 281         * @param        string  $str    Unchecked UTF string
 282         * @return       string                  The string, validated and in normal form
 283         */
 284         function toNFKD( $str ) {
 285                 $pos = strspn( $str, UTF8_ASCII_RANGE );
 286                 $len = strlen( $str );
 287
 288                 if( $pos == $len ) {
 289                         /**
 290                         * ASCII strings return immediately
 291                         */
 292                         return $str;
 293                 }
 294
 295                 if( !isset( $GLOBALS['utfCompatibilityDecomp'] ) ) {
 296                         include( 'UtfNormalDataK.inc' );
 297                 }
 298
 299                 return UtfNormal::decompose( $str, $pos, $len, $GLOBALS['utfCompatibilityDecomp'] );
 300         }
 301
 302
 303         ////////////////////////////////////////////////////////////////////////////
 304         //                           Internal functions                           //
 305         ////////////////////////////////////////////////////////////////////////////
 306
 307         /**
 308         * Recompose a UTF string
 309         *
 310         * @param        string  $str            Unchecked UTF string
 311         * @param        integer $pos            Position of the first UTF char (in bytes)
 312         * @param        integer $len            Length of the string (in bytes)
 313         * @param        array   $qc                     Quick-check array, passed by reference but never modified
 314         * @param        array   $decomp_map     Decomposition mapping, passed by reference but never modified
 315         * @return       string                          The string, validated and recomposed
 316         *
 317         * @access       private
 318         */
 319         function recompose( $str, $pos, $len, &$qc, &$decomp_map ) {
 320                 global $utfCombiningClass, $utfCanonicalComp, $utfJamoType, $utfJamoIndex;
 321
 322                 /**
 323                 * Buffer the last ASCII char before the UTF-8 stuff if applicable
 324                 */
 325                 $tmp = '';
 326                 $i = $tmp_pos = $last_cc = 0;
 327
 328                 if( $pos ) {
 329                         $buffer = array(++$i => $str[$pos - 1] );
 330                 } else {
 331                         $buffer = array();
 332                 }
 333
 334                 /**
 335                 * UTF char length array
 336                 *
 337                 * This array is used to determine the length of a UTF character. Be $c the
 338                 * result of ($str[$pos] & "\xF0") --where $str is the string we're operating
 339                 * on and $pos the position of the cursor--, if $utf_len_mask[$c] does not
 340                 * exist, the byte is an ASCII char. Otherwise, if $utf_len_mask[$c] is greater
 341                 * than 0, we have a the leading byte of a multibyte character whose length is
 342                 * $utf_len_mask[$c] and if it is equal to 0, the byte is a trailing byte.
 343                 */
 344                 $utf_len_mask = array(
 345                         /**
 346                         * Leading bytes masks
 347                         */
 348                         "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
 349
 350                         /**
 351                         * Trailing bytes masks
 352                         */
 353                         "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
 354                 );
 355
 356                 $extra_check = array(
 357                         "\xED"=>1, "\xEF"=>1, "\xC0"=>1, "\xC1"=>1, "\xE0"=>1, "\xF0"=>1,
 358                         "\xF4"=>1, "\xF5"=>1, "\xF6"=>1, "\xF7"=>1, "\xF8"=>1, "\xF9"=>1,
 359                         "\xFA"=>1, "\xFB"=>1, "\xFC"=>1, "\xFD"=>1, "\xFE"=>1, "\xFF"=>1
 360                 );
 361
 362                 $utf_validation_mask = array(
 363                         2       =>      "\xE0\xC0",
 364                         3       =>      "\xF0\xC0\xC0",
 365                         4       =>      "\xF8\xC0\xC0\xC0"
 366                 );
 367
 368                 $utf_validation_check = array(
 369                         2       =>      "\xC0\x80",
 370                         3       =>      "\xE0\x80\x80",
 371                         4       =>      "\xF0\x80\x80\x80"
 372                 );
 373
 374                 ////////////////////////////////////////////////////////////////////////
 375                 //                             Main loop                              //
 376                 ////////////////////////////////////////////////////////////////////////
 377
 378                 do {
 379                         ////////////////////////////////////////////////////////////////////
 380                         //         STEP 0: Capture the current char and buffer it         //
 381                         ////////////////////////////////////////////////////////////////////
 382
 383                         $c = $str[$pos];
 384                         $c_mask = $c & "\xF0";
 385
 386                         if( isset( $utf_len_mask[$c_mask] ) ) {
 387                                 /**
 388                                 * Byte at $pos is either a leading byte or a missplaced trailing byte
 389                                 */
 390                                 if( $utf_len = $utf_len_mask[$c_mask] ) {
 391                                         /**
 392                                         * Capture the char
 393                                         */
 394                                         $buffer[++$i & 7] = $utf_char = substr( $str, $pos, $utf_len );
 395
 396                                         /**
 397                                         * Let's find out if a thorough check is needed
 398                                         */
 399                                         if( isset( $qc[$utf_char] ) ) {
 400                                                 /**
 401                                                 * If the UTF char is in the qc array then it may not be in normal
 402                                                 * form. We do nothing here, the actual processing is below this
 403                                                 * "if" block
 404                                                 */
 405                                         } elseif( isset( $utfCombiningClass[$utf_char] ) ) {
 406                                                 if( $utfCombiningClass[$utf_char] < $last_cc ) {
 407                                                         /**
 408                                                         * A combining character that is NOT canonically ordered
 409                                                         */
 410                                                 } else {
 411                                                         /**
 412                                                         * A combining character that IS canonically ordered, skip
 413                                                         * to the next char
 414                                                         */
 415                                                         $last_cc = $utfCombiningClass[$utf_char];
 416
 417                                                         $pos += $utf_len;
 418                                                         continue;
 419                                                 }
 420                                         } else {
 421                                                 /**
 422                                                 * At this point, $utf_char holds a UTF char that we know
 423                                                 * is not a NF[K]C_QC and is not a combining character. It can
 424                                                 * be a singleton, a canonical composite, a replacement char or
 425                                                 * an even an ill-formed bunch of bytes. Let's find out
 426                                                 */
 427                                                 $last_cc = 0;
 428
 429                                                 /**
 430                                                 * Check that we have the correct number of trailing bytes
 431                                                 */
 432                                                 if( ( $utf_char & $utf_validation_mask[$utf_len] ) != $utf_validation_check[$utf_len] ) {
 433                                                         /**
 434                                                         * Current char isn't well-formed or legal: either one or
 435                                                         * several trailing bytes are missing, or the Unicode char
 436                                                         * has been encoded in a five- or six- byte sequence
 437                                                         */
 438                                                         if( $utf_char[0] >= "\xF8" ) {
 439                                                                 if( $utf_char[0] < "\xF8" ) {
 440                                                                         $trailing_bytes = 3;
 441                                                                 } elseif( $utf_char[0] < "\xFC" ) {
 442                                                                         $trailing_bytes = 4;
 443                                                                 }
 444                                                                 if( $utf_char[0] > "\xFD" ) {
 445                                                                         $trailing_bytes = 0;
 446                                                                 } else {
 447                                                                         $trailing_bytes = 5;
 448                                                                 }
 449                                                         } else {
 450                                                                 $trailing_bytes = $utf_len - 1;
 451                                                         }
 452
 453                                                         $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
 454                                                         $pos += strspn( $str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes );
 455                                                         $tmp_pos = $pos;
 456
 457                                                         continue;
 458                                                 }
 459
 460                                                 if( isset( $extra_check[$c] ) ) {
 461                                                         switch( $c ) {
 462                                                                 /**
 463                                                                 * Note: 0xED is quite common in Korean
 464                                                                 */
 465                                                                 case "\xED":
 466                                                                         if( $utf_char >= "\xED\xA0\x80" ) {
 467                                                                                 /**
 468                                                                                 * Surrogates (0xD800..0xDFFF) are not allowed in UTF-8
 469                                                                                 * (UTF sequence 0xEDA080..0xEDBFBF)
 470                                                                                 */
 471                                                                                 $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
 472                                                                                 $pos += $utf_len;
 473                                                                                 $tmp_pos = $pos;
 474                                                                                 continue 2;
 475                                                                         }
 476                                                                         break;
 477
 478                                                                 /**
 479                                                                 * Note: 0xEF is quite common in Japanese
 480                                                                 */
 481                                                                 case "\xEF":
 482                                                                         if( $utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF" ) {
 483                                                                                 /**
 484                                                                                 * 0xFFFE and 0xFFFF are explicitly disallowed
 485                                                                                 * (UTF sequence 0xEFBFBE..0xEFBFBF)
 486                                                                                 */
 487                                                                                 $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
 488                                                                                 $pos += $utf_len;
 489                                                                                 $tmp_pos = $pos;
 490                                                                                 continue 2;
 491                                                                         }
 492                                                                         break;
 493
 494                                                                 case "\xC0":
 495                                                                 case "\xC1":
 496                                                                         if( $utf_char <= "\xC1\xBF" ) {
 497                                                                                 /**
 498                                                                                 * Overlong sequence: Unicode char 0x00..0x7F encoded as a
 499                                                                                 * double-byte UTF char
 500                                                                                 */
 501                                                                                 $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
 502                                                                                 $pos += $utf_len;
 503                                                                                 $tmp_pos = $pos;
 504                                                                                 continue 2;
 505                                                                         }
 506                                                                         break;
 507
 508                                                                 case "\xE0":
 509                                                                         if( $utf_char <= "\xE0\x9F\xBF" ) {
 510                                                                                 /**
 511                                                                                 * Unicode char 0x0000..0x07FF encoded in 3 bytes
 512                                                                                 */
 513                                                                                 $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
 514                                                                                 $pos += $utf_len;
 515                                                                                 $tmp_pos = $pos;
 516                                                                                 continue 2;
 517                                                                         }
 518                                                                         break;
 519
 520                                                                 case "\xF0":
 521                                                                         if( $utf_char <= "\xF0\x8F\xBF\xBF" ) {
 522                                                                                 /**
 523                                                                                 * Unicode char 0x0000..0xFFFF encoded in 4 bytes
 524                                                                                 */
 525                                                                                 $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
 526                                                                                 $pos += $utf_len;
 527                                                                                 $tmp_pos = $pos;
 528                                                                                 continue 2;
 529                                                                         }
 530                                                                         break;
 531
 532                                                                 default:
 533                                                                         /**
 534                                                                         * Five- and six- byte sequences do not need being checked for here anymore
 535                                                                         */
 536                                                                         if( $utf_char > UTF8_MAX ) {
 537                                                                                 /**
 538                                                                                 * Out of the Unicode range
 539                                                                                 */
 540                                                                                 if( $utf_char[0] < "\xF8" ) {
 541                                                                                         $trailing_bytes = 3;
 542                                                                                 } elseif( $utf_char[0] < "\xFC" ) {
 543                                                                                         $trailing_bytes = 4;
 544                                                                                 } elseif( $utf_char[0] > "\xFD" ) {
 545                                                                                         $trailing_bytes = 0;
 546                                                                                 } else {
 547                                                                                         $trailing_bytes = 5;
 548                                                                                 }
 549
 550                                                                                 $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
 551                                                                                 $pos += strspn( $str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes );
 552                                                                                 $tmp_pos = $pos;
 553                                                                                 continue 2;
 554                                                                         }
 555                                                         }
 556                                                 }
 557
 558                                                 /**
 559                                                 * The char is a valid starter, move the cursor and go on
 560                                                 */
 561                                                 $pos += $utf_len;
 562                                                 continue;
 563                                         }
 564                                 } else {
 565                                         /**
 566                                         * A trailing byte came out of nowhere, we will advance the cursor
 567                                         * and treat this byte and all following trailing bytes as if
 568                                         * each of them was a Unicode replacement char
 569                                         */
 570                                         $spn = strspn( $str, UTF8_TRAILING_BYTES, $pos );
 571                                         $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . str_repeat( UTF8_REPLACEMENT, $spn );
 572
 573                                         $pos += $spn;
 574                                         $tmp_pos = $pos;
 575                                         continue;
 576                                 }
 577
 578
 579                                 ////////////////////////////////////////////////////////////////////
 580                                 //                 STEP 1: Decompose current char                 //
 581                                 ////////////////////////////////////////////////////////////////////
 582
 583                                 /**
 584                                 * We have found a character that is either:
 585                                 *  - in the NFC_QC/NFKC_QC list
 586                                 *  - a non-starter char that is not canonically ordered
 587                                 *
 588                                 * We are going to capture the shortest UTF sequence that satisfies
 589                                 * these two conditions:
 590                                 *
 591                                 *  1 - If the sequence does not start at the beginning of the string,
 592                                 *      it must begin with a starter, and that starter must not have the
 593                                 *      NF[K]C_QC property equal to "MAYBE"
 594                                 *
 595                                 *  2 - If the sequence does not end at the end of the string, it must end
 596                                 *      with a non-starter and be immediately followed by a starter that
 597                                 *      is not on the QC list
 598                                 */
 599                                 $utf_seq = array();
 600                                 $last_cc = 0;
 601                                 $lpos = $pos;
 602                                 $pos += $utf_len;
 603
 604                                 if( isset( $decomp_map[$utf_char] ) ) {
 605                                         $_pos = 0;
 606                                         $_len = strlen( $decomp_map[$utf_char] );
 607                                         do {
 608                                                 $_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "\xF0"];
 609
 610                                                 if( isset( $_utf_len ) ) {
 611                                                         $utf_seq[] = substr( $decomp_map[$utf_char], $_pos, $_utf_len );
 612                                                         $_pos += $_utf_len;
 613                                                 } else {
 614                                                         $utf_seq[] = $decomp_map[$utf_char][$_pos];
 615                                                         ++$_pos;
 616                                                 }
 617                                         }
 618                                         while( $_pos < $_len );
 619                                 } else {
 620                                         /**
 621                                         * The char is not decomposable
 622                                         */
 623                                         $utf_seq = array( $utf_char );
 624                                 }
 625
 626
 627                                 ////////////////////////////////////////////////////////////////
 628                                 //                STEP 2: Capture the starter                 //
 629                                 ////////////////////////////////////////////////////////////////
 630
 631                                 /**
 632                                 * Check out the combining class of the first character of the UTF sequence
 633                                 */
 634                                 $k = 0;
 635                                 if( isset( $utfCombiningClass[$utf_seq[0]] ) || $qc[$utf_char] == UNICODE_QC_MAYBE ) {
 636                                         /**
 637                                         * Not a starter, inspect previous characters
 638                                         *
 639                                         * The last 8 characters are kept in a buffer so that we don't have
 640                                         * to capture them every time. This is enough for all real-life strings
 641                                         * but even if it wasn't, we can capture characters in backward mode,
 642                                         * although it is slower than this method.
 643                                         *
 644                                         * In the following loop, $j starts at the previous buffered character
 645                                         * ($i - 1, because the current character is at offset $i) and the buffered
 646                                         * characters are processed in backward order until we find a starter.
 647                                         *
 648                                         * $k is the index on each UTF character inside of our UTF sequence.
 649                                         * At this time, $utf_seq contains one or more characters numbered 0 to
 650                                         * n. $k starts at 0; for each char we prepend, we pre-decrement it
 651                                         * and use it as that char's (negative) index
 652                                         */
 653                                         $starter_found = 0;
 654                                         $j_min = max(1, $i - 7 );
 655                                         for( $j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j ) {
 656                                                 $utf_char = $buffer[$j & 7];
 657                                                 $lpos -= strlen( $utf_char );
 658
 659                                                 if( isset( $decomp_map[$utf_char] ) ) {
 660                                                         /**
 661                                                         * The char is a composite, decompose for storage
 662                                                         */
 663                                                         $decomp_seq = array();
 664                                                         $_pos = 0;
 665                                                         $_len = strlen( $decomp_map[$utf_char] );
 666                                                         do {
 667                                                                 $c = $decomp_map[$utf_char][$_pos];
 668                                                                 $_utf_len =& $utf_len_mask[$c & "\xF0"];
 669
 670                                                                 if( isset( $_utf_len ) ) {
 671                                                                         $decomp_seq[] = substr( $decomp_map[$utf_char], $_pos, $_utf_len );
 672                                                                         $_pos += $_utf_len;
 673                                                                 } else {
 674                                                                         $decomp_seq[] = $c;
 675                                                                         ++$_pos;
 676                                                                 }
 677                                                         }
 678                                                         while( $_pos < $_len );
 679
 680                                                         /**
 681                                                         * Prepend the UTF sequence with our decomposed sequence
 682                                                         */
 683                                                         if( isset( $decomp_seq[1] ) ) {
 684                                                                 /**
 685                                                                 * The char expanded into several chars
 686                                                                 */
 687                                                                 $decomp_cnt = count( $decomp_seq );
 688                                                                 foreach( $decomp_seq as $decomp_i => $decomp_char ) {
 689                                                                         $utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;
 690                                                                 }
 691                                                                 $k -= $decomp_cnt;
 692                                                         } else {
 693                                                                 /**
 694                                                                 * Decomposed to a single char, easier to prepend
 695                                                                 */
 696                                                                 $utf_seq[--$k] = $decomp_seq[0];
 697                                                         }
 698                                                 } else {
 699                                                         $utf_seq[--$k] = $utf_char;
 700                                                 }
 701
 702                                                 if( !isset( $utfCombiningClass[$utf_seq[$k]] ) ) {
 703                                                         /**
 704                                                         * We have found our starter
 705                                                         */
 706                                                         $starter_found = 1;
 707                                                         break;
 708                                                 }
 709                                         }
 710
 711                                         if( !$starter_found && $lpos > $tmp_pos ) {
 712                                                 /**
 713                                                 * The starter was not found in the buffer, let's rewind some more
 714                                                 */
 715                                                 do {
 716                                                         /**
 717                                                         * $utf_len_mask contains the masks of both leading bytes and
 718                                                         * trailing bytes. If $utf_len > 0 then it's a leading byte,
 719                                                         * otherwise it's a trailing byte.
 720                                                         */
 721                                                         $c = $str[--$lpos];
 722                                                         $c_mask = $c & "\xF0";
 723
 724                                                         if( isset( $utf_len_mask[$c_mask] ) ) {
 725                                                                 /**
 726                                                                 * UTF byte
 727                                                                 */
 728                                                                 if( $utf_len = $utf_len_mask[$c_mask] ) {
 729                                                                         /**
 730                                                                         * UTF *leading* byte
 731                                                                         */
 732                                                                         $utf_char = substr( $str, $lpos, $utf_len );
 733
 734                                                                         if( isset( $decomp_map[$utf_char] ) ) {
 735                                                                                 /**
 736                                                                                 * Decompose the character
 737                                                                                 */
 738                                                                                 $decomp_seq = array();
 739                                                                                 $_pos = 0;
 740                                                                                 $_len = strlen( $decomp_map[$utf_char] );
 741                                                                                 do {
 742                                                                                         $c = $decomp_map[$utf_char][$_pos];
 743                                                                                         $_utf_len =& $utf_len_mask[$c & "\xF0"];
 744
 745                                                                                         if( isset( $_utf_len ) ) {
 746                                                                                                 $decomp_seq[] = substr( $decomp_map[$utf_char], $_pos, $_utf_len );
 747                                                                                                 $_pos += $_utf_len;
 748                                                                                         } else {
 749                                                                                                 $decomp_seq[] = $c;
 750                                                                                                 ++$_pos;
 751                                                                                         }
 752                                                                                 }
 753                                                                                 while( $_pos < $_len );
 754
 755                                                                                 /**
 756                                                                                 * Prepend the UTF sequence with our decomposed sequence
 757                                                                                 */
 758                                                                                 if( isset( $decomp_seq[1] ) ) {
 759                                                                                         /**
 760                                                                                         * The char expanded into several chars
 761                                                                                         */
 762                                                                                         $decomp_cnt = count( $decomp_seq );
 763                                                                                         foreach( $decomp_seq as $decomp_i => $utf_char ) {
 764                                                                                                 $utf_seq[$k + $decomp_i - $decomp_cnt] = $utf_char;
 765                                                                                         }
 766                                                                                         $k -= $decomp_cnt;
 767                                                                                 } else {
 768                                                                                         /**
 769                                                                                         * Decomposed to a single char, easier to prepend
 770                                                                                         */
 771                                                                                         $utf_seq[--$k] = $decomp_seq[0];
 772                                                                                 }
 773                                                                         } else {
 774                                                                                 $utf_seq[--$k] = $utf_char;
 775                                                                         }
 776                                                                 }
 777                                                         } else {
 778                                                                 /**
 779                                                                 * ASCII char
 780                                                                 */
 781                                                                 $utf_seq[--$k] = $c;
 782                                                         }
 783                                                 }
 784                                                 while( $lpos > $tmp_pos );
 785                                         }
 786                                 }
 787
 788
 789                                 ////////////////////////////////////////////////////////////////
 790                                 //       STEP 3: Capture following combining modifiers        //
 791                                 ////////////////////////////////////////////////////////////////
 792
 793                                 while( $pos < $len ) {
 794                                         $c_mask = $str[$pos] & "\xF0";
 795
 796                                         if( isset( $utf_len_mask[$c_mask] ) ) {
 797                                                 if( $utf_len = $utf_len_mask[$c_mask] ) {
 798                                                         $utf_char = substr( $str, $pos, $utf_len );
 799                                                 } else {
 800                                                         /**
 801                                                         * A trailing byte came out of nowhere
 802                                                         *
 803                                                         * Trailing bytes are replaced with Unicode replacement chars,
 804                                                         * we will just ignore it for now, break out of the loop
 805                                                         * as if it was a starter (replacement chars ARE starters)
 806                                                         * and let the next loop replace it
 807                                                         */
 808                                                         break;
 809                                                 }
 810
 811                                                 if( isset( $utfCombiningClass[$utf_char] ) || isset( $qc[$utf_char] ) ) {
 812                                                         /**
 813                                                         * Combining character, add it to the sequence and move the cursor
 814                                                         */
 815                                                         if( isset( $decomp_map[$utf_char] ) ) {
 816                                                                 /**
 817                                                                 * Decompose the character
 818                                                                 */
 819                                                                 $_pos = 0;
 820                                                                 $_len = strlen( $decomp_map[$utf_char] );
 821                                                                 do {
 822                                                                         $c = $decomp_map[$utf_char][$_pos];
 823                                                                         $_utf_len =& $utf_len_mask[$c & "\xF0"];
 824
 825                                                                         if( isset( $_utf_len ) ) {
 826                                                                                 $utf_seq[] = substr( $decomp_map[$utf_char], $_pos, $_utf_len );
 827                                                                                 $_pos += $_utf_len;
 828                                                                         } else {
 829                                                                                 $utf_seq[] = $c;
 830                                                                                 ++$_pos;
 831                                                                         }
 832                                                                 }
 833                                                                 while( $_pos < $_len );
 834                                                         } else {
 835                                                                 $utf_seq[] = $utf_char;
 836                                                         }
 837
 838                                                         $pos += $utf_len;
 839                                                 } else {
 840                                                         /**
 841                                                         * Combining class 0 and no QC, break out of the loop
 842                                                         *
 843                                                         * Note: we do not know if that character is valid. If
 844                                                         * it's not, the next iteration will replace it
 845                                                         */
 846                                                         break;
 847                                                 }
 848                                         } else {
 849                                                 /**
 850                                                 * ASCII chars are starters
 851                                                 */
 852                                                 break;
 853                                         }
 854                                 }
 855
 856
 857                                 ////////////////////////////////////////////////////////////////
 858                                 //                  STEP 4: Sort and combine                  //
 859                                 ////////////////////////////////////////////////////////////////
 860
 861                                 /**
 862                                 * Here we sort...
 863                                 */
 864                                 $k_max = $k + count( $utf_seq );
 865                                 if( !$k && $k_max == 1 ) {
 866                                         /**
 867                                         * There is only one char in the UTF sequence, add it then
 868                                         * jump to the next iteration of main loop
 869                                         *
 870                                         * Note: the two commented lines below can be enabled under PHP5
 871                                         * for a very small performance gain in most cases
 872                                         */
 873 //                                      if( substr_compare( $str, $utf_seq[0], $lpos, $pos - $lpos ) ) {
 874                                                 $tmp .= substr( $str, $tmp_pos, $lpos - $tmp_pos ) . $utf_seq[0];
 875                                                 $tmp_pos = $pos;
 876 //                                      }
 877
 878                                         continue;
 879                                 }
 880
 881                                 /**
 882                                 * ...there we combine
 883                                 */
 884                                 if( isset( $utfCombiningClass[$utf_seq[$k]] ) ) {
 885                                         $starter = $nf_seq = '';
 886                                 } else {
 887                                         $starter = $utf_seq[$k++];
 888                                         $nf_seq = '';
 889                                 }
 890                                 $utf_sort = array();
 891
 892                                 /**
 893                                 * We add an empty char at the end of the UTF char sequence.
 894                                 * It will act as a starter and trigger the sort/combine routine
 895                                 * at the end of the string without altering it
 896                                 */
 897                                 $utf_seq[] = '';
 898
 899                                 do {
 900                                         $utf_char = $utf_seq[$k++];
 901
 902                                         if( isset( $utfCombiningClass[$utf_char] ) ) {
 903                                                 $utf_sort[$utfCombiningClass[$utf_char]][] = $utf_char;
 904                                         } else {
 905                                                 if( empty( $utf_sort ) ) {
 906                                                         /**
 907                                                         * No combining characters... check for a composite
 908                                                         * of the two starters
 909                                                         */
 910                                                         if( isset( $utfCanonicalComp[$starter . $utf_char] ) ) {
 911                                                                 /**
 912                                                                 * Good ol' composite character
 913                                                                 */
 914                                                                 $starter = $utfCanonicalComp[$starter . $utf_char];
 915                                                         } elseif( isset( $utfJamoType[$utf_char] ) ) {
 916                                                                 /**
 917                                                                 * Current char is a composable jamo
 918                                                                 */
 919                                                                 if( isset( $utfJamoType[$starter] )
 920                                                                  && $utfJamoType[$starter] == UNICODE_JAMO_L
 921                                                                  && $utfJamoType[$utf_char] == UNICODE_JAMO_V ) {
 922                                                                         /**
 923                                                                         * We have a L jamo followed by a V jamo, we are going
 924                                                                         * to prefetch the next char to see if it's a T jamo
 925                                                                         */
 926                                                                         if( isset( $utfJamoType[$utf_seq[$k]] ) && $utfJamoType[$utf_seq[$k]] == UNICODE_JAMO_T ) {
 927                                                                                 /**
 928                                                                                 * L+V+T jamos, combine to a LVT Hangul syllable
 929                                                                                 * ($k is incremented)
 930                                                                                 */
 931                                                                                 $cp = $utfJamoIndex[$starter] + $utfJamoIndex[$utf_char] + $utfJamoIndex[$utf_seq[$k]];
 932
 933                                                                                 ++$k;
 934                                                                         } else {
 935                                                                                 /**
 936                                                                                 * L+V jamos, combine to a LV Hangul syllable
 937                                                                                 */
 938                                                                                 $cp = $utfJamoIndex[$starter] + $utfJamoIndex[$utf_char];
 939                                                                         }
 940
 941                                                                         $starter = chr( 0xE0 | ( $cp >> 12 ) ) . chr( 0x80 | ( ( $cp >> 6 ) & 0x3F ) ) . chr( 0x80 | ( $cp & 0x3F ) );
 942                                                                 } else {
 943                                                                         /**
 944                                                                         * Non-composable jamo, just add it to the sequence
 945                                                                         */
 946                                                                         $nf_seq .= $starter;
 947                                                                         $starter = $utf_char;
 948                                                                 }
 949                                                         } else {
 950                                                                 /**
 951                                                                 * No composite, just add the first starter to the sequence
 952                                                                 * then continue with the other one
 953                                                                 */
 954                                                                 $nf_seq .= $starter;
 955                                                                 $starter = $utf_char;
 956                                                         }
 957                                                 } else {
 958                                                         ksort( $utf_sort );
 959
 960                                                         /**
 961                                                         * For each class of combining characters
 962                                                         */
 963                                                         foreach( $utf_sort as $cc => $utf_chars ) {
 964                                                                 $j = 0;
 965
 966                                                                 do {
 967                                                                         /**
 968                                                                         * Look for a composite
 969                                                                         */
 970                                                                         if( isset( $utfCanonicalComp[$starter . $utf_chars[$j]] ) ) {
 971                                                                                 /**
 972                                                                                 * Found a composite, replace the starter
 973                                                                                 */
 974                                                                                 $starter = $utfCanonicalComp[$starter . $utf_chars[$j]];
 975                                                                                 unset( $utf_sort[$cc][$j] );
 976                                                                         } else {
 977                                                                                 /**
 978                                                                                 * No composite, all following characters in that
 979                                                                                 * class are blocked
 980                                                                                 */
 981                                                                                 break;
 982                                                                         }
 983                                                                 }
 984                                                                 while( isset( $utf_sort[$cc][++$j] ) );
 985                                                         }
 986
 987                                                         /**
 988                                                         * Add the starter to the normalized sequence, followed by
 989                                                         * non-starters in canonical order
 990                                                         */
 991                                                         $nf_seq .= $starter;
 992                                                         foreach( $utf_sort as $utf_chars ) {
 993                                                                 if( !empty( $utf_chars ) ) {
 994                                                                         $nf_seq .= implode( '', $utf_chars );
 995                                                                 }
 996                                                         }
 997
 998                                                         /**
 999                                                         * Reset the array and go on
1000                                                         */
1001                                                         $utf_sort = array();
1002                                                         $starter = $utf_char;
1003                                                 }
1004                                         }
1005                                 }
1006                                 while( $k <= $k_max );
1007
1008                                 $tmp .= substr( $str, $tmp_pos, $lpos - $tmp_pos ) . $nf_seq;
1009                                 $tmp_pos = $pos;
1010                         } else {
1011                                 /**
1012                                 * Only an ASCII char can make the program get here
1013                                 *
1014                                 * First we skip the current byte with ++$pos, then we quickly
1015                                 * skip following ASCII chars with strspn().
1016                                 *
1017                                 * The first two "if"'s here can be removed, with the consequences
1018                                 * of being faster on latin text (lots of ASCII) and slower on
1019                                 * multi-byte text (where the only ASCII chars are spaces and punctuation)
1020                                 */
1021                                 if( ++$pos != $len ) {
1022                                         if( $str[$pos] < "\x80" ) {
1023                                                 $pos += strspn( $str, UTF8_ASCII_RANGE, ++$pos );
1024                                                 $buffer[++$i & 7] = $str[$pos - 1];
1025                                         } else {
1026                                                 $buffer[++$i & 7] = $c;
1027                                         }
1028                                 }
1029                         }
1030                 }
1031                 while( $pos < $len );
1032
1033                 /**
1034                 * Now it is time to return the string
1035                 */
1036                 if( $tmp_pos ) {
1037                         /**
1038                         * If the $tmp_pos cursor is not at the beginning of the string then at least
1039                         * one character was not in normal form. Replace $str with the fixed version
1040                         */
1041                         if( $tmp_pos == $len ) {
1042                                 /**
1043                                 * The $tmp_pos cursor is at the end of $str, therefore $tmp holds the
1044                                 * whole $str
1045                                 */
1046                                 return $tmp;
1047                         } else {
1048                                 /**
1049                                 * The rightmost chunk of $str has not been appended to $tmp yet
1050                                 */
1051                                 return $tmp . substr( $str, $tmp_pos );
1052                         }
1053                 }
1054
1055                 /**
1056                 * The string was already in normal form
1057                 */
1058                 return $str;
1059         }
1060
1061         /**
1062         * Decompose a UTF string
1063         *
1064         * @param        string  $str            UTF string
1065         * @param        integer $pos            Position of the first UTF char (in bytes)
1066         * @param        integer $len            Length of the string (in bytes)
1067         * @param        array   $decomp_map     Decomposition mapping, passed by reference but never modified
1068         * @return       string                          The string, decomposed and sorted canonically
1069         *
1070         * @access       private
1071         */
1072         function decompose( $str, $pos, $len, &$decomp_map ) {
1073                 global $utfCombiningClass, $utfCanonicalDecomp;
1074
1075                 /**
1076                 * UTF char length array
1077                 */
1078                 $utf_len_mask = array(
1079                         /**
1080                         * Leading bytes masks
1081                         */
1082                         "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
1083
1084                         /**
1085                         * Trailing bytes masks
1086                         */
1087                         "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
1088                 );
1089
1090                 /**
1091                 * Some extra checks are triggered on the first byte of a UTF sequence
1092                 */
1093                 $extra_check = array(
1094                         "\xED"=>1, "\xEF"=>1, "\xC0"=>1, "\xC1"=>1, "\xE0"=>1, "\xF0"=>1,
1095                         "\xF4"=>1, "\xF5"=>1, "\xF6"=>1, "\xF7"=>1, "\xF8"=>1, "\xF9"=>1,
1096                         "\xFA"=>1, "\xFB"=>1, "\xFC"=>1, "\xFD"=>1, "\xFE"=>1, "\xFF"=>1
1097                 );
1098
1099                 /**
1100                 * These masks are used to check if a UTF sequence is well formed.
1101                 * Here are the only 3 lengths we acknowledge:
1102                 *   - 2-byte: 110? ???? 10?? ????
1103                 *   - 3-byte: 1110 ???? 10?? ???? 10?? ????
1104                 *   - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????
1105                 *
1106                 * Note that 5- and 6- byte sequences are automatically discarded
1107                 */
1108                 $utf_validation_mask = array(
1109                         2       =>      "\xE0\xC0",
1110                         3       =>      "\xF0\xC0\xC0",
1111                         4       =>      "\xF8\xC0\xC0\xC0"
1112                 );
1113                 $utf_validation_check = array(
1114                         2       =>      "\xC0\x80",
1115                         3       =>      "\xE0\x80\x80",
1116                         4       =>      "\xF0\x80\x80\x80"
1117                 );
1118
1119                 $tmp = '';
1120                 $starter_pos = $pos;
1121                 $tmp_pos = $last_cc = $sort = $dump = 0;
1122                 $utf_sort = array();
1123
1124
1125                 ////////////////////////////////////////////////////////////////////////
1126                 //                             Main loop                              //
1127                 ////////////////////////////////////////////////////////////////////////
1128
1129                 do {
1130                         ////////////////////////////////////////////////////////////////////
1131                         //                STEP 0: Capture the current char                //
1132                         ////////////////////////////////////////////////////////////////////
1133
1134                         $cur_mask = $str[$pos] & "\xF0";
1135                         if( isset( $utf_len_mask[$cur_mask] ) ) {
1136                                 if( $utf_len = $utf_len_mask[$cur_mask] ) {
1137                                         /**
1138                                         * Multibyte char
1139                                         */
1140                                         $utf_char = substr( $str, $pos, $utf_len );
1141                                         $pos += $utf_len;
1142                                 } else {
1143                                         /**
1144                                         * A trailing byte came out of nowhere, we will treat it and all
1145                                         * following trailing bytes as if each of them was a Unicode
1146                                         * replacement char and we will advance the cursor
1147                                         */
1148                                         $spn = strspn( $str, UTF8_TRAILING_BYTES, $pos );
1149
1150                                         if( $dump ) {
1151                                                 $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
1152
1153                                                 /**
1154                                                 * Dump combiners
1155                                                 */
1156                                                 if( !empty( $utf_sort ) ) {
1157                                                         if( $sort ) {
1158                                                                 ksort( $utf_sort );
1159                                                         }
1160
1161                                                         foreach( $utf_sort as $utf_chars ) {
1162                                                                 $tmp .= implode( '', $utf_chars );
1163                                                         }
1164                                                 }
1165
1166                                                 $tmp .= str_repeat( UTF8_REPLACEMENT, $spn );
1167                                                 $dump = $sort = 0;
1168                                         } else {
1169                                                 $tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . str_repeat( UTF8_REPLACEMENT, $spn );
1170                                         }
1171
1172                                         $pos += $spn;
1173                                         $tmp_pos = $starter_pos = $pos;
1174
1175                                         $utf_sort = array();
1176                                         $last_cc = 0;
1177
1178                                         continue;
1179                                 }
1180
1181
1182                                 ////////////////////////////////////////////////////////////////////
1183                                 //          STEP 1: Decide what to do with current char           //
1184                                 ////////////////////////////////////////////////////////////////////
1185
1186                                 /**
1187                                 * Now, in that order:
1188                                 *  - check if that character is decomposable
1189                                 *  - check if that character is a non-starter
1190                                 *  - check if that character requires extra checks to be performed
1191                                 */
1192                                 if( isset( $decomp_map[$utf_char] ) ) {
1193                                         /**
1194                                         * Decompose the char
1195                                         */
1196                                         $_pos = 0;
1197                                         $_len = strlen( $decomp_map[$utf_char] );
1198
1199                                         do {
1200                                                 $c = $decomp_map[$utf_char][$_pos];
1201                                                 $_utf_len =& $utf_len_mask[$c & "\xF0"];
1202
1203                                                 if( isset( $_utf_len ) ) {
1204                                                         $_utf_char = substr( $decomp_map[$utf_char], $_pos, $_utf_len );
1205                                                         $_pos += $_utf_len;
1206
1207                                                         if( isset( $utfCombiningClass[$_utf_char] ) ) {
1208                                                                 /**
1209                                                                 * The character decomposed to a non-starter, buffer it for sorting
1210                                                                 */
1211                                                                 $utf_sort[$utfCombiningClass[$_utf_char]][] = $_utf_char;
1212
1213                                                                 if( $utfCombiningClass[$_utf_char] < $last_cc ) {
1214                                                                         /**
1215                                                                         * Not canonically ordered, will require sorting
1216                                                                         */
1217                                                                         $sort = $dump = 1;
1218                                                                 } else {
1219                                                                         $dump = 1;
1220                                                                         $last_cc = $utfCombiningClass[$_utf_char];
1221                                                                 }
1222                                                         } else {
1223                                                                 /**
1224                                                                 * This character decomposition contains a starter,
1225                                                                 * dump the buffer and continue
1226                                                                 */
1227                                                                 if( $dump ) {
1228                                                                         $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
1229
1230                                                                         /**
1231                                                                         * Dump combiners
1232                                                                         */
1233                                                                         if( !empty( $utf_sort ) ) {
1234                                                                                 if( $sort ) {
1235                                                                                         ksort( $utf_sort );
1236                                                                                 }
1237
1238                                                                                 foreach( $utf_sort as $utf_chars ) {
1239                                                                                         $tmp .= implode( '', $utf_chars );
1240                                                                                 }
1241                                                                         }
1242
1243                                                                         $tmp .= $_utf_char;
1244                                                                         $dump = $sort = 0;
1245                                                                 } else {
1246                                                                         $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos ) . $_utf_char;
1247                                                                 }
1248
1249                                                                 $tmp_pos = $starter_pos = $pos;
1250                                                                 $utf_sort = array();
1251                                                                 $last_cc = 0;
1252                                                         }
1253                                                 } else {
1254                                                         /**
1255                                                         * This character decomposition contains an ASCII char,
1256                                                         * which is a starter. Dump the buffer and continue
1257                                                         */
1258                                                         ++$_pos;
1259                                                         if( $dump ) {
1260                                                                 $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
1261
1262                                                                 /**
1263                                                                 * Dump combiners
1264                                                                 */
1265                                                                 if( !empty( $utf_sort ) ) {
1266                                                                         if( $sort ) {
1267                                                                                 ksort( $utf_sort );
1268                                                                         }
1269
1270                                                                         foreach( $utf_sort as $utf_chars ) {
1271                                                                                 $tmp .= implode( '', $utf_chars );
1272                                                                         }
1273                                                                 }
1274
1275                                                                 $tmp .= $c;
1276                                                                 $dump = $sort = 0;
1277                                                         } else {
1278                                                                 $tmp .= substr( $str, $tmp_pos, $pos - $utf_len - $tmp_pos ) . $c;
1279                                                         }
1280
1281                                                         $tmp_pos = $starter_pos = $pos;
1282                                                         $utf_sort = array();
1283                                                         $last_cc = 0;
1284                                                 }
1285                                         }
1286                                         while( $_pos < $_len );
1287                                 } elseif( isset( $utfCombiningClass[$utf_char] ) ) {
1288                                         /**
1289                                         * Combining character
1290                                         */
1291                                         if( $utfCombiningClass[$utf_char] < $last_cc ) {
1292                                                 /**
1293                                                 * Not in canonical order
1294                                                 */
1295                                                 $sort = $dump = 1;
1296                                         } else {
1297                                                 $last_cc = $utfCombiningClass[$utf_char];
1298                                         }
1299
1300                                         $utf_sort[$utfCombiningClass[$utf_char]][] = $utf_char;
1301                                 } else {
1302                                         /**
1303                                         * Non-decomposable starter, check out if it's a Hangul syllable
1304                                         */
1305                                         if( $utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST ) {
1306                                                 /**
1307                                                 * Nope, regular UTF char, check that we have the correct number of trailing bytes
1308                                                 */
1309                                                 if( ( $utf_char & $utf_validation_mask[$utf_len] ) != $utf_validation_check[$utf_len] ) {
1310                                                         /**
1311                                                         * Current char isn't well-formed or legal: either one or
1312                                                         * several trailing bytes are missing, or the Unicode char
1313                                                         * has been encoded in a five- or six- byte sequence
1314                                                         */
1315                                                         if( $utf_char[0] >= "\xF8" ) {
1316                                                                 if( $utf_char[0] < "\xF8" ) {
1317                                                                         $trailing_bytes = 3;
1318                                                                 } elseif( $utf_char[0] < "\xFC" ) {
1319                                                                         $trailing_bytes = 4;
1320                                                                 }
1321                                                                 if( $utf_char[0] > "\xFD" ) {
1322                                                                         $trailing_bytes = 0;
1323                                                                 } else {
1324                                                                         $trailing_bytes = 5;
1325                                                                 }
1326                                                         } else {
1327                                                                 $trailing_bytes = $utf_len - 1;
1328                                                         }
1329
1330                                                         /**
1331                                                         * Move the cursor back to its original position then advance
1332                                                         * it to the position it should be at
1333                                                         */
1334                                                         $pos -= $utf_len;
1335                                                         $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
1336
1337                                                         if( !empty( $utf_sort ) ) {
1338                                                                 ksort( $utf_sort );
1339
1340                                                                 foreach( $utf_sort as $utf_chars ) {
1341                                                                         $tmp .= implode( '', $utf_chars );
1342                                                                 }
1343                                                                 $utf_sort = array();
1344                                                         }
1345
1346                                                         $tmp .= UTF8_REPLACEMENT;
1347                                                         $dump = $sort = 0;
1348
1349                                                         $pos += strspn( $str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes );
1350                                                         $tmp_pos = $pos;
1351                                                         continue;
1352                                                 }
1353
1354                                                 if( isset( $extra_check[$utf_char[0]] ) ) {
1355                                                         switch( $utf_char[0] ) {
1356                                                                 /**
1357                                                                 * Note: 0xED is quite common in Korean
1358                                                                 */
1359                                                                 case "\xED":
1360                                                                         if( $utf_char >= "\xED\xA0\x80" ) {
1361                                                                                 /**
1362                                                                                 * Surrogates (0xD800..0xDFFF) are not allowed in UTF-8
1363                                                                                 * (UTF sequence 0xEDA080..0xEDBFBF)
1364                                                                                 */
1365                                                                                 $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
1366
1367                                                                                 if( !empty( $utf_sort ) ) {
1368                                                                                         ksort( $utf_sort );
1369
1370                                                                                         foreach( $utf_sort as $utf_chars ) {
1371                                                                                                 $tmp .= implode( '', $utf_chars );
1372                                                                                         }
1373                                                                                         $utf_sort = array();
1374                                                                                 }
1375
1376                                                                                 $tmp .= UTF8_REPLACEMENT;
1377                                                                                 $dump = $sort = 0;
1378
1379                                                                                 $tmp_pos = $starter_pos = $pos;
1380                                                                                 continue 2;
1381                                                                         }
1382                                                                         break;
1383
1384                                                                 /**
1385                                                                 * Note: 0xEF is quite common in Japanese
1386                                                                 */
1387                                                                 case "\xEF":
1388                                                                         if( $utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF" ) {
1389                                                                                 /**
1390                                                                                 * 0xFFFE and 0xFFFF are explicitly disallowed
1391                                                                                 * (UTF sequence 0xEFBFBE..0xEFBFBF)
1392                                                                                 */
1393                                                                                 $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
1394
1395                                                                                 if( !empty( $utf_sort ) ) {
1396                                                                                         ksort( $utf_sort );
1397
1398                                                                                         foreach( $utf_sort as $utf_chars ) {
1399                                                                                                 $tmp .= implode( '', $utf_chars );
1400                                                                                         }
1401                                                                                         $utf_sort = array();
1402                                                                                 }
1403
1404                                                                                 $tmp .= UTF8_REPLACEMENT;
1405                                                                                 $dump = $sort = 0;
1406
1407                                                                                 $tmp_pos = $starter_pos = $pos;
1408                                                                                 continue 2;
1409                                                                         }
1410                                                                         break;
1411
1412                                                                 case "\xC0":
1413                                                                 case "\xC1":
1414                                                                         if( $utf_char <= "\xC1\xBF" ) {
1415                                                                                 /**
1416                                                                                 * Overlong sequence: Unicode char 0x00..0x7F encoded as a
1417                                                                                 * double-byte UTF char
1418                                                                                 */
1419                                                                                 $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
1420
1421                                                                                 if( !empty( $utf_sort ) ) {
1422                                                                                         ksort( $utf_sort );
1423
1424                                                                                         foreach( $utf_sort as $utf_chars ) {
1425                                                                                                 $tmp .= implode( '', $utf_chars );
1426                                                                                         }
1427                                                                                         $utf_sort = array();
1428                                                                                 }
1429
1430                                                                                 $tmp .= UTF8_REPLACEMENT;
1431                                                                                 $dump = $sort = 0;
1432
1433                                                                                 $tmp_pos = $starter_pos = $pos;
1434                                                                                 continue 2;
1435                                                                         }
1436                                                                         break;
1437
1438                                                                 case "\xE0":
1439                                                                         if( $utf_char <= "\xE0\x9F\xBF" ) {
1440                                                                                 /**
1441                                                                                 * Unicode char 0x0000..0x07FF encoded in 3 bytes
1442                                                                                 */
1443                                                                                 $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
1444
1445                                                                                 if( !empty( $utf_sort ) ) {
1446                                                                                         ksort( $utf_sort );
1447
1448                                                                                         foreach( $utf_sort as $utf_chars ) {
1449                                                                                                 $tmp .= implode( '', $utf_chars );
1450                                                                                         }
1451                                                                                         $utf_sort = array();
1452                                                                                 }
1453
1454                                                                                 $tmp .= UTF8_REPLACEMENT;
1455                                                                                 $dump = $sort = 0;
1456
1457                                                                                 $tmp_pos = $starter_pos = $pos;
1458                                                                                 continue 2;
1459                                                                         }
1460                                                                         break;
1461
1462                                                                 case "\xF0":
1463                                                                         if( $utf_char <= "\xF0\x8F\xBF\xBF" ) {
1464                                                                                 /**
1465                                                                                 * Unicode char 0x0000..0xFFFF encoded in 4 bytes
1466                                                                                 */
1467                                                                                 $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
1468
1469                                                                                 if( !empty( $utf_sort ) ) {
1470                                                                                         ksort( $utf_sort );
1471
1472                                                                                         foreach( $utf_sort as $utf_chars ) {
1473                                                                                                 $tmp .= implode( '', $utf_chars );
1474                                                                                         }
1475                                                                                         $utf_sort = array();
1476                                                                                 }
1477
1478                                                                                 $tmp .= UTF8_REPLACEMENT;
1479                                                                                 $dump = $sort = 0;
1480
1481                                                                                 $tmp_pos = $starter_pos = $pos;
1482                                                                                 continue 2;
1483                                                                         }
1484                                                                         break;
1485
1486                                                                 default:
1487                                                                         if( $utf_char > UTF8_MAX ) {
1488                                                                                 /**
1489                                                                                 * Out of the Unicode range
1490                                                                                 */
1491                                                                                 $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
1492
1493                                                                                 if( !empty( $utf_sort ) ) {
1494                                                                                         ksort( $utf_sort );
1495
1496                                                                                         foreach( $utf_sort as $utf_chars ) {
1497                                                                                                 $tmp .= implode( '', $utf_chars );
1498                                                                                         }
1499                                                                                         $utf_sort = array();
1500                                                                                 }
1501
1502                                                                                 $tmp .= UTF8_REPLACEMENT;
1503                                                                                 $dump = $sort = 0;
1504
1505                                                                                 $tmp_pos = $starter_pos = $pos;
1506                                                                                 continue 2;
1507                                                                         }
1508                                                         }
1509                                                 }
1510                                         } else {
1511                                                 /**
1512                                                 * Hangul syllable
1513                                                 */
1514                                                 $idx = ( ( ( ord( $utf_char[0] ) & 0x0F ) << 12 ) | ( ( ord( $utf_char[1] ) & 0x3F ) << 6 ) | ( ord( $utf_char[2] ) & 0x3F ) ) - UNICODE_HANGUL_SBASE;
1515
1516                                                 /**
1517                                                 * LIndex can only range from 0 to 18, therefore it cannot influence
1518                                                 * the first two bytes of the L Jamo, which allows us to hardcode
1519                                                 * them (based on LBase).
1520                                                 *
1521                                                 * The same goes for VIndex, but for TIndex there's a catch: the value
1522                                                 * of the third byte could exceed 0xBF and we would have to increment
1523                                                 * the second byte
1524                                                 */
1525                                                 if( $tIndex = $idx % UNICODE_HANGUL_TCOUNT ) {
1526                                                         if( $tIndex < 25 ) {
1527                                                                 $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";
1528                                                                 $utf_char[8] = chr( 0xA7 + $tIndex );
1529                                                         } else {
1530                                                                 $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";
1531                                                                 $utf_char[8] = chr( 0x67 + $tIndex );
1532                                                         }
1533                                                 } else {
1534                                                         $utf_char = "\xE1\x84\x00\xE1\x85\x00";
1535                                                 }
1536
1537                                                 $utf_char[2] = chr( 0x80 + ( int ) ( $idx / UNICODE_HANGUL_NCOUNT ) );
1538                                                 $utf_char[5] = chr( 0xA1 + ( int ) ( ( $idx % UNICODE_HANGUL_NCOUNT ) / UNICODE_HANGUL_TCOUNT ) );
1539
1540
1541                                                 /**
1542                                                 * Just like other decompositions, the resulting Jamos must
1543                                                 * be dumped to the tmp string
1544                                                 */
1545                                                 $dump = 1;
1546                                         }
1547
1548                                         /**
1549                                         * Do we need to dump stuff to the tmp string?
1550                                         */
1551                                         if( $dump ) {
1552                                                 $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
1553
1554                                                 /**
1555                                                 * Dump combiners
1556                                                 */
1557                                                 if( !empty( $utf_sort ) ) {
1558                                                         if( $sort ) {
1559                                                                 ksort( $utf_sort );
1560                                                         }
1561
1562                                                         foreach( $utf_sort as $utf_chars ) {
1563                                                                 $tmp .= implode( '', $utf_chars );
1564                                                         }
1565                                                 }
1566
1567                                                 $tmp .= $utf_char;
1568                                                 $dump = $sort = 0;
1569                                                 $tmp_pos = $pos;
1570                                         }
1571
1572                                         $last_cc = 0;
1573                                         $utf_sort = array();
1574                                         $starter_pos = $pos;
1575                                 }
1576                         } else {
1577                                 /**
1578                                 * ASCII char, which happens to be a starter (as any other ASCII char)
1579                                 */
1580                                 if( $dump ) {
1581                                         $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
1582
1583                                         /**
1584                                         * Dump combiners
1585                                         */
1586                                         if( !empty( $utf_sort ) ) {
1587                                                 if( $sort ) {
1588                                                         ksort( $utf_sort );
1589                                                 }
1590
1591                                                 foreach( $utf_sort as $utf_chars ) {
1592                                                         $tmp .= implode( '', $utf_chars );
1593                                                 }
1594                                         }
1595
1596                                         $tmp .= $str[$pos];
1597                                         $dump = $sort = 0;
1598                                         $tmp_pos = ++$pos;
1599
1600                                         $pos += strspn( $str, UTF8_ASCII_RANGE, $pos );
1601                                 } else {
1602                                         $pos += strspn( $str, UTF8_ASCII_RANGE, ++$pos );
1603                                 }
1604
1605                                 $last_cc = 0;
1606                                 $utf_sort = array();
1607                                 $starter_pos = $pos;
1608                         }
1609                 }
1610                 while( $pos < $len );
1611
1612                 /**
1613                 * Now it is time to return the string
1614                 */
1615                 if( $dump ) {
1616                         $tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );
1617
1618                         /**
1619                         * Dump combiners
1620                         */
1621                         if( !empty( $utf_sort ) ) {
1622                                 if( $sort ) {
1623                                         ksort( $utf_sort );
1624                                 }
1625
1626                                 foreach( $utf_sort as $utf_chars ) {
1627                                         $tmp .= implode( '', $utf_chars );
1628                                 }
1629                         }
1630
1631                         return $tmp;
1632
1633                 } elseif( $tmp_pos ) {
1634                         /**
1635                         * If the $tmp_pos cursor was moved then at least one character was not in
1636                         * normal form. Replace $str with the fixed version
1637                         */
1638                         if( $tmp_pos == $len ) {
1639                                 /**
1640                                 * The $tmp_pos cursor is at the end of $str, therefore $tmp holds
1641                                 * the whole $str
1642                                 */
1643                                 return $tmp;
1644                         } else {
1645                                 /**
1646                                 * The rightmost chunk of $str has not been appended to $tmp yet
1647                                 */
1648                                 return $tmp . substr( $str, $tmp_pos );
1649                         }
1650                 }
1651
1652                 /**
1653                 * The string was already in normal form
1654                 */
1655                 return $str;
1656         }
1657 }
1658
1659 }