(bug 5303) Merge UtfNormal rewrite. Patch by Ludovic Arnaud (YuviPanda). This is...
[lhc/web/wiklou.git] / includes / normal / UtfNormal.php
1 <?php
2 /**
3 * Unicode normalization routines
4 *
5 * Copyright (C) 2006 Ludovic ARNAUD <ludovic.arnaud@gmail.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write to the Free Software Foundation, Inc.,
19 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20 * http://www.gnu.org/copyleft/gpl.html
21 *
22 * @author Ludovic ARNAUD <ludovic.arnaud@gmail.com>
23 * @license http://www.gnu.org/licenses/gpl.txt
24 * @package UtfNormal
25 */
26
27 require_once ('UtfNormalDefines.php');
28
29 if( function_exists( 'utf8_normalize' ) ) {
30
31 ////////////////////////////////////////////////////////////////////////////////
32 // Wrapper for the utfnormal extension, ICU wrapper //
33 ////////////////////////////////////////////////////////////////////////////////
34
35 /**
36 * UtfNormal class for the utfnormal extension
37 *
38 * @ignore
39 */
class UtfNormal {
	/**
	 * Sanitize a string and return it in Normal Form C.
	 *
	 * Pure "safe ASCII" input is returned untouched. Anything else is
	 * pre-processed so that forbidden bytes become the illegal byte 0xFF,
	 * then handed over to utf8_normalize().
	 *
	 * @param string $str String to clean up
	 * @return string The input itself, or the result of utf8_normalize()
	 */
	function cleanUp( $str ) {
		// Length of the leading run made only of allowed ASCII bytes. The
		// accepted bytes are listed by decreasing frequency in latin text.
		$safe_len = strspn(
			$str,
			"\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D"
		);

		// The run covers the whole string: nothing to clean
		if( !isset( $str[$safe_len] ) ) {
			return $str;
		}

		// U+FFFE and U+FFFF (bytes EF BF BE / EF BF BF) are swapped for the
		// replacement character. The cheap two-byte probe starts at the first
		// byte that was not plain ASCII.
		if( strpos( $str, "\xEF\xBF", $safe_len ) !== false ) {
			$str = str_replace(
				array( "\xEF\xBF\xBE", "\xEF\xBF\xBF" ),
				UTF8_REPLACEMENT,
				$str
			);
		}

		// Control bytes 0x00..0x1F other than \t, \n and \r are mapped to 0xFF
		$control_bytes = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F";
		$illegal_bytes = "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF";
		$str = strtr( $str, $control_bytes, $illegal_bytes );

		// Quoting the original implementation: "the UnicodeString constructor
		// fails if the string ends with a head byte". A final byte >= 0xC0 is
		// therefore overwritten with 0xFF as well.
		$last = strlen( $str ) - 1;
		if( $str[$last] >= "\xC0" ) {
			$str[$last] = "\xFF";
		}

		return utf8_normalize( $str, UNORM_NFC );
	}

	/**
	 * @param string $str
	 * @return string Result of utf8_normalize() in UNORM_NFC mode
	 */
	function toNFC( $str ) {
		return utf8_normalize( $str, UNORM_NFC );
	}

	/**
	 * @param string $str
	 * @return string Result of utf8_normalize() in UNORM_NFKC mode
	 */
	function toNFKC( $str ) {
		return utf8_normalize( $str, UNORM_NFKC );
	}

	/**
	 * @param string $str
	 * @return string Result of utf8_normalize() in UNORM_NFD mode
	 */
	function toNFD( $str ) {
		return utf8_normalize( $str, UNORM_NFD );
	}

	/**
	 * @param string $str
	 * @return string Result of utf8_normalize() in UNORM_NFKD mode
	 */
	function toNFKD( $str ) {
		return utf8_normalize( $str, UNORM_NFKD );
	}
}
113
114 ////////////////////////////////////////////////////////////////////////////////
115 // End of the ICU wrapper //
116 ////////////////////////////////////////////////////////////////////////////////
117
118
119 } else {
120
121
////////////////////////////////////////////////////////////////////////////////
// This block will NOT be loaded if the utfnormal extension is available     //
////////////////////////////////////////////////////////////////////////////////
125
/**
 * Drop any lookup table already sitting in the global scope. The methods
 * below test these names with isset() before including the data files.
 */
unset( $GLOBALS['utfJamoIndex'] );
unset( $GLOBALS['utfJamoType'] );
unset( $GLOBALS['utfCheckNFC'] );
unset( $GLOBALS['utfCombiningClass'] );
unset( $GLOBALS['utfCanonicalComp'] );
unset( $GLOBALS['utfCanonicalDecomp'] );
unset( $GLOBALS['utfCheckNFKC'] );
unset( $GLOBALS['utfCompatibilityDecomp'] );

/**
 * Values stored in the NFC_QC / NFKC_QC quick-check tables
 */
define( 'UNICODE_QC_MAYBE', 0 );
define( 'UNICODE_QC_NO', 1 );

/**
 * Every ASCII byte (0x00..0x7F): first the printable ones plus \n and \t,
 * sorted by frequency, then the remaining control bytes
 */
define(
	'UTF8_ASCII_RANGE',
	"\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F"
	. "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F"
);

/**
 * Every byte that may follow a leading byte inside a UTF-8 char (0x80..0xBF)
 */
define(
	'UTF8_TRAILING_BYTES',
	"\xA9\xA0\xA8\x80\xAA\x99\xA7\xBB\xAB\x89\x94\x82\xB4\xA2\xAE\x83\xB0\xB9\xB8\x93\xAF\xBC\xB3\x81\xA4\xB2\x9C\xA1\xB5\xBE\xBD\xBA\x98\xAD\xB1\x84\x95\xA6\xB6\x88\x8D\x90\xB7\xBF\x92\x85\xA5\x97\x8C\x86\xA3\x8E\x9F\x8F\x87\x91\x9D\xAC\x9E\x8B\x96\x9B\x8A\x9A"
);
146
147 /**
148 * Unicode normalization routines
149 *
150 * A copy of reports of bugs related to this class can be sent to the author directly
151 *
152 * @package UtfNormal
153 */
154 class UtfNormal {
155 /**
156 * Validate, cleanup and normalize a string
157 *
158 * The ultimate convenience function! Clean up invalid UTF-8 sequences,
159 * and convert to Normal Form C, canonical composition.
160 *
161 * @param string $str The dirty string
162 * @return string The same string, all shiny and cleaned-up
163 */
function cleanup( $str ) {
	$byte_count = strlen( $str );

	// Length of the leading run of allowed ASCII bytes (printable chars,
	// \t, \n and \r), listed by decreasing frequency in latin text
	$clean_len = strspn( $str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D" );

	if( $clean_len < $byte_count ) {
		// Only utfCheckNFC is tested; utfCanonicalDecomp is assumed to be
		// loaded along with it
		if( !isset( $GLOBALS['utfCheckNFC'] ) ) {
			include( 'UtfNormalData.inc' );
		}

		// Control bytes 0x00..0x1F other than \t, \n and \r are turned into
		// 0xFF, a byte that is illegal in UTF-8, so that the recomposition
		// pass swaps them for a replacement char
		$masked = strtr(
			$str,
			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
			"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
		);

		return UtfNormal::recompose( $masked, $clean_len, $byte_count, $GLOBALS['utfCheckNFC'], $GLOBALS['utfCanonicalDecomp'] );
	}

	// Nothing but allowed ASCII: the string is returned as is
	return $str;
}
202
203 /**
204 * Validate and normalize a UTF string to NFC
205 *
206 * @param string $str Unchecked UTF string
207 * @return string The string, validated and in normal form
208 */
function toNFC( $str ) {
	$byte_count = strlen( $str );
	$ascii_len = strspn( $str, UTF8_ASCII_RANGE );

	if( $ascii_len < $byte_count ) {
		// Lazily load the data tables the first time they are needed
		if( !isset( $GLOBALS['utfCheckNFC'] ) ) {
			include( 'UtfNormalData.inc' );
		}

		return UtfNormal::recompose( $str, $ascii_len, $byte_count, $GLOBALS['utfCheckNFC'], $GLOBALS['utfCanonicalDecomp'] );
	}

	// Pure ASCII, no work to do
	return $str;
}
226
227 /**
228 * Validate and normalize a UTF string to NFKC
229 *
230 * @param string $str Unchecked UTF string
231 * @return string The string, validated and in normal form
232 */
function toNFKC( $str ) {
	$byte_count = strlen( $str );
	$ascii_len = strspn( $str, UTF8_ASCII_RANGE );

	if( $ascii_len < $byte_count ) {
		// Both data files are needed here; each one is loaded only once
		if( !isset( $GLOBALS['utfCheckNFKC'] ) ) {
			include( 'UtfNormalDataK.inc' );
		}
		if( !isset( $GLOBALS['utfCanonicalComp'] ) ) {
			include( 'UtfNormalData.inc' );
		}

		return UtfNormal::recompose( $str, $ascii_len, $byte_count, $GLOBALS['utfCheckNFKC'], $GLOBALS['utfCompatibilityDecomp'] );
	}

	// Pure ASCII, no work to do
	return $str;
}
253
254 /**
255 * Validate and normalize a UTF string to NFD
256 *
257 * @param string $str Unchecked UTF string
258 * @return string The string, validated and in normal form
259 */
function toNFD( $str ) {
	$byte_count = strlen( $str );
	$ascii_len = strspn( $str, UTF8_ASCII_RANGE );

	if( $ascii_len < $byte_count ) {
		// Lazily load the canonical decomposition table
		if( !isset( $GLOBALS['utfCanonicalDecomp'] ) ) {
			include( 'UtfNormalData.inc' );
		}

		return UtfNormal::decompose( $str, $ascii_len, $byte_count, $GLOBALS['utfCanonicalDecomp'] );
	}

	// Pure ASCII, no work to do
	return $str;
}
277
278 /**
279 * Validate and normalize a UTF string to NFKD
280 *
281 * @param string $str Unchecked UTF string
282 * @return string The string, validated and in normal form
283 */
function toNFKD( $str ) {
	$byte_count = strlen( $str );
	$ascii_len = strspn( $str, UTF8_ASCII_RANGE );

	if( $ascii_len < $byte_count ) {
		// Lazily load the compatibility decomposition table.
		// NOTE(review): only UtfNormalDataK.inc is included here whereas
		// toNFKC() includes UtfNormalData.inc too; confirm that decompose()
		// finds the $utfCombiningClass table it declares as global.
		if( !isset( $GLOBALS['utfCompatibilityDecomp'] ) ) {
			include( 'UtfNormalDataK.inc' );
		}

		return UtfNormal::decompose( $str, $ascii_len, $byte_count, $GLOBALS['utfCompatibilityDecomp'] );
	}

	// Pure ASCII, no work to do
	return $str;
}
301
302
303 ////////////////////////////////////////////////////////////////////////////
304 // Internal functions //
305 ////////////////////////////////////////////////////////////////////////////
306
307 /**
308 * Recompose a UTF string
309 *
310 * @param string $str Unchecked UTF string
311 * @param integer $pos Position of the first UTF char (in bytes)
312 * @param integer $len Length of the string (in bytes)
313 * @param array $qc Quick-check array, passed by reference but never modified
314 * @param array $decomp_map Decomposition mapping, passed by reference but never modified
315 * @return string The string, validated and recomposed
316 *
317 * @access private
318 */
function recompose( $str, $pos, $len, &$qc, &$decomp_map ) {
	/**
	 * Walks $str from byte offset $pos to $len. Ill-formed or forbidden
	 * sequences are replaced with UTF8_REPLACEMENT; characters listed in $qc
	 * and combining characters found out of canonical order are decomposed
	 * through $decomp_map, sorted by combining class and composed again.
	 *
	 * $tmp accumulates the corrected output, $tmp_pos is the offset in $str
	 * up to which $tmp is complete. While nothing had to be fixed $tmp_pos
	 * stays at 0 and the original string is returned unchanged.
	 *
	 * Callers guarantee $pos < $len on entry.
	 */
	global $utfCombiningClass, $utfCanonicalComp, $utfJamoType, $utfJamoIndex;

	/**
	 * Buffer the last ASCII char before the UTF-8 stuff if applicable
	 */
	$tmp = '';
	$i = $tmp_pos = $last_cc = 0;

	if( $pos ) {
		$buffer = array( ++$i => $str[$pos - 1] );
	} else {
		$buffer = array();
	}

	/**
	 * UTF char length array
	 *
	 * This array is used to determine the length of a UTF character. Let $c
	 * be the result of ($str[$pos] & "\xF0") --where $str is the string we
	 * are operating on and $pos the position of the cursor--, if
	 * $utf_len_mask[$c] does not exist, the byte is an ASCII char. Otherwise,
	 * if $utf_len_mask[$c] is greater than 0, we have the leading byte of a
	 * multibyte character whose length is $utf_len_mask[$c] and if it is
	 * equal to 0, the byte is a trailing byte.
	 */
	$utf_len_mask = array(
		/**
		 * Leading bytes masks
		 */
		"\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,

		/**
		 * Trailing bytes masks
		 */
		"\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
	);

	/**
	 * Leading bytes that trigger extra checks (surrogates, non-characters,
	 * overlong encodings, values out of range)
	 */
	$extra_check = array(
		"\xED"=>1, "\xEF"=>1, "\xC0"=>1, "\xC1"=>1, "\xE0"=>1, "\xF0"=>1,
		"\xF4"=>1, "\xF5"=>1, "\xF6"=>1, "\xF7"=>1, "\xF8"=>1, "\xF9"=>1,
		"\xFA"=>1, "\xFB"=>1, "\xFC"=>1, "\xFD"=>1, "\xFE"=>1, "\xFF"=>1
	);

	/**
	 * A sequence of length n is well-formed when
	 * ( sequence & $utf_validation_mask[n] ) == $utf_validation_check[n]
	 */
	$utf_validation_mask = array(
		2 => "\xE0\xC0",
		3 => "\xF0\xC0\xC0",
		4 => "\xF8\xC0\xC0\xC0"
	);

	$utf_validation_check = array(
		2 => "\xC0\x80",
		3 => "\xE0\x80\x80",
		4 => "\xF0\x80\x80\x80"
	);

	////////////////////////////////////////////////////////////////////////
	// Main loop                                                          //
	////////////////////////////////////////////////////////////////////////

	do {
		////////////////////////////////////////////////////////////////////
		// STEP 0: Capture the current char and buffer it                 //
		////////////////////////////////////////////////////////////////////

		$c = $str[$pos];
		$c_mask = $c & "\xF0";

		if( isset( $utf_len_mask[$c_mask] ) ) {
			/**
			 * Byte at $pos is either a leading byte or a misplaced trailing byte
			 */
			if( $utf_len = $utf_len_mask[$c_mask] ) {
				/**
				 * Capture the char
				 */
				$buffer[++$i & 7] = $utf_char = substr( $str, $pos, $utf_len );

				/**
				 * Let's find out if a thorough check is needed
				 */
				if( isset( $qc[$utf_char] ) ) {
					/**
					 * If the UTF char is in the qc array then it may not be in normal
					 * form. We do nothing here, the actual processing is below this
					 * "if" block
					 */
				} elseif( isset( $utfCombiningClass[$utf_char] ) ) {
					if( $utfCombiningClass[$utf_char] < $last_cc ) {
						/**
						 * A combining character that is NOT canonically ordered
						 */
					} else {
						/**
						 * A combining character that IS canonically ordered, skip
						 * to the next char
						 */
						$last_cc = $utfCombiningClass[$utf_char];

						$pos += $utf_len;
						continue;
					}
				} else {
					/**
					 * At this point, $utf_char holds a UTF char that we know
					 * is not a NF[K]C_QC and is not a combining character. It can
					 * be a singleton, a canonical composite, a replacement char or
					 * even an ill-formed bunch of bytes. Let's find out
					 */
					$last_cc = 0;

					/**
					 * Check that we have the correct number of trailing bytes
					 */
					if( ( $utf_char & $utf_validation_mask[$utf_len] ) != $utf_validation_check[$utf_len] ) {
						/**
						 * Current char isn't well-formed or legal: either one or
						 * several trailing bytes are missing, or the Unicode char
						 * has been encoded in a five- or six- byte sequence
						 *
						 * The three cases below are mutually exclusive and must
						 * stay chained with "elseif": 0xF8..0xFB is followed by
						 * up to 4 trailing bytes, 0xFC..0xFD by up to 5 and
						 * 0xFE..0xFF by none.
						 */
						if( $utf_char[0] >= "\xF8" ) {
							if( $utf_char[0] < "\xFC" ) {
								$trailing_bytes = 4;
							} elseif( $utf_char[0] > "\xFD" ) {
								$trailing_bytes = 0;
							} else {
								$trailing_bytes = 5;
							}
						} else {
							$trailing_bytes = $utf_len - 1;
						}

						/**
						 * Flush what precedes the bad sequence, emit one replacement
						 * char, then skip the leading byte (++$pos) and at most
						 * $trailing_bytes trailing bytes
						 */
						$tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
						$pos += strspn( $str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes );
						$tmp_pos = $pos;

						continue;
					}

					if( isset( $extra_check[$c] ) ) {
						switch( $c ) {
							/**
							 * Note: 0xED is quite common in Korean
							 */
							case "\xED":
								if( $utf_char >= "\xED\xA0\x80" ) {
									/**
									 * Surrogates (0xD800..0xDFFF) are not allowed in UTF-8
									 * (UTF sequence 0xEDA080..0xEDBFBF)
									 */
									$tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
									$pos += $utf_len;
									$tmp_pos = $pos;
									continue 2;
								}
								break;

							/**
							 * Note: 0xEF is quite common in Japanese
							 */
							case "\xEF":
								if( $utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF" ) {
									/**
									 * 0xFFFE and 0xFFFF are explicitly disallowed
									 * (UTF sequence 0xEFBFBE..0xEFBFBF)
									 */
									$tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
									$pos += $utf_len;
									$tmp_pos = $pos;
									continue 2;
								}
								break;

							case "\xC0":
							case "\xC1":
								if( $utf_char <= "\xC1\xBF" ) {
									/**
									 * Overlong sequence: Unicode char 0x00..0x7F encoded as a
									 * double-byte UTF char
									 */
									$tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
									$pos += $utf_len;
									$tmp_pos = $pos;
									continue 2;
								}
								break;

							case "\xE0":
								if( $utf_char <= "\xE0\x9F\xBF" ) {
									/**
									 * Unicode char 0x0000..0x07FF encoded in 3 bytes
									 */
									$tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
									$pos += $utf_len;
									$tmp_pos = $pos;
									continue 2;
								}
								break;

							case "\xF0":
								if( $utf_char <= "\xF0\x8F\xBF\xBF" ) {
									/**
									 * Unicode char 0x0000..0xFFFF encoded in 4 bytes
									 */
									$tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
									$pos += $utf_len;
									$tmp_pos = $pos;
									continue 2;
								}
								break;

							default:
								/**
								 * Five- and six- byte sequences do not need being checked for here anymore
								 */
								if( $utf_char > UTF8_MAX ) {
									/**
									 * Out of the Unicode range
									 */
									if( $utf_char[0] < "\xF8" ) {
										$trailing_bytes = 3;
									} elseif( $utf_char[0] < "\xFC" ) {
										$trailing_bytes = 4;
									} elseif( $utf_char[0] > "\xFD" ) {
										$trailing_bytes = 0;
									} else {
										$trailing_bytes = 5;
									}

									$tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . UTF8_REPLACEMENT;
									$pos += strspn( $str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes );
									$tmp_pos = $pos;
									continue 2;
								}
						}
					}

					/**
					 * The char is a valid starter, move the cursor and go on
					 */
					$pos += $utf_len;
					continue;
				}
			} else {
				/**
				 * A trailing byte came out of nowhere, we will advance the cursor
				 * and treat this byte and all following trailing bytes as if
				 * each of them was a Unicode replacement char
				 */
				$spn = strspn( $str, UTF8_TRAILING_BYTES, $pos );
				$tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . str_repeat( UTF8_REPLACEMENT, $spn );

				$pos += $spn;
				$tmp_pos = $pos;
				continue;
			}


			////////////////////////////////////////////////////////////////////
			// STEP 1: Decompose current char                                 //
			////////////////////////////////////////////////////////////////////

			/**
			 * We have found a character that is either:
			 *  - in the NFC_QC/NFKC_QC list
			 *  - a non-starter char that is not canonically ordered
			 *
			 * We are going to capture the shortest UTF sequence that satisfies
			 * these two conditions:
			 *
			 *  1 - If the sequence does not start at the beginning of the string,
			 *      it must begin with a starter, and that starter must not have the
			 *      NF[K]C_QC property equal to "MAYBE"
			 *
			 *  2 - If the sequence does not end at the end of the string, it must end
			 *      with a non-starter and be immediately followed by a starter that
			 *      is not on the QC list
			 */
			$utf_seq = array();
			$last_cc = 0;
			$lpos = $pos;
			$pos += $utf_len;

			if( isset( $decomp_map[$utf_char] ) ) {
				/**
				 * Split the decomposition into individual chars. $_utf_len is
				 * bound by reference: it is unset for an ASCII byte, otherwise
				 * it holds the byte length of the char starting at $_pos
				 */
				$_pos = 0;
				$_len = strlen( $decomp_map[$utf_char] );
				do {
					$_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "\xF0"];

					if( isset( $_utf_len ) ) {
						$utf_seq[] = substr( $decomp_map[$utf_char], $_pos, $_utf_len );
						$_pos += $_utf_len;
					} else {
						$utf_seq[] = $decomp_map[$utf_char][$_pos];
						++$_pos;
					}
				}
				while( $_pos < $_len );
			} else {
				/**
				 * The char is not decomposable
				 */
				$utf_seq = array( $utf_char );
			}


			////////////////////////////////////////////////////////////////
			// STEP 2: Capture the starter                                //
			////////////////////////////////////////////////////////////////

			/**
			 * Check out the combining class of the first character of the UTF sequence
			 */
			$k = 0;
			if( isset( $utfCombiningClass[$utf_seq[0]] ) || $qc[$utf_char] == UNICODE_QC_MAYBE ) {
				/**
				 * Not a starter, inspect previous characters
				 *
				 * The last 8 characters are kept in a buffer so that we don't have
				 * to capture them every time. This is enough for all real-life strings
				 * but even if it wasn't, we can capture characters in backward mode,
				 * although it is slower than this method.
				 *
				 * In the following loop, $j starts at the previous buffered character
				 * ($i - 1, because current character is at offset $i) and processes
				 * them in backward mode until we find a starter.
				 *
				 * $k is the index on each UTF character inside of our UTF sequence.
				 * At this time, $utf_seq contains one or more characters numbered 0 to
				 * n. $k starts at 0 and is decremented for each char we prepend, so
				 * prepended chars get negative indexes
				 */
				$starter_found = 0;
				$j_min = max(1, $i - 7 );
				for( $j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j ) {
					$utf_char = $buffer[$j & 7];
					$lpos -= strlen( $utf_char );

					if( isset( $decomp_map[$utf_char] ) ) {
						/**
						 * The char is a composite, decompose for storage
						 */
						$decomp_seq = array();
						$_pos = 0;
						$_len = strlen( $decomp_map[$utf_char] );
						do {
							$c = $decomp_map[$utf_char][$_pos];
							$_utf_len =& $utf_len_mask[$c & "\xF0"];

							if( isset( $_utf_len ) ) {
								$decomp_seq[] = substr( $decomp_map[$utf_char], $_pos, $_utf_len );
								$_pos += $_utf_len;
							} else {
								$decomp_seq[] = $c;
								++$_pos;
							}
						}
						while( $_pos < $_len );

						/**
						 * Prepend the UTF sequence with our decomposed sequence
						 */
						if( isset( $decomp_seq[1] ) ) {
							/**
							 * The char expanded into several chars
							 */
							$decomp_cnt = count( $decomp_seq );
							foreach( $decomp_seq as $decomp_i => $decomp_char ) {
								$utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;
							}
							$k -= $decomp_cnt;
						} else {
							/**
							 * Decomposed to a single char, easier to prepend
							 */
							$utf_seq[--$k] = $decomp_seq[0];
						}
					} else {
						$utf_seq[--$k] = $utf_char;
					}

					if( !isset( $utfCombiningClass[$utf_seq[$k]] ) ) {
						/**
						 * We have found our starter
						 */
						$starter_found = 1;
						break;
					}
				}

				if( !$starter_found && $lpos > $tmp_pos ) {
					/**
					 * The starter was not found in the buffer, let's rewind some more
					 */
					do {
						/**
						 * $utf_len_mask contains the masks of both leading bytes and
						 * trailing bytes. If $utf_len > 0 then it's a leading byte,
						 * otherwise it's a trailing byte.
						 */
						$c = $str[--$lpos];
						$c_mask = $c & "\xF0";

						if( isset( $utf_len_mask[$c_mask] ) ) {
							/**
							 * UTF byte
							 */
							if( $utf_len = $utf_len_mask[$c_mask] ) {
								/**
								 * UTF *leading* byte
								 */
								$utf_char = substr( $str, $lpos, $utf_len );

								if( isset( $decomp_map[$utf_char] ) ) {
									/**
									 * Decompose the character
									 */
									$decomp_seq = array();
									$_pos = 0;
									$_len = strlen( $decomp_map[$utf_char] );
									do {
										$c = $decomp_map[$utf_char][$_pos];
										$_utf_len =& $utf_len_mask[$c & "\xF0"];

										if( isset( $_utf_len ) ) {
											$decomp_seq[] = substr( $decomp_map[$utf_char], $_pos, $_utf_len );
											$_pos += $_utf_len;
										} else {
											$decomp_seq[] = $c;
											++$_pos;
										}
									}
									while( $_pos < $_len );

									/**
									 * Prepend the UTF sequence with our decomposed sequence
									 */
									if( isset( $decomp_seq[1] ) ) {
										/**
										 * The char expanded into several chars
										 */
										$decomp_cnt = count( $decomp_seq );
										foreach( $decomp_seq as $decomp_i => $utf_char ) {
											$utf_seq[$k + $decomp_i - $decomp_cnt] = $utf_char;
										}
										$k -= $decomp_cnt;
									} else {
										/**
										 * Decomposed to a single char, easier to prepend
										 */
										$utf_seq[--$k] = $decomp_seq[0];
									}
								} else {
									$utf_seq[--$k] = $utf_char;
								}
							}
						} else {
							/**
							 * ASCII char
							 */
							$utf_seq[--$k] = $c;
						}
					}
					while( $lpos > $tmp_pos );
				}
			}


			////////////////////////////////////////////////////////////////
			// STEP 3: Capture following combining modifiers              //
			////////////////////////////////////////////////////////////////

			while( $pos < $len ) {
				$c_mask = $str[$pos] & "\xF0";

				if( isset( $utf_len_mask[$c_mask] ) ) {
					if( $utf_len = $utf_len_mask[$c_mask] ) {
						$utf_char = substr( $str, $pos, $utf_len );
					} else {
						/**
						 * A trailing byte came out of nowhere
						 *
						 * Trailing bytes are replaced with Unicode replacement chars,
						 * we will just ignore it for now, break out of the loop
						 * as if it was a starter (replacement chars ARE starters)
						 * and let the next loop replace it
						 */
						break;
					}

					if( isset( $utfCombiningClass[$utf_char] ) || isset( $qc[$utf_char] ) ) {
						/**
						 * Combining character, add it to the sequence and move the cursor
						 */
						if( isset( $decomp_map[$utf_char] ) ) {
							/**
							 * Decompose the character
							 */
							$_pos = 0;
							$_len = strlen( $decomp_map[$utf_char] );
							do {
								$c = $decomp_map[$utf_char][$_pos];
								$_utf_len =& $utf_len_mask[$c & "\xF0"];

								if( isset( $_utf_len ) ) {
									$utf_seq[] = substr( $decomp_map[$utf_char], $_pos, $_utf_len );
									$_pos += $_utf_len;
								} else {
									$utf_seq[] = $c;
									++$_pos;
								}
							}
							while( $_pos < $_len );
						} else {
							$utf_seq[] = $utf_char;
						}

						$pos += $utf_len;
					} else {
						/**
						 * Combining class 0 and no QC, break out of the loop
						 *
						 * Note: we do not know if that character is valid. If
						 * it's not, the next iteration will replace it
						 */
						break;
					}
				} else {
					/**
					 * ASCII chars are starters
					 */
					break;
				}
			}


			////////////////////////////////////////////////////////////////
			// STEP 4: Sort and combine                                   //
			////////////////////////////////////////////////////////////////

			/**
			 * Here we sort...
			 */
			$k_max = $k + count( $utf_seq );
			if( !$k && $k_max == 1 ) {
				/**
				 * There is only one char in the UTF sequence, add it then
				 * jump to the next iteration of main loop
				 *
				 * Note: under PHP5 the two statements below could be wrapped in
				 *   if( substr_compare( $str, $utf_seq[0], $lpos, $pos - $lpos ) )
				 * for a very small performance gain in most cases
				 */
				$tmp .= substr( $str, $tmp_pos, $lpos - $tmp_pos ) . $utf_seq[0];
				$tmp_pos = $pos;

				continue;
			}

			/**
			 * ...there we combine
			 */
			if( isset( $utfCombiningClass[$utf_seq[$k]] ) ) {
				$starter = $nf_seq = '';
			} else {
				$starter = $utf_seq[$k++];
				$nf_seq = '';
			}
			$utf_sort = array();

			/**
			 * We add an empty char at the end of the UTF char sequence.
			 * It will act as a starter and trigger the sort/combine routine
			 * at the end of the string without altering it
			 */
			$utf_seq[] = '';

			do {
				$utf_char = $utf_seq[$k++];

				if( isset( $utfCombiningClass[$utf_char] ) ) {
					/**
					 * Combining chars are grouped by combining class, keeping
					 * their relative order inside each class
					 */
					$utf_sort[$utfCombiningClass[$utf_char]][] = $utf_char;
				} else {
					if( empty( $utf_sort ) ) {
						/**
						 * No combining characters... check for a composite
						 * of the two starters
						 */
						if( isset( $utfCanonicalComp[$starter . $utf_char] ) ) {
							/**
							 * Good ol' composite character
							 */
							$starter = $utfCanonicalComp[$starter . $utf_char];
						} elseif( isset( $utfJamoType[$utf_char] ) ) {
							/**
							 * Current char is a composable jamo
							 */
							if( isset( $utfJamoType[$starter] )
							 && $utfJamoType[$starter] == UNICODE_JAMO_L
							 && $utfJamoType[$utf_char] == UNICODE_JAMO_V ) {
								/**
								 * We have a L jamo followed by a V jamo, we are going
								 * to prefetch the next char to see if it's a T jamo
								 */
								if( isset( $utfJamoType[$utf_seq[$k]] ) && $utfJamoType[$utf_seq[$k]] == UNICODE_JAMO_T ) {
									/**
									 * L+V+T jamos, combine to a LVT Hangul syllable
									 * ($k is incremented)
									 */
									$cp = $utfJamoIndex[$starter] + $utfJamoIndex[$utf_char] + $utfJamoIndex[$utf_seq[$k]];

									++$k;
								} else {
									/**
									 * L+V jamos, combine to a LV Hangul syllable
									 */
									$cp = $utfJamoIndex[$starter] + $utfJamoIndex[$utf_char];
								}

								/**
								 * Encode the resulting code point as a 3-byte UTF-8 char
								 */
								$starter = chr( 0xE0 | ( $cp >> 12 ) ) . chr( 0x80 | ( ( $cp >> 6 ) & 0x3F ) ) . chr( 0x80 | ( $cp & 0x3F ) );
							} else {
								/**
								 * Non-composable jamo, just add it to the sequence
								 */
								$nf_seq .= $starter;
								$starter = $utf_char;
							}
						} else {
							/**
							 * No composite, just add the first starter to the sequence
							 * then continue with the other one
							 */
							$nf_seq .= $starter;
							$starter = $utf_char;
						}
					} else {
						ksort( $utf_sort );

						/**
						 * For each class of combining characters
						 */
						foreach( $utf_sort as $cc => $utf_chars ) {
							$j = 0;

							do {
								/**
								 * Look for a composite
								 */
								if( isset( $utfCanonicalComp[$starter . $utf_chars[$j]] ) ) {
									/**
									 * Found a composite, replace the starter
									 */
									$starter = $utfCanonicalComp[$starter . $utf_chars[$j]];
									unset( $utf_sort[$cc][$j] );
								} else {
									/**
									 * No composite, all following characters in that
									 * class are blocked
									 */
									break;
								}
							}
							while( isset( $utf_sort[$cc][++$j] ) );
						}

						/**
						 * Add the starter to the normalized sequence, followed by
						 * non-starters in canonical order
						 */
						$nf_seq .= $starter;
						foreach( $utf_sort as $utf_chars ) {
							if( !empty( $utf_chars ) ) {
								$nf_seq .= implode( '', $utf_chars );
							}
						}

						/**
						 * Reset the array and go on
						 */
						$utf_sort = array();
						$starter = $utf_char;
					}
				}
			}
			while( $k <= $k_max );

			$tmp .= substr( $str, $tmp_pos, $lpos - $tmp_pos ) . $nf_seq;
			$tmp_pos = $pos;
		} else {
			/**
			 * Only an ASCII char can make the program get here
			 *
			 * First we skip the current byte with ++$pos, then we quickly
			 * skip following ASCII chars with strspn().
			 *
			 * The first two "if"'s here can be removed, with the consequences
			 * of being faster on latin text (lots of ASCII) and slower on
			 * multi-byte text (where the only ASCII chars are spaces and punctuation)
			 */
			if( ++$pos != $len ) {
				if( $str[$pos] < "\x80" ) {
					$pos += strspn( $str, UTF8_ASCII_RANGE, ++$pos );
					$buffer[++$i & 7] = $str[$pos - 1];
				} else {
					$buffer[++$i & 7] = $c;
				}
			}
		}
	}
	while( $pos < $len );

	/**
	 * Now is time to return the string
	 */
	if( $tmp_pos ) {
		/**
		 * If the $tmp_pos cursor is not at the beginning of the string then at least
		 * one character was not in normal form. Replace $str with the fixed version
		 */
		if( $tmp_pos == $len ) {
			/**
			 * The $tmp_pos cursor is at the end of $str, therefore $tmp holds the
			 * whole $str
			 */
			return $tmp;
		} else {
			/**
			 * The rightmost chunk of $str has not been appended to $tmp yet
			 */
			return $tmp . substr( $str, $tmp_pos );
		}
	}

	/**
	 * The string was already in normal form
	 */
	return $str;
}
1060
/**
 * Decompose a UTF string
 *
 * Walks $str from byte offset $pos, replacing every character found in
 * $decomp_map by its mapping, expanding Hangul syllables arithmetically
 * into Jamos, reordering runs of combining characters by combining class,
 * and substituting UTF8_REPLACEMENT for malformed or illegal byte sequences.
 *
 * The input is copied lazily: $tmp only receives bytes when something has
 * to change. Cursor roles:
 * - $tmp_pos:     offset in $str up to which the input is already in $tmp
 * - $starter_pos: offset just past the last starter, i.e. where the current
 *                 run of combining characters (buffered in $utf_sort) began
 * - $dump:        non-zero when the buffered combiners must be written out
 *                 from $utf_sort instead of being copied verbatim from $str
 * - $sort:        non-zero when the buffered combiners are out of order
 *
 * @param string  $str        UTF string
 * @param integer $pos        Position of the first UTF char (in bytes); must be lower than $len
 * @param integer $len        Length of the string (in bytes)
 * @param array   $decomp_map Decomposition mapping, passed by reference but never modified
 * @return string The string, decomposed and sorted canonically
 *
 * @access private
 */
function decompose( $str, $pos, $len, &$decomp_map ) {
	global $utfCombiningClass;

	/**
	 * UTF char length array, indexed by the high nibble of a byte.
	 * High nibbles 0x0..0x7 are absent on purpose: those bytes are ASCII
	 */
	$utf_len_mask = array(
		/**
		 * Leading bytes masks
		 */
		"\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,

		/**
		 * Trailing bytes masks
		 */
		"\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
	);

	/**
	 * Some extra checks are triggered on the first byte of a UTF sequence
	 */
	$extra_check = array(
		"\xED"=>1, "\xEF"=>1, "\xC0"=>1, "\xC1"=>1, "\xE0"=>1, "\xF0"=>1,
		"\xF4"=>1, "\xF5"=>1, "\xF6"=>1, "\xF7"=>1, "\xF8"=>1, "\xF9"=>1,
		"\xFA"=>1, "\xFB"=>1, "\xFC"=>1, "\xFD"=>1, "\xFE"=>1, "\xFF"=>1
	);

	/**
	 * These masks are used to check if a UTF sequence is well formed.
	 * Here are the only 3 lengths we acknowledge:
	 *   - 2-byte: 110? ???? 10?? ????
	 *   - 3-byte: 1110 ???? 10?? ???? 10?? ????
	 *   - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????
	 *
	 * Note that 5- and 6- byte sequences are automatically discarded
	 */
	$utf_validation_mask = array(
		2 => "\xE0\xC0",
		3 => "\xF0\xC0\xC0",
		4 => "\xF8\xC0\xC0\xC0"
	);
	$utf_validation_check = array(
		2 => "\xC0\x80",
		3 => "\xE0\x80\x80",
		4 => "\xF0\x80\x80\x80"
	);

	$tmp = '';
	$starter_pos = $pos;
	$tmp_pos = $last_cc = $sort = $dump = 0;
	$utf_sort = array();


	////////////////////////////////////////////////////////////////////////
	// Main loop                                                          //
	////////////////////////////////////////////////////////////////////////

	do {
		////////////////////////////////////////////////////////////////////
		// STEP 0: Capture the current char                               //
		////////////////////////////////////////////////////////////////////

		$cur_mask = $str[$pos] & "\xF0";

		if( isset( $utf_len_mask[$cur_mask] ) ) {
			$utf_len = $utf_len_mask[$cur_mask];

			if( $utf_len ) {
				/**
				 * Multibyte char. Near the end of $str the captured char may be
				 * shorter than $utf_len; the validation below rejects it
				 */
				$utf_char = substr( $str, $pos, $utf_len );
				$pos += $utf_len;
			} else {
				/**
				 * A trailing byte came out of nowhere, we will treat it and all
				 * following trailing bytes as if each of them was a Unicode
				 * replacement char and we will advance the cursor
				 */
				$spn = strspn( $str, UTF8_TRAILING_BYTES, $pos );

				if( $dump ) {
					$tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );

					/**
					 * Dump combiners
					 */
					if( !empty( $utf_sort ) ) {
						if( $sort ) {
							ksort( $utf_sort );
						}

						foreach( $utf_sort as $utf_chars ) {
							$tmp .= implode( '', $utf_chars );
						}
					}

					$tmp .= str_repeat( UTF8_REPLACEMENT, $spn );
					$dump = $sort = 0;
				} else {
					$tmp .= substr( $str, $tmp_pos, $pos - $tmp_pos ) . str_repeat( UTF8_REPLACEMENT, $spn );
				}

				$pos += $spn;
				$tmp_pos = $starter_pos = $pos;

				$utf_sort = array();
				$last_cc = 0;

				continue;
			}


			////////////////////////////////////////////////////////////////////
			// STEP 1: Decide what to do with current char                    //
			////////////////////////////////////////////////////////////////////

			/**
			 * Now, in that order:
			 *  - check if that character is decomposable
			 *  - check if that character is a non-starter
			 *  - check if that character requires extra checks to be performed
			 */
			if( isset( $decomp_map[$utf_char] ) ) {
				/**
				 * Decompose the char
				 */
				$decomp = $decomp_map[$utf_char];
				$_pos = 0;
				$_len = strlen( $decomp );

				/**
				 * Offset in $str where the char being decomposed begins.
				 * Everything before it that is not in $tmp yet can be copied
				 * verbatim as long as no dump is pending
				 */
				$char_start = $pos - $utf_len;

				do {
					$c = $decomp[$_pos];
					$_mask = $c & "\xF0";

					/**
					 * Plain lookup instead of a reference: binding a reference
					 * to a missing key would insert that key into $utf_len_mask
					 */
					if( isset( $utf_len_mask[$_mask] ) ) {
						$_utf_len = $utf_len_mask[$_mask];
						$_utf_char = substr( $decomp, $_pos, $_utf_len );
						$_pos += $_utf_len;
						$is_combiner = isset( $utfCombiningClass[$_utf_char] );
					} else {
						/**
						 * ASCII char, which is a starter
						 */
						$_utf_char = $c;
						++$_pos;
						$is_combiner = false;
					}

					if( $is_combiner ) {
						/**
						 * The character decomposed to a non-starter, buffer it for sorting
						 */
						$_cc = $utfCombiningClass[$_utf_char];
						$utf_sort[$_cc][] = $_utf_char;

						/**
						 * The combiner comes from a mapping, not from $str, so
						 * the buffer must be dumped in any case
						 */
						$dump = 1;

						if( $_cc < $last_cc ) {
							/**
							 * Not canonically ordered, will require sorting
							 */
							$sort = 1;
						} else {
							$last_cc = $_cc;
						}
					} else {
						/**
						 * This character decomposition contains a starter,
						 * dump the buffer and continue
						 */
						if( $dump ) {
							$tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );

							/**
							 * Dump combiners
							 */
							if( !empty( $utf_sort ) ) {
								if( $sort ) {
									ksort( $utf_sort );
								}

								foreach( $utf_sort as $utf_chars ) {
									$tmp .= implode( '', $utf_chars );
								}
							}

							$dump = $sort = 0;
						} elseif( $tmp_pos < $char_start ) {
							/**
							 * Nothing to reorder: copy the pending input, including
							 * any in-order combiners, up to the decomposed char.
							 * The guard matters for the second and later starters
							 * of a same mapping, where $tmp_pos is already past
							 * $char_start and the length would be negative
							 */
							$tmp .= substr( $str, $tmp_pos, $char_start - $tmp_pos );
						}

						$tmp .= $_utf_char;

						$tmp_pos = $starter_pos = $pos;
						$utf_sort = array();
						$last_cc = 0;
					}
				}
				while( $_pos < $_len );
			} elseif( isset( $utfCombiningClass[$utf_char] ) ) {
				/**
				 * Combining character
				 */
				$cc = $utfCombiningClass[$utf_char];

				if( $cc < $last_cc ) {
					/**
					 * Not in canonical order
					 */
					$sort = $dump = 1;
				} else {
					$last_cc = $cc;
				}

				$utf_sort[$cc][] = $utf_char;
			} else {
				/**
				 * Non-decomposable starter. Check first that we have the correct
				 * number of trailing bytes; this must happen before the Hangul
				 * range test, which only compares raw bytes and would otherwise
				 * accept malformed sequences such as "\xEC\x41\x42"
				 */
				if( ( $utf_char & $utf_validation_mask[$utf_len] ) !== $utf_validation_check[$utf_len] ) {
					/**
					 * Current char isn't well-formed or legal: either one or
					 * several trailing bytes are missing, or the Unicode char
					 * has been encoded in a five- or six- byte sequence
					 */
					$lead = $utf_char[0];

					if( $lead < "\xF8" ) {
						$trailing_bytes = $utf_len - 1;
					} elseif( $lead < "\xFC" ) {
						/**
						 * Five-byte sequence
						 */
						$trailing_bytes = 4;
					} elseif( $lead > "\xFD" ) {
						/**
						 * 0xFE and 0xFF never start a sequence
						 */
						$trailing_bytes = 0;
					} else {
						/**
						 * Six-byte sequence
						 */
						$trailing_bytes = 5;
					}

					/**
					 * Move the cursor back to its original position then advance
					 * it to the position it should be at
					 */
					$pos -= $utf_len;
					$tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );

					if( !empty( $utf_sort ) ) {
						ksort( $utf_sort );

						foreach( $utf_sort as $utf_chars ) {
							$tmp .= implode( '', $utf_chars );
						}
						$utf_sort = array();
					}

					$tmp .= UTF8_REPLACEMENT;
					$dump = $sort = 0;

					/**
					 * Skip the leading byte, then at most $trailing_bytes trailing bytes
					 */
					++$pos;
					$pos += strspn( $str, UTF8_TRAILING_BYTES, $pos, $trailing_bytes );

					/**
					 * Both cursors must move: leaving $starter_pos behind $tmp_pos
					 * would give the next flush a negative length
					 */
					$tmp_pos = $starter_pos = $pos;
					$last_cc = 0;
					continue;
				}

				if( $utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST ) {
					/**
					 * Nope, regular UTF char
					 */
					if( isset( $extra_check[$utf_char[0]] ) ) {
						switch( $utf_char[0] ) {
							/**
							 * Note: 0xED is quite common in Korean
							 *
							 * Surrogates (0xD800..0xDFFF) are not allowed in UTF-8
							 * (UTF sequence 0xEDA080..0xEDBFBF)
							 */
							case "\xED":
								$illegal = ( $utf_char >= "\xED\xA0\x80" );
								break;

							/**
							 * Note: 0xEF is quite common in Japanese
							 *
							 * 0xFFFE and 0xFFFF are explicitly disallowed
							 * (UTF sequence 0xEFBFBE..0xEFBFBF)
							 */
							case "\xEF":
								$illegal = ( $utf_char === "\xEF\xBF\xBE" || $utf_char === "\xEF\xBF\xBF" );
								break;

							/**
							 * Overlong sequence: Unicode char 0x00..0x7F encoded as a
							 * double-byte UTF char
							 */
							case "\xC0":
							case "\xC1":
								$illegal = ( $utf_char <= "\xC1\xBF" );
								break;

							/**
							 * Unicode char 0x0000..0x07FF encoded in 3 bytes
							 */
							case "\xE0":
								$illegal = ( $utf_char <= "\xE0\x9F\xBF" );
								break;

							/**
							 * Unicode char 0x0000..0xFFFF encoded in 4 bytes
							 */
							case "\xF0":
								$illegal = ( $utf_char <= "\xF0\x8F\xBF\xBF" );
								break;

							/**
							 * Out of the Unicode range
							 */
							default:
								$illegal = ( $utf_char > UTF8_MAX );
						}

						if( $illegal ) {
							$tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );

							if( !empty( $utf_sort ) ) {
								ksort( $utf_sort );

								foreach( $utf_sort as $utf_chars ) {
									$tmp .= implode( '', $utf_chars );
								}
								$utf_sort = array();
							}

							$tmp .= UTF8_REPLACEMENT;
							$dump = $sort = 0;

							$tmp_pos = $starter_pos = $pos;
							$last_cc = 0;
							continue;
						}
					}
				} else {
					/**
					 * Hangul syllable
					 */
					$idx = ( ( ( ord( $utf_char[0] ) & 0x0F ) << 12 ) | ( ( ord( $utf_char[1] ) & 0x3F ) << 6 ) | ( ord( $utf_char[2] ) & 0x3F ) ) - UNICODE_HANGUL_SBASE;

					/**
					 * LIndex can only range from 0 to 18, therefore it cannot influence
					 * the first two bytes of the L Jamo, which allows us to hardcode
					 * them (based on LBase).
					 *
					 * The same goes for VIndex, but for TIndex there's a catch: the value
					 * of the third byte could exceed 0xBF and we would have to increment
					 * the second byte
					 */
					if( $tIndex = $idx % UNICODE_HANGUL_TCOUNT ) {
						if( $tIndex < 25 ) {
							$utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";
							$utf_char[8] = chr( 0xA7 + $tIndex );
						} else {
							$utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";
							$utf_char[8] = chr( 0x67 + $tIndex );
						}
					} else {
						$utf_char = "\xE1\x84\x00\xE1\x85\x00";
					}

					$utf_char[2] = chr( 0x80 + ( int ) ( $idx / UNICODE_HANGUL_NCOUNT ) );
					$utf_char[5] = chr( 0xA1 + ( int ) ( ( $idx % UNICODE_HANGUL_NCOUNT ) / UNICODE_HANGUL_TCOUNT ) );

					/**
					 * Just like other decompositions, the resulting Jamos must
					 * be dumped to the tmp string
					 */
					$dump = 1;
				}

				/**
				 * Do we need to dump stuff to the tmp string?
				 */
				if( $dump ) {
					$tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );

					/**
					 * Dump combiners
					 */
					if( !empty( $utf_sort ) ) {
						if( $sort ) {
							ksort( $utf_sort );
						}

						foreach( $utf_sort as $utf_chars ) {
							$tmp .= implode( '', $utf_chars );
						}
					}

					$tmp .= $utf_char;
					$dump = $sort = 0;
					$tmp_pos = $pos;
				}

				$last_cc = 0;
				$utf_sort = array();
				$starter_pos = $pos;
			}
		} else {
			/**
			 * ASCII char, which happens to be a starter (as any other ASCII char)
			 */
			if( $dump ) {
				$tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );

				/**
				 * Dump combiners
				 */
				if( !empty( $utf_sort ) ) {
					if( $sort ) {
						ksort( $utf_sort );
					}

					foreach( $utf_sort as $utf_chars ) {
						$tmp .= implode( '', $utf_chars );
					}
				}

				$tmp .= $str[$pos];
				$dump = $sort = 0;
				$tmp_pos = ++$pos;
			} else {
				++$pos;
			}

			/**
			 * Skip the rest of the ASCII run in one go
			 */
			$pos += strspn( $str, UTF8_ASCII_RANGE, $pos );

			$last_cc = 0;
			$utf_sort = array();
			$starter_pos = $pos;
		}
	}
	while( $pos < $len );

	/**
	 * Now is time to return the string
	 */
	if( $dump ) {
		$tmp .= substr( $str, $tmp_pos, $starter_pos - $tmp_pos );

		/**
		 * Dump combiners
		 */
		if( !empty( $utf_sort ) ) {
			if( $sort ) {
				ksort( $utf_sort );
			}

			foreach( $utf_sort as $utf_chars ) {
				$tmp .= implode( '', $utf_chars );
			}
		}

		return $tmp;
	}

	if( $tmp_pos ) {
		/**
		 * If the $tmp_pos cursor was moved then at least one character was not in
		 * normal form. Replace $str with the fixed version
		 */
		if( $tmp_pos == $len ) {
			/**
			 * The $tmp_pos cursor is at the end of $str, therefore $tmp holds
			 * the whole $str
			 */
			return $tmp;
		}

		/**
		 * The rightmost chunk of $str has not been appended to $tmp yet
		 */
		return $tmp . substr( $str, $tmp_pos );
	}

	/**
	 * The string was already in normal form
	 */
	return $str;
}
1657 }
1658
1659 }