Update formatting of includes/normal/
[lhc/web/wiklou.git] / includes / normal / UtfNormal.php
1 <?php
2 /**
3 * Unicode normalization routines
4 *
5 * Copyright © 2004 Brion Vibber <brion@pobox.com>
6 * https://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 * @ingroup UtfNormal
25 */
26
27 /**
28 * @defgroup UtfNormal UtfNormal
29 */
30
# Record once, at load time, which native normalization backends exist.
# NORMALIZE_INTL is true when normalizer_normalize() is callable,
# NORMALIZE_ICU is true when utf8_normalize() is callable.
define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );
define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
33
34 /**
35 * Unicode normalization routines for working with UTF-8 strings.
36 * Currently assumes that input strings are valid UTF-8!
37 *
38 * Not as fast as I'd like, but should be usable for most purposes.
39 * UtfNormal::toNFC() will bail early if given ASCII text or text
40 * it can quickly determine is already normalized.
41 *
42 * All functions can be called static.
43 *
44 * See description of forms at http://www.unicode.org/reports/tr15/
45 *
46 * @ingroup UtfNormal
47 */
48 class UtfNormal {
/**
 * Normalization mode identifiers handed to utf8_normalize()
 * when the ICU wrapper is in use.
 */
const UNORM_NONE = 1;
const UNORM_NFD = 2;
const UNORM_NFKD = 3;
const UNORM_NFC = 4;
const UNORM_NFKC = 5;
const UNORM_FCD = 6;
const UNORM_DEFAULT = self::UNORM_NFC;

# Lookup tables used by the pure-PHP code paths. They start out null;
# loadData() pulls in UtfNormalData.inc while $utfCombiningClass is unset
# (presumably that file fills these in -- it is not visible from here).
public static $utfCombiningClass = null;
public static $utfCanonicalComp = null;
public static $utfCanonicalDecomp = null;

# Compatibility decompositions are only loaded on demand, by NFKD().
public static $utfCompatibilityDecomp = null;

# Quick-check table consulted by quickIsNFC() and quickIsNFCVerify().
public static $utfCheckNFC;
68
/**
 * One-stop convenience entry point: repair invalid UTF-8 sequences and
 * return the text in normal form C (canonical composition).
 *
 * Pure ASCII input returns quickly, and strings made only of known-good
 * characters get some lesser shortcuts. Not as fast as toNFC().
 *
 * @param string $string a UTF-8 string
 * @return string a clean, shiny, normalized UTF-8 string
 */
static function cleanUp( $string ) {
	if ( NORMALIZE_ICU ) {
		$prepared = self::replaceForNativeNormalize( $string );

		# UnicodeString constructor fails if the string ends with a
		# head byte. Append a junk char, then strip it off the result.
		$normalized = utf8_normalize( $prepared . "\x01", self::UNORM_NFC );

		return rtrim( $normalized, "\x01" );
	}

	if ( NORMALIZE_INTL ) {
		$string = self::replaceForNativeNormalize( $string );
		$normalized = normalizer_normalize( $string, Normalizer::FORM_C );
		if ( $normalized !== null && $normalized !== false ) {
			return $normalized;
		}

		# normalizer_normalize() reports invalid UTF-8 with false or null
		# (depending on which doc you read). quickIsNFCVerify() repairs
		# the invalid sequences in $string in place.
		if ( self::quickIsNFCVerify( $string ) ) {
			# The repaired string is already normal.
			return $string;
		}

		# Now valid but not yet normal: let the native function retry.
		return normalizer_normalize( $string, Normalizer::FORM_C );
	}

	# No native backend. quickIsNFCVerify() has the side effect of
	# cleaning up UTF-8 errors in $string.
	if ( self::quickIsNFCVerify( $string ) ) {
		return $string;
	}

	return self::NFC( $string );
}
111
/**
 * Convert a UTF-8 string to normal form C, canonical composition.
 *
 * Uses a native normalizer when one is available. Otherwise strings that
 * quickIsNFC() accepts (pure ASCII, or only known-good characters) are
 * returned untouched and everything else goes through the PHP code.
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form C
 */
static function toNFC( $string ) {
	if ( NORMALIZE_INTL ) {
		return normalizer_normalize( $string, Normalizer::FORM_C );
	}
	if ( NORMALIZE_ICU ) {
		return utf8_normalize( $string, self::UNORM_NFC );
	}

	return self::quickIsNFC( $string ) ? $string : self::NFC( $string );
}
130
/**
 * Convert a UTF-8 string to normal form D, canonical decomposition.
 *
 * Uses a native normalizer when one is available. Otherwise pure ASCII
 * strings are returned as they are and anything else is decomposed by
 * the PHP implementation.
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form D
 */
static function toNFD( $string ) {
	if ( NORMALIZE_INTL ) {
		return normalizer_normalize( $string, Normalizer::FORM_D );
	}
	if ( NORMALIZE_ICU ) {
		return utf8_normalize( $string, self::UNORM_NFD );
	}

	# Only strings containing a non-ASCII byte need any work.
	return preg_match( '/[\x80-\xff]/', $string ) ? self::NFD( $string ) : $string;
}
148
/**
 * Convert a UTF-8 string to normal form KC, compatibility composition.
 * This may cause irreversible information loss, use judiciously.
 *
 * Uses a native normalizer when one is available. Otherwise pure ASCII
 * strings are returned as they are and anything else is handled by the
 * PHP implementation.
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form KC
 */
static function toNFKC( $string ) {
	if ( NORMALIZE_INTL ) {
		return normalizer_normalize( $string, Normalizer::FORM_KC );
	}
	if ( NORMALIZE_ICU ) {
		return utf8_normalize( $string, self::UNORM_NFKC );
	}

	# Only strings containing a non-ASCII byte need any work.
	return preg_match( '/[\x80-\xff]/', $string ) ? self::NFKC( $string ) : $string;
}
167
/**
 * Convert a UTF-8 string to normal form KD, compatibility decomposition.
 * This may cause irreversible information loss, use judiciously.
 *
 * Uses a native normalizer when one is available. Otherwise pure ASCII
 * strings are returned as they are and anything else is handled by the
 * PHP implementation.
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form KD
 */
static function toNFKD( $string ) {
	if ( NORMALIZE_INTL ) {
		return normalizer_normalize( $string, Normalizer::FORM_KD );
	}
	if ( NORMALIZE_ICU ) {
		return utf8_normalize( $string, self::UNORM_NFKD );
	}

	# Only strings containing a non-ASCII byte need any work.
	return preg_match( '/[\x80-\xff]/', $string ) ? self::NFKD( $string ) : $string;
}
186
/**
 * Pull in the basic composition data file unless the combining class
 * table is already set.
 * @private
 */
static function loadData() {
	if ( isset( self::$utfCombiningClass ) ) {
		# Table already present; nothing to load.
		return;
	}

	require_once __DIR__ . '/UtfNormalData.inc';
}
196
/**
 * Cheap test for "this string is certainly already in NFC".
 *
 * A true result is definite. A false result only means the slow path
 * has to decide: the string either is not normalized or might not be.
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return bool
 */
static function quickIsNFC( $string ) {
	# Pure ASCII is always valid NFC, so let it straight through.
	if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
		return true;
	}

	self::loadData();
	$total = strlen( $string );
	$pos = 0;
	while ( $pos < $total ) {
		$lead = ord( $string[$pos] );
		if ( $lead < 0x80 ) {
			# ASCII byte, nothing to look up.
			$pos++;
			continue;
		}

		# The lead byte tells how many bytes the character occupies.
		if ( $lead >= 0xf0 ) {
			$width = 4;
		} elseif ( $lead >= 0xe0 ) {
			$width = 3;
		} elseif ( $lead >= 0xc0 ) {
			$width = 2;
		} else {
			$width = 1;
		}
		$char = substr( $string, $pos, $width );
		$pos += $width;

		# Listed in the quick-check table (NO or MAYBE), or a combining
		# character that might need sorting: bail and do the slow check.
		if ( isset( self::$utfCheckNFC[$char] ) || isset( self::$utfCombiningClass[$char] ) ) {
			return false;
		}
	}

	return true;
}
237
/**
 * Returns true if the string is _definitely_ in NFC.
 * Returns false if not or uncertain.
 *
 * The argument is taken by reference and rewritten in place: some control
 * characters and every invalid UTF-8 sequence found are replaced with
 * UTF8_REPLACEMENT (or dropped, for stray tail bytes that continue an
 * already-reported broken sequence).
 *
 * @param string $string a UTF-8 string, altered on output to be valid UTF-8 safe for XML.
 * @return bool
 */
static function quickIsNFCVerify( &$string ) {
	# Screen out some characters that eg won't be allowed in XML
	$string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );

	# ASCII is always valid NFC!
	# If we're only ever given plain ASCII, we can avoid the overhead
	# of initializing the decomposition tables by skipping out early.
	if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
		return true;
	}

	# Built once per request and kept in function-static storage.
	static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
	if ( !isset( $checkit ) ) {
		# Load/build some scary lookup tables...
		self::loadData();

		$utfCheckOrCombining = array_merge( self::$utfCheckNFC, self::$utfCombiningClass );

		# Head bytes for sequences which we should do further validity checks
		$checkit = array_flip( array_map( 'chr',
			array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
				0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
				0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );

		# Each UTF-8 head byte is followed by a certain
		# number of tail bytes.
		$tailBytes = array();
		for ( $n = 0; $n < 256; $n++ ) {
			if ( $n < 0xc0 ) {
				$remaining = 0;
			} elseif ( $n < 0xe0 ) {
				$remaining = 1;
			} elseif ( $n < 0xf0 ) {
				$remaining = 2;
			} elseif ( $n < 0xf8 ) {
				$remaining = 3;
			} elseif ( $n < 0xfc ) {
				$remaining = 4;
			} elseif ( $n < 0xfe ) {
				$remaining = 5;
			} else {
				$remaining = 0;
			}
			$tailBytes[chr( $n )] = $remaining;
		}
	}

	# Chop the text into pure-ASCII and non-ASCII areas;
	# large ASCII parts can be handled much more quickly.
	# Don't chop up Unicode areas for punctuation, though,
	# that wastes energy.
	$matches = array();
	preg_match_all(
		'/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
		$string, $matches );

	$looksNormal = true;
	# Byte offset of the current chunk within $string.
	$base = 0;
	# Pending fix-ups, each array( replacement, start offset, length ).
	$replace = array();
	foreach ( $matches[1] as $str ) {
		$chunk = strlen( $str );

		if ( $str[0] < "\x80" ) {
			# ASCII chunk: guaranteed to be valid UTF-8
			# and in normal form C, so skip over it.
			$base += $chunk;
			continue;
		}

		# We'll have to examine the chunk byte by byte to ensure
		# that it consists of valid UTF-8 sequences, and to see
		# if any of them might not be normalized.
		#
		# Since PHP is not the fastest language on earth, some of
		# this code is a little ugly with inner loop optimizations.

		$head = '';
		$len = $chunk + 1; # Counting down is faster. I'm *so* sorry.

		for ( $i = -1; --$len; ) {
			$remaining = $tailBytes[$c = $str[++$i]];
			if ( $remaining ) {
				# UTF-8 head byte!
				$sequence = $head = $c;
				do {
					# Look for the defined number of tail bytes...
					if ( --$len && ( $c = $str[++$i] ) >= "\x80" && $c < "\xc0" ) {
						# Legal tail bytes are nice.
						$sequence .= $c;
					} else {
						if ( 0 == $len ) {
							# Premature end of string!
							# Drop a replacement character into output to
							# represent the invalid UTF-8 sequence.
							$replace[] = array( UTF8_REPLACEMENT,
								$base + $i + 1 - strlen( $sequence ),
								strlen( $sequence ) );
							break 2;
						} else {
							# Illegal tail byte; abandon the sequence.
							$replace[] = array( UTF8_REPLACEMENT,
								$base + $i - strlen( $sequence ),
								strlen( $sequence ) );
							# Back up and reprocess this byte; it may itself
							# be a legal ASCII or UTF-8 sequence head.
							--$i;
							++$len;
							continue 2;
						}
					}
				} while ( --$remaining );

				if ( isset( $checkit[$head] ) ) {
					# Do some more detailed validity checks, for
					# invalid characters and illegal sequences.
					if ( $head == "\xed" ) {
						# 0xed is relatively frequent in Korean, which
						# abuts the surrogate area, so we're doing
						# this check separately to speed things up.

						if ( $sequence >= UTF8_SURROGATE_FIRST ) {
							# Surrogates are legal only in UTF-16 code.
							# They are totally forbidden here in UTF-8
							# utopia.
							$replace[] = array( UTF8_REPLACEMENT,
								$base + $i + 1 - strlen( $sequence ),
								strlen( $sequence ) );
							$head = '';
							continue;
						}
					} else {
						# Slower, but rarer checks...
						$n = ord( $head );
						if (
							# "Overlong sequences" are those that are syntactically
							# correct but use more UTF-8 bytes than are necessary to
							# encode a character. Naïve string comparisons can be
							# tricked into failing to see a match for an ASCII
							# character, for instance, which can be a security hole
							# if blacklist checks are being used.
							( $n < 0xc2 && $sequence <= UTF8_OVERLONG_A )
							|| ( $n == 0xe0 && $sequence <= UTF8_OVERLONG_B )
							|| ( $n == 0xf0 && $sequence <= UTF8_OVERLONG_C )

							# U+FFFE and U+FFFF are explicitly forbidden in Unicode.
							# Both comparisons sit inside the 0xef head-byte guard.
							|| ( $n == 0xef
								&& ( $sequence == UTF8_FFFE || $sequence == UTF8_FFFF ) )

							# Unicode has been limited to 21 bits; longer
							# sequences are not allowed.
							|| ( $n >= 0xf0 && $sequence > UTF8_MAX )
						) {
							$replace[] = array( UTF8_REPLACEMENT,
								$base + $i + 1 - strlen( $sequence ),
								strlen( $sequence ) );
							$head = '';
							continue;
						}
					}
				}

				if ( isset( $utfCheckOrCombining[$sequence] ) ) {
					# If it's NO or MAYBE, we'll have to rip
					# the string apart and put it back together.
					# That's going to be mighty slow.
					$looksNormal = false;
				}

				# The sequence is legal!
				$head = '';
			} elseif ( $c < "\x80" ) {
				# ASCII byte.
				$head = '';
			} elseif ( $c < "\xc0" ) {
				# Illegal tail bytes
				if ( $head == '' ) {
					# Out of the blue!
					$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
				} else {
					# Don't add if we're continuing a broken sequence;
					# we already put a replacement character when we looked
					# at the broken sequence. Just delete this byte.
					$replace[] = array( '', $base + $i, 1 );
				}
			} else {
				# Miscellaneous freaks.
				$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
				$head = '';
			}
		}
		$base += $chunk;
	}
	if ( count( $replace ) ) {
		# There were illegal UTF-8 sequences we need to fix up.
		# Rebuild the string, copying the untouched spans between the
		# recorded replacements.
		$out = '';
		$last = 0;
		foreach ( $replace as $rep ) {
			list( $replacement, $start, $length ) = $rep;
			if ( $last < $start ) {
				$out .= substr( $string, $last, $start - $last );
			}
			$out .= $replacement;
			$last = $start + $length;
		}
		if ( $last < strlen( $string ) ) {
			$out .= substr( $string, $last );
		}
		$string = $out;
	}

	return $looksNormal;
}
456
# NFC(), NFD(), NFKC() and NFKD() run the normalization directly,
# with no validity checking and no fast paths. Input must be
# VALID UTF-8!
/**
 * Canonical composition: decompose with NFD(), then recompose.
 *
 * @param $string string
 * @return string
 * @private
 */
static function NFC( $string ) {
	$decomposed = self::NFD( $string );

	return self::fastCompose( $decomposed );
}
468
/**
 * Canonical decomposition: expand characters through the canonical
 * decomposition map, then sort the combining characters.
 *
 * @param $string string
 * @return string
 * @private
 */
static function NFD( $string ) {
	# The map must be loaded before it is read below.
	self::loadData();
	$decomposed = self::fastDecompose( $string, self::$utfCanonicalDecomp );

	return self::fastCombiningSort( $decomposed );
}
480
/**
 * Compatibility composition: decompose with NFKD(), then recompose.
 *
 * @param $string string
 * @return string
 * @private
 */
static function NFKC( $string ) {
	$decomposed = self::NFKD( $string );

	return self::fastCompose( $decomposed );
}
489
/**
 * Compatibility decomposition: expand characters through the
 * compatibility decomposition map, then sort the combining characters.
 *
 * The compatibility data file is only pulled in the first time this is
 * called with the map still unset.
 *
 * @param $string string
 * @return string
 * @private
 */
static function NFKD( $string ) {
	if ( !isset( self::$utfCompatibilityDecomp ) ) {
		# Anchor the path on this file's directory, as loadData() does,
		# so the lookup does not depend on include_path or the current
		# working directory.
		require_once __DIR__ . '/UtfNormalDataK.inc';
	}

	return self::fastCombiningSort(
		self::fastDecompose( $string, self::$utfCompatibilityDecomp ) );
}
503
/**
 * Decompose a UTF-8 string into D or KD form; which one depends on the
 * decomposition map supplied by the caller.
 * Input is assumed to be *valid* UTF-8. Invalid code will break.
 * @private
 * @param string $string valid UTF-8 string
 * @param array $map hash of expanded decomposition map
 * @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
 */
static function fastDecompose( $string, $map ) {
	self::loadData();
	$decomposed = '';
	$total = strlen( $string );
	for ( $pos = 0; $pos < $total; $pos++ ) {
		$char = $string[$pos];
		$lead = ord( $char );
		if ( $lead < 0x80 ) {
			# ASCII chars never decompose; copy them through.
			$decomposed .= $char;
			continue;
		}

		# Gather the whole multi-byte character named by the lead byte.
		if ( $lead >= 0xf0 ) {
			$char = substr( $string, $pos, 4 );
			$pos += 3;
		} elseif ( $lead >= 0xe0 ) {
			$char = substr( $string, $pos, 3 );
			$pos += 2;
		} elseif ( $lead >= 0xc0 ) {
			$char = substr( $string, $pos, 2 );
			$pos++;
		}

		if ( isset( $map[$char] ) ) {
			# Table hit: emit the stored expansion.
			$decomposed .= $map[$char];
			continue;
		}

		if ( $char < UTF8_HANGUL_FIRST || $char > UTF8_HANGUL_LAST ) {
			# Not a hangul syllable either, so it stays as it is.
			$decomposed .= $char;
			continue;
		}

		# Decompose a hangul syllable into jamo; hardcoded for a
		# three-byte UTF-8 sequence. A lookup table would be slightly
		# faster, but adds a lot of memory & disk needs.
		$syllable = ( ( ord( $char[0] ) & 0x0f ) << 12
			| ( ord( $char[1] ) & 0x3f ) << 6
			| ( ord( $char[2] ) & 0x3f ) )
			- UNICODE_HANGUL_FIRST;
		$leadIndex = intval( $syllable / UNICODE_HANGUL_NCOUNT );
		$vowelIndex = intval( ( $syllable % UNICODE_HANGUL_NCOUNT ) / UNICODE_HANGUL_TCOUNT );
		$trailIndex = $syllable % UNICODE_HANGUL_TCOUNT;

		$decomposed .= "\xe1\x84" . chr( 0x80 + $leadIndex )
			. "\xe1\x85" . chr( 0xa1 + $vowelIndex );
		if ( $trailIndex >= 25 ) {
			# Trailing jamo from index 25 on use the \xe1\x87 prefix.
			$decomposed .= "\xe1\x87" . chr( 0x80 + $trailIndex - 25 );
		} elseif ( $trailIndex ) {
			$decomposed .= "\xe1\x86" . chr( 0xa7 + $trailIndex );
		}
	}

	return $decomposed;
}
566
/**
 * Put runs of combining characters into canonical order. This is the
 * last step in producing the decomposed normal forms D and KD.
 * @private
 * @param string $string a valid, decomposed UTF-8 string. Input is not validated.
 * @return string a UTF-8 string with combining characters sorted in canonical order
 */
static function fastCombiningSort( $string ) {
	self::loadData();
	$sorted = '';
	# Combining characters collected since the last starter, keyed by class.
	$pending = array();
	$prevClass = -1;
	$total = strlen( $string );
	for ( $pos = 0; $pos < $total; $pos++ ) {
		$char = $string[$pos];
		$lead = ord( $char );
		if ( $lead >= 0x80 ) {
			# Gather the whole multi-byte character.
			$width = $lead >= 0xf0 ? 4 : ( $lead >= 0xe0 ? 3 : ( $lead >= 0xc0 ? 2 : 1 ) );
			if ( $width > 1 ) {
				$char = substr( $string, $pos, $width );
				$pos += $width - 1;
			}

			if ( isset( self::$utfCombiningClass[$char] ) ) {
				# Combining character: hold it back, grouped by its class.
				$prevClass = self::$utfCombiningClass[$char];
				$sameClass = isset( $pending[$prevClass] ) ? $pending[$prevClass] : '';
				$pending[$prevClass] = $sameClass . $char;
				continue;
			}
		}

		# Not a combining character: flush whatever was held back,
		# ordered by class, before emitting it.
		if ( $prevClass ) {
			ksort( $pending );
			$sorted .= implode( '', $pending );
			$pending = array();
		}
		$sorted .= $char;
		$prevClass = 0;
	}

	# Flush combining characters left at the very end of the string.
	if ( $prevClass ) {
		ksort( $pending );
		$sorted .= implode( '', $pending );
	}

	return $sorted;
}
619
/**
 * Produces canonically composed sequences, i.e. normal form C or KC.
 *
 * Walks the string one character at a time, keeping the current start
 * character and the combining characters that could not be merged into
 * it, and flushes both to the output whenever a new start character
 * begins.
 *
 * @private
 * @param string $string a valid UTF-8 string in sorted normal form D or KD. Input is not validated.
 * @return string a UTF-8 string with canonical precomposed characters used where possible
 */
static function fastCompose( $string ) {
	self::loadData();
	$len = strlen( $string );
	$out = '';
	$lastClass = -1;
	$lastHangul = 0;
	$startChar = '';
	$combining = '';
	# First bytes bounding the jamo range; used as a cheap pre-filter
	# before the full string comparisons below.
	$x1 = ord( substr( UTF8_HANGUL_VBASE, 0, 1 ) );
	$x2 = ord( substr( UTF8_HANGUL_TEND, 0, 1 ) );
	for ( $i = 0; $i < $len; $i++ ) {
		$c = $string[$i];
		$n = ord( $c );
		if ( $n < 0x80 ) {
			# No combining characters here...
			$out .= $startChar;
			$out .= $combining;
			$startChar = $c;
			$combining = '';
			$lastClass = 0;
			continue;
		} elseif ( $n >= 0xf0 ) {
			$c = substr( $string, $i, 4 );
			$i += 3;
		} elseif ( $n >= 0xe0 ) {
			$c = substr( $string, $i, 3 );
			$i += 2;
		} elseif ( $n >= 0xc0 ) {
			$c = substr( $string, $i, 2 );
			$i++;
		}
		$pair = $startChar . $c;
		if ( $n > 0x80 ) {
			if ( isset( self::$utfCombiningClass[$c] ) ) {
				# A combining char; see what we can do with it
				$class = self::$utfCombiningClass[$c];
				# Compare against '' explicitly: empty() would also
				# treat the one-character string "0" as missing.
				if ( $startChar !== '' &&
					$lastClass < $class &&
					$class > 0 &&
					isset( self::$utfCanonicalComp[$pair] )
				) {
					$startChar = self::$utfCanonicalComp[$pair];
					$class = 0;
				} else {
					$combining .= $c;
				}
				$lastClass = $class;
				$lastHangul = 0;
				continue;
			}
		}
		# New start char
		if ( $lastClass == 0 ) {
			if ( isset( self::$utfCanonicalComp[$pair] ) ) {
				$startChar = self::$utfCanonicalComp[$pair];
				$lastHangul = 0;
				continue;
			}
			if ( $n >= $x1 && $n <= $x2 ) {
				# WARNING: Hangul code is painfully slow.
				# I apologize for this ugly, ugly code; however
				# performance is even worse if we call
				# out to nice clean functions. Lookup tables are
				# marginally faster, but require a lot of space.
				#
				if ( $c >= UTF8_HANGUL_VBASE &&
					$c <= UTF8_HANGUL_VEND &&
					$startChar >= UTF8_HANGUL_LBASE &&
					$startChar <= UTF8_HANGUL_LEND
				) {
					# Leading consonant + vowel. The indexes are read
					# straight from the third UTF-8 byte instead of
					# decoding each character to a code point and
					# subtracting the L / V base.
					$lIndex = ord( $startChar[2] ) - 0x80;
					$vIndex = ord( $c[2] ) - 0xa1;

					$hangulPoint = UNICODE_HANGUL_FIRST +
						UNICODE_HANGUL_TCOUNT *
						( UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex );

					# Hardcode the limited-range UTF-8 conversion:
					$startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
						chr( $hangulPoint >> 6 & 0x3f | 0x80 ) .
						chr( $hangulPoint & 0x3f | 0x80 );
					$lastHangul = 0;
					continue;
				} elseif ( $c >= UTF8_HANGUL_TBASE &&
					$c <= UTF8_HANGUL_TEND &&
					$startChar >= UTF8_HANGUL_FIRST &&
					$startChar <= UTF8_HANGUL_LAST &&
					!$lastHangul
				) {
					# Syllable + trailing consonant. The index is again
					# taken from the third UTF-8 byte; a negative value
					# means the jamo lies in the next 64-byte block.
					$tIndex = ord( $c[2] ) - 0xa7;
					if ( $tIndex < 0 ) {
						$tIndex = ord( $c[2] ) - 0x80 + ( 0x11c0 - 0x11a7 );
					}

					# Increment the code point by $tIndex, without
					# the function overhead of decoding and recoding UTF-8
					#
					$tail = ord( $startChar[2] ) + $tIndex;
					if ( $tail > 0xbf ) {
						# Carry into the middle byte, and from there
						# into the first byte if that overflows too.
						$tail -= 0x40;
						$mid = ord( $startChar[1] ) + 1;
						if ( $mid > 0xbf ) {
							$startChar[0] = chr( ord( $startChar[0] ) + 1 );
							$mid -= 0x40;
						}
						$startChar[1] = chr( $mid );
					}
					$startChar[2] = chr( $tail );

					# If there's another jamo char after this, *don't* try to merge it.
					$lastHangul = 1;
					continue;
				}
			}
		}
		# Nothing could be merged: flush the previous start character
		# with its leftovers and begin a new one.
		$out .= $startChar;
		$out .= $combining;
		$startChar = $c;
		$combining = '';
		$lastClass = 0;
		$lastHangul = 0;
	}
	$out .= $startChar . $combining;

	return $out;
}
755
/**
 * Benchmark baseline: measures how long it takes merely to iterate
 * through a string byte by byte without doing anything of substance.
 * @param $string string
 * @return string a byte-for-byte copy of the input
 */
static function placebo( $string ) {
	$copy = '';
	$total = strlen( $string );
	$pos = 0;
	while ( $pos < $total ) {
		$copy .= $string[$pos];
		$pos++;
	}

	return $copy;
}
771
/**
 * Swap out characters that we do not want in the output but that most
 * of the native normalize functions would keep: a set of control
 * characters, plus the sequences UTF8_FFFE and UTF8_FFFF. Each is
 * replaced with UTF8_REPLACEMENT.
 *
 * @param string $string The string
 * @return string String with the character codes replaced.
 */
private static function replaceForNativeNormalize( $string ) {
	$withoutControls = preg_replace(
		'/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
		UTF8_REPLACEMENT,
		$string );

	# The search array is processed in order: UTF8_FFFE first, then UTF8_FFFF.
	return str_replace(
		array( UTF8_FFFE, UTF8_FFFF ),
		UTF8_REPLACEMENT,
		$withoutControls );
}
789 }