Marked all functions as static
[lhc/web/wiklou.git] / includes / normal / UtfNormal.php
1 <?php
2 # Copyright (C) 2004 Brion Vibber <brion@pobox.com>
3 # http://www.mediawiki.org/
4 #
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License along
16 # with this program; if not, write to the Free Software Foundation, Inc.,
17 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 # http://www.gnu.org/copyleft/gpl.html
19
20 /**
21 * Unicode normalization routines for working with UTF-8 strings.
22 * Currently assumes that input strings are valid UTF-8!
23 *
24 * Not as fast as I'd like, but should be usable for most purposes.
25 * UtfNormal::toNFC() will bail early if given ASCII text or text
26 * it can quickly determine is already normalized.
27 *
28 * All functions can be called statically.
29 *
30 * See description of forms at http://www.unicode.org/reports/tr15/
31 *
32 * @package UtfNormal
33 */
34
35 /** */
36 require_once dirname(__FILE__).'/UtfNormalUtil.php';
37
# Normalization lookup tables. They start out null so that the loaders in
# UtfNormal can tell, via isset(), that the data files still have to be
# included. The compatibility table is only pulled in on demand.
global $utfCombiningClass, $utfCanonicalComp, $utfCanonicalDecomp, $utfCompatibilityDecomp;
$utfCombiningClass = $utfCanonicalComp = $utfCanonicalDecomp = $utfCompatibilityDecomp = null;

# ---------------------------------------------------------------------
# Code points
# ---------------------------------------------------------------------

# Precomposed Hangul syllable block
define( 'UNICODE_HANGUL_FIRST', 0xac00 );
define( 'UNICODE_HANGUL_LAST', 0xd7a3 );

# Number of leading (L), vowel (V) and trailing (T) jamo slots, and the
# number of syllables per leading consonant (V * T).
define( 'UNICODE_HANGUL_LCOUNT', 19 );
define( 'UNICODE_HANGUL_VCOUNT', 21 );
define( 'UNICODE_HANGUL_TCOUNT', 28 );
define( 'UNICODE_HANGUL_NCOUNT', UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT );

# Start of each jamo range...
define( 'UNICODE_HANGUL_LBASE', 0x1100 );
define( 'UNICODE_HANGUL_VBASE', 0x1161 );
define( 'UNICODE_HANGUL_TBASE', 0x11a7 );

# ...and its last member (base + count - 1).
define( 'UNICODE_HANGUL_LEND', UNICODE_HANGUL_LBASE + UNICODE_HANGUL_LCOUNT - 1 );
define( 'UNICODE_HANGUL_VEND', UNICODE_HANGUL_VBASE + UNICODE_HANGUL_VCOUNT - 1 );
define( 'UNICODE_HANGUL_TEND', UNICODE_HANGUL_TBASE + UNICODE_HANGUL_TCOUNT - 1 );

define( 'UNICODE_SURROGATE_FIRST', 0xd800 );
define( 'UNICODE_SURROGATE_LAST', 0xdfff );
define( 'UNICODE_MAX', 0x10ffff );
define( 'UNICODE_REPLACEMENT', 0xfffd );

# ---------------------------------------------------------------------
# The same code points, pre-encoded as UTF-8 byte strings so nothing has
# to call codepointToUtf8() at load time.
# ---------------------------------------------------------------------

define( 'UTF8_HANGUL_FIRST', "\xea\xb0\x80" ); # UNICODE_HANGUL_FIRST
define( 'UTF8_HANGUL_LAST', "\xed\x9e\xa3" );  # UNICODE_HANGUL_LAST

define( 'UTF8_HANGUL_LBASE', "\xe1\x84\x80" ); # UNICODE_HANGUL_LBASE
define( 'UTF8_HANGUL_LEND', "\xe1\x84\x92" );  # UNICODE_HANGUL_LEND
define( 'UTF8_HANGUL_VBASE', "\xe1\x85\xa1" ); # UNICODE_HANGUL_VBASE
define( 'UTF8_HANGUL_VEND', "\xe1\x85\xb5" );  # UNICODE_HANGUL_VEND
define( 'UTF8_HANGUL_TBASE', "\xe1\x86\xa7" ); # UNICODE_HANGUL_TBASE
define( 'UTF8_HANGUL_TEND', "\xe1\x87\x82" );  # UNICODE_HANGUL_TEND

define( 'UTF8_SURROGATE_FIRST', "\xed\xa0\x80" ); # UNICODE_SURROGATE_FIRST
define( 'UTF8_SURROGATE_LAST', "\xed\xbf\xbf" );  # UNICODE_SURROGATE_LAST
define( 'UTF8_MAX', "\xf4\x8f\xbf\xbf" );         # UNICODE_MAX
define( 'UTF8_REPLACEMENT', "\xef\xbf\xbd" );     # UNICODE_REPLACEMENT

# Upper bounds used when testing for overlong 2-, 3- and 4-byte encodings.
define( 'UTF8_OVERLONG_A', "\xc1\xbf" );
define( 'UTF8_OVERLONG_B', "\xe0\x9f\xbf" );
define( 'UTF8_OVERLONG_C', "\xf0\x8f\xbf\xbf" );

# These two ranges are illegal: U+FDD0..U+FDEF and U+FFFE..U+FFFF
define( 'UTF8_FDD0', "\xef\xb7\x90" );
define( 'UTF8_FDEF', "\xef\xb7\xaf" );
define( 'UTF8_FFFE', "\xef\xbf\xbe" );
define( 'UTF8_FFFF', "\xef\xbf\xbf" );

define( 'UTF8_HEAD', false );
define( 'UTF8_TAIL', true );

/**
 * Mode constants for the ICU wrapper (second argument to utf8_normalize()).
 */
define( 'UNORM_NONE', 1 );
define( 'UNORM_NFD', 2 );
define( 'UNORM_NFKD', 3 );
define( 'UNORM_NFC', 4 );
define( 'UNORM_NFKC', 5 );
define( 'UNORM_FCD', 6 );
define( 'UNORM_DEFAULT', UNORM_NFC );

# True when the utf8_normalize() extension function is available; the
# public UtfNormal entry points then delegate to it.
define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
112
113 /**
114 *
115 * @package MediaWiki
116 */
117 class UtfNormal {
/**
 * One-stop entry point: repair invalid UTF-8 sequences and return the
 * text in normal form C (canonical composition).
 *
 * Pure-ASCII input returns quickly, and text made up only of known-good
 * characters avoids the full decompose/recompose pass. Because of the
 * validation work this is slower than toNFC().
 *
 * @param string $string a UTF-8 string
 * @return string a clean, shiny, normalized UTF-8 string
 * @static
 */
static function cleanUp( $string ) {
	if( !NORMALIZE_ICU ) {
		# Pure-PHP path. quickIsNFCVerify() takes $string by reference and
		# fixes UTF-8 errors in it as a side effect, so whichever branch
		# we take below already operates on the cleaned text.
		$alreadyNormal = UtfNormal::quickIsNFCVerify( $string );
		return $alreadyNormal ? $string : UtfNormal::NFC( $string );
	}

	# ICU path. We exclude a few chars that ICU would not.
	$string = preg_replace(
		'/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
		UTF8_REPLACEMENT,
		$string );
	$string = str_replace(
		array( UTF8_FFFE, UTF8_FFFF ),
		UTF8_REPLACEMENT,
		$string );

	# UnicodeString constructor fails if the string ends with a
	# head byte. Add a junk char at the end, we'll strip it off.
	$normalized = utf8_normalize( $string . "\x01", UNORM_NFC );
	return rtrim( $normalized, "\x01" );
}
149
/**
 * Return the given UTF-8 string in normal form C (canonical composition).
 *
 * Delegates to the ICU wrapper when it is available. Otherwise the text
 * is returned untouched when quickIsNFC() can vouch for it, and only
 * run through the full normalizer when it cannot.
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form C
 * @static
 */
static function toNFC( $string ) {
	if( NORMALIZE_ICU ) {
		return utf8_normalize( $string, UNORM_NFC );
	}
	return UtfNormal::quickIsNFC( $string )
		? $string
		: UtfNormal::NFC( $string );
}
167
/**
 * Return the given UTF-8 string in normal form D (canonical decomposition).
 *
 * Delegates to the ICU wrapper when it is available; otherwise strings
 * without any byte >= 0x80 are returned as-is.
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form D
 * @static
 */
static function toNFD( $string ) {
	if( NORMALIZE_ICU ) {
		return utf8_normalize( $string, UNORM_NFD );
	}
	$hasHighBytes = preg_match( '/[\x80-\xff]/', $string );
	if( !$hasHighBytes ) {
		# Nothing but ASCII, which never decomposes.
		return $string;
	}
	return UtfNormal::NFD( $string );
}
184
/**
 * Return the given UTF-8 string in normal form KC (compatibility
 * composition). Compatibility mappings can discard distinctions
 * irreversibly, so use judiciously.
 *
 * Delegates to the ICU wrapper when it is available; otherwise strings
 * without any byte >= 0x80 are returned as-is.
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form KC
 * @static
 */
static function toNFKC( $string ) {
	if( NORMALIZE_ICU ) {
		return utf8_normalize( $string, UNORM_NFKC );
	}
	$hasHighBytes = preg_match( '/[\x80-\xff]/', $string );
	if( !$hasHighBytes ) {
		# Nothing but ASCII; already normalized.
		return $string;
	}
	return UtfNormal::NFKC( $string );
}
202
/**
 * Return the given UTF-8 string in normal form KD (compatibility
 * decomposition). Compatibility mappings can discard distinctions
 * irreversibly, so use judiciously.
 *
 * Delegates to the ICU wrapper when it is available; otherwise strings
 * without any byte >= 0x80 are returned as-is.
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form KD
 * @static
 */
static function toNFKD( $string ) {
	if( NORMALIZE_ICU ) {
		return utf8_normalize( $string, UNORM_NFKD );
	}
	$hasHighBytes = preg_match( '/[\x80-\xff]/', $string );
	if( !$hasHighBytes ) {
		# Nothing but ASCII; already normalized.
		return $string;
	}
	return UtfNormal::NFKD( $string );
}
220
/**
 * Load the basic composition data if necessary.
 *
 * The data file is included at most once; $utfCombiningClass being set
 * is used as the "already loaded" marker.
 *
 * @private
 * @static
 */
static function loadData() {
	# fixme : are $utfCanonicalComp, $utfCanonicalDecomp really used?
	#
	# The include below runs in this function's scope, so every table it
	# is expected to populate is imported here. $utfCheckNFC is read as a
	# global by quickIsNFC() and quickIsNFCVerify().
	# NOTE(review): presumably UtfNormalData.inc also declares these
	# globals itself; listing them here is harmless either way.
	global $utfCombiningClass, $utfCanonicalComp, $utfCanonicalDecomp, $utfCheckNFC;
	if( !isset( $utfCombiningClass ) ) {
		# Anchor the path to this file's directory, like the
		# UtfNormalUtil.php include at the top of the file, instead of
		# depending on include_path or the current working directory.
		require_once( dirname( __FILE__ ) . '/UtfNormalData.inc' );
	}
}
233
/**
 * Returns true if the string is _definitely_ in NFC.
 * Returns false if not or uncertain.
 *
 * The string is walked one UTF-8 sequence at a time; as soon as a
 * sequence is found in the $utfCheckNFC table or in the combining class
 * table the function gives up and reports "uncertain".
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return bool
 * @static
 */
static function quickIsNFC( $string ) {
	# ASCII is always valid NFC!
	# If it's pure ASCII, let it through.
	if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;

	UtfNormal::loadData();
	global $utfCheckNFC, $utfCombiningClass;
	$len = strlen( $string );
	for( $i = 0; $i < $len; $i++ ) {
		# Square-bracket offsets: the curly-brace form ($string{$i}) is
		# deprecated in PHP 7.4 and a parse error in PHP 8.
		$c = $string[$i];
		$n = ord( $c );
		if( $n < 0x80 ) {
			continue;
		} elseif( $n >= 0xf0 ) {
			# Four-byte sequence
			$c = substr( $string, $i, 4 );
			$i += 3;
		} elseif( $n >= 0xe0 ) {
			# Three-byte sequence
			$c = substr( $string, $i, 3 );
			$i += 2;
		} elseif( $n >= 0xc0 ) {
			# Two-byte sequence
			$c = substr( $string, $i, 2 );
			$i++;
		}
		if( isset( $utfCheckNFC[$c] ) ) {
			# If it's NO or MAYBE, bail and do the slow check.
			return false;
		}
		if( isset( $utfCombiningClass[$c] ) ) {
			# Combining character? We might have to do sorting, at least.
			return false;
		}
	}
	return true;
}
275
/**
 * Returns true if the string is _definitely_ in NFC.
 * Returns false if not or uncertain.
 *
 * As a side effect the string is rewritten in place: control characters
 * that are not allowed in XML, malformed or truncated UTF-8 sequences,
 * overlong encodings, surrogates, U+FFFE/U+FFFF and anything beyond
 * U+10FFFF are replaced with UTF8_REPLACEMENT.
 *
 * @param string $string a UTF-8 string, altered on output to be valid UTF-8 safe for XML.
 * @return bool
 * @static
 */
static function quickIsNFCVerify( &$string ) {
	# Screen out some characters that eg won't be allowed in XML
	$string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );

	# ASCII is always valid NFC!
	# If we're only ever given plain ASCII, we can avoid the overhead
	# of initializing the decomposition tables by skipping out early.
	if( !preg_match( '/[\x80-\xff]/', $string ) ) return true;

	# Lookup tables are built once and kept in function statics.
	static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
	if( !isset( $checkit ) ) {
		# Load/build some scary lookup tables...
		UtfNormal::loadData();
		global $utfCheckNFC, $utfCombiningClass;

		# One table answers "is this sequence either not-quite-NFC or a
		# combining character?" with a single isset().
		$utfCheckOrCombining = array_merge( $utfCheckNFC, $utfCombiningClass );

		# Head bytes for sequences which we should do further validity checks
		$checkit = array_flip( array_map( 'chr',
			array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
				0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
				0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );

		# Each UTF-8 head byte is followed by a certain
		# number of tail bytes.
		$tailBytes = array();
		for( $n = 0; $n < 256; $n++ ) {
			if( $n < 0xc0 ) {
				$remaining = 0;
			} elseif( $n < 0xe0 ) {
				$remaining = 1;
			} elseif( $n < 0xf0 ) {
				$remaining = 2;
			} elseif( $n < 0xf8 ) {
				$remaining = 3;
			} elseif( $n < 0xfc ) {
				$remaining = 4;
			} elseif( $n < 0xfe ) {
				$remaining = 5;
			} else {
				$remaining = 0;
			}
			$tailBytes[chr($n)] = $remaining;
		}
	}

	# Chop the text into pure-ASCII and non-ASCII areas;
	# large ASCII parts can be handled much more quickly.
	# Don't chop up Unicode areas for punctuation, though,
	# that wastes energy.
	preg_match_all(
		'/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
		$string, $matches );

	$looksNormal = true;
	$base = 0;          # byte offset of the current chunk within $string
	$replace = array(); # list of array( replacement, start, length )
	foreach( $matches[1] as $str ) {
		$chunk = strlen( $str );

		# Square-bracket offsets throughout: the curly-brace form
		# ($str{0}) is deprecated in PHP 7.4 and a parse error in PHP 8.
		if( $str[0] < "\x80" ) {
			# ASCII chunk: guaranteed to be valid UTF-8
			# and in normal form C, so skip over it.
			$base += $chunk;
			continue;
		}

		# We'll have to examine the chunk byte by byte to ensure
		# that it consists of valid UTF-8 sequences, and to see
		# if any of them might not be normalized.
		#
		# Since PHP is not the fastest language on earth, some of
		# this code is a little ugly with inner loop optimizations.

		$head = '';
		$len = $chunk + 1; # Counting down is faster. I'm *so* sorry.

		for( $i = -1; --$len; ) {
			if( $remaining = $tailBytes[$c = $str[++$i]] ) {
				# UTF-8 head byte!
				$sequence = $head = $c;
				do {
					# Look for the defined number of tail bytes...
					if( --$len && ( $c = $str[++$i] ) >= "\x80" && $c < "\xc0" ) {
						# Legal tail bytes are nice.
						$sequence .= $c;
					} else {
						if( 0 == $len ) {
							# Premature end of string!
							# Drop a replacement character into output to
							# represent the invalid UTF-8 sequence.
							$replace[] = array( UTF8_REPLACEMENT,
								$base + $i + 1 - strlen( $sequence ),
								strlen( $sequence ) );
							# Leaves both the do-while and the for loop;
							# this chunk is exhausted.
							break 2;
						} else {
							# Illegal tail byte; abandon the sequence.
							$replace[] = array( UTF8_REPLACEMENT,
								$base + $i - strlen( $sequence ),
								strlen( $sequence ) );
							# Back up and reprocess this byte; it may itself
							# be a legal ASCII or UTF-8 sequence head.
							--$i;
							++$len;
							continue 2;
						}
					}
				} while( --$remaining );

				if( isset( $checkit[$head] ) ) {
					# Do some more detailed validity checks, for
					# invalid characters and illegal sequences.
					if( $head === "\xed" ) {
						# 0xed is relatively frequent in Korean, which
						# abuts the surrogate area, so we're doing
						# this check separately to speed things up.

						if( $sequence >= UTF8_SURROGATE_FIRST ) {
							# Surrogates are legal only in UTF-16 code.
							# They are totally forbidden here in UTF-8
							# utopia.
							$replace[] = array( UTF8_REPLACEMENT,
								$base + $i + 1 - strlen( $sequence ),
								strlen( $sequence ) );
							$head = '';
							continue;
						}
					} else {
						# Slower, but rarer checks...
						$n = ord( $head );
						if(
							# "Overlong sequences" are those that are syntactically
							# correct but use more UTF-8 bytes than are necessary to
							# encode a character. Naive string comparisons can be
							# tricked into failing to see a match for an ASCII
							# character, for instance, which can be a security hole
							# if blacklist checks are being used.
							($n < 0xc2 && $sequence <= UTF8_OVERLONG_A)
							|| ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B)
							|| ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C)

							# U+FFFE and U+FFFF are explicitly forbidden in Unicode.
							# Both comparisons are grouped under the head-byte
							# test; previously the parentheses left the U+FFFF
							# comparison outside it (same outcome, since that
							# sequence can only start with 0xef).
							|| ($n == 0xef &&
								($sequence === UTF8_FFFE || $sequence === UTF8_FFFF) )

							# Unicode has been limited to 21 bits; longer
							# sequences are not allowed.
							|| ($n >= 0xf0 && $sequence > UTF8_MAX) ) {

							$replace[] = array( UTF8_REPLACEMENT,
								$base + $i + 1 - strlen( $sequence ),
								strlen( $sequence ) );
							$head = '';
							continue;
						}
					}
				}

				if( isset( $utfCheckOrCombining[$sequence] ) ) {
					# If it's NO or MAYBE, we'll have to rip
					# the string apart and put it back together.
					# That's going to be mighty slow.
					$looksNormal = false;
				}

				# The sequence is legal!
				$head = '';
			} elseif( $c < "\x80" ) {
				# ASCII byte.
				$head = '';
			} elseif( $c < "\xc0" ) {
				# Illegal tail bytes
				if( $head === '' ) {
					# Out of the blue!
					$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
				} else {
					# Don't add if we're continuing a broken sequence;
					# we already put a replacement character when we looked
					# at the broken sequence.
					$replace[] = array( '', $base + $i, 1 );
				}
			} else {
				# Miscellaneous freaks.
				$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
				$head = '';
			}
		}
		$base += $chunk;
	}
	if( count( $replace ) ) {
		# There were illegal UTF-8 sequences we need to fix up.
		# $replace is in ascending offset order, so the output can be
		# stitched together in a single left-to-right pass.
		$out = '';
		$last = 0;
		foreach( $replace as $rep ) {
			list( $replacement, $start, $length ) = $rep;
			if( $last < $start ) {
				$out .= substr( $string, $last, $start - $last );
			}
			$out .= $replacement;
			$last = $start + $length;
		}
		if( $last < strlen( $string ) ) {
			$out .= substr( $string, $last );
		}
		$string = $out;
	}
	return $looksNormal;
}
491
492 # These take a string and run the normalization on them, without
493 # checking for validity or any optimization etc. Input must be
494 # VALID UTF-8!
/**
 * Canonical composition: decompose to NFD, then recompose.
 *
 * @param string $string
 * @return string
 * @private
 * @static
 */
static function NFC( $string ) {
	$decomposed = UtfNormal::NFD( $string );
	return UtfNormal::fastCompose( $decomposed );
}
504
/**
 * Canonical decomposition: expand via the canonical decomposition map,
 * then sort combining characters into canonical order.
 *
 * @param string $string
 * @return string
 * @private
 * @static
 */
static function NFD( $string ) {
	UtfNormal::loadData();
	global $utfCanonicalDecomp;
	$expanded = UtfNormal::fastDecompose( $string, $utfCanonicalDecomp );
	return UtfNormal::fastCombiningSort( $expanded );
}
517
/**
 * Compatibility composition: decompose to NFKD, then recompose.
 *
 * @param string $string
 * @return string
 * @private
 * @static
 */
static function NFKC( $string ) {
	$decomposed = UtfNormal::NFKD( $string );
	return UtfNormal::fastCompose( $decomposed );
}
527
/**
 * Compatibility decomposition: expand via the compatibility
 * decomposition map (loaded on first use), then sort combining
 * characters into canonical order.
 *
 * @param string $string
 * @return string
 * @private
 * @static
 */
static function NFKD( $string ) {
	global $utfCompatibilityDecomp;
	if( !isset( $utfCompatibilityDecomp ) ) {
		# Anchor the path to this file's directory, like the
		# UtfNormalUtil.php include at the top of the file, instead of
		# depending on include_path or the current working directory.
		require_once( dirname( __FILE__ ) . '/UtfNormalDataK.inc' );
	}
	return UtfNormal::fastCombiningSort(
		UtfNormal::fastDecompose( $string, $utfCompatibilityDecomp ) );
}
542
543
/**
 * Perform decomposition of a UTF-8 string into either D or KD form
 * (depending on which decomposition map is passed to us).
 * Input is assumed to be *valid* UTF-8. Invalid code will break.
 *
 * Sequences found in $map are replaced by their mapped expansion;
 * precomposed Hangul syllables are decomposed arithmetically into
 * jamo; everything else is copied through unchanged.
 *
 * @private
 * @param string $string Valid UTF-8 string
 * @param array $map hash of expanded decomposition map
 * @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
 * @static
 */
static function fastDecompose( $string, &$map ) {
	UtfNormal::loadData();
	$len = strlen( $string );
	$out = '';
	for( $i = 0; $i < $len; $i++ ) {
		# Square-bracket offsets: the curly-brace form ($string{$i}) is
		# deprecated in PHP 7.4 and a parse error in PHP 8.
		$c = $string[$i];
		$n = ord( $c );
		if( $n < 0x80 ) {
			# ASCII chars never decompose
			# THEY ARE IMMORTAL
			$out .= $c;
			continue;
		} elseif( $n >= 0xf0 ) {
			$c = substr( $string, $i, 4 );
			$i += 3;
		} elseif( $n >= 0xe0 ) {
			$c = substr( $string, $i, 3 );
			$i += 2;
		} elseif( $n >= 0xc0 ) {
			$c = substr( $string, $i, 2 );
			$i++;
		}

		if( isset( $map[$c] ) ) {
			$out .= $map[$c];
			continue;
		}

		# isset( $c[2] ) guards against a sequence truncated at the end
		# of the string, which would otherwise pass the range comparison
		# and then read undefined offsets below.
		if( isset( $c[2] ) && $c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST ) {
			# Decompose a hangul syllable into jamo;
			# hardcoded for three-byte UTF-8 sequence.
			# A lookup table would be slightly faster,
			# but adds a lot of memory & disk needs.
			#
			$index = ( (ord( $c[0] ) & 0x0f) << 12
					 | (ord( $c[1] ) & 0x3f) <<  6
					 | (ord( $c[2] ) & 0x3f) )
				   - UNICODE_HANGUL_FIRST;
			$l = intval( $index / UNICODE_HANGUL_NCOUNT );
			$v = intval( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT);
			$t = $index % UNICODE_HANGUL_TCOUNT;
			# Leading consonant U+1100+l and vowel U+1161+v
			$out .= "\xe1\x84" . chr( 0x80 + $l ) . "\xe1\x85" . chr( 0xa1 + $v );
			# Trailing consonant U+11A7+t, if any. From t = 25 (U+11C0)
			# onward the middle byte of the encoding rolls over to 0x87.
			if( $t >= 25 ) {
				$out .= "\xe1\x87" . chr( 0x80 + $t - 25 );
			} elseif( $t ) {
				$out .= "\xe1\x86" . chr( 0xa7 + $t );
			}
			continue;
		}

		$out .= $c;
	}
	return $out;
}
606
/**
 * Sorts combining characters into canonical order. This is the
 * final step in creating decomposed normal forms D and KD.
 *
 * Runs of combining characters are bucketed by combining class; when
 * the next non-combining character (or the end of the string) is
 * reached, the buckets are emitted in ascending class order. Characters
 * sharing a class keep their original relative order.
 *
 * @private
 * @param string $string a valid, decomposed UTF-8 string. Input is not validated.
 * @return string a UTF-8 string with combining characters sorted in canonical order
 * @static
 */
static function fastCombiningSort( $string ) {
	UtfNormal::loadData();
	global $utfCombiningClass;
	$len = strlen( $string );
	$out = '';
	$combiners = array(); # combining class => concatenated characters
	$lastClass = -1;
	for( $i = 0; $i < $len; $i++ ) {
		# Square-bracket offsets: the curly-brace form ($string{$i}) is
		# deprecated in PHP 7.4 and a parse error in PHP 8.
		$c = $string[$i];
		$n = ord( $c );
		if( $n >= 0x80 ) {
			if( $n >= 0xf0 ) {
				$c = substr( $string, $i, 4 );
				$i += 3;
			} elseif( $n >= 0xe0 ) {
				$c = substr( $string, $i, 3 );
				$i += 2;
			} elseif( $n >= 0xc0 ) {
				$c = substr( $string, $i, 2 );
				$i++;
			}
			if( isset( $utfCombiningClass[$c] ) ) {
				$lastClass = $utfCombiningClass[$c];
				# Create the bucket explicitly on first use rather than
				# appending to an undefined key behind an @ suppressor.
				if( isset( $combiners[$lastClass] ) ) {
					$combiners[$lastClass] .= $c;
				} else {
					$combiners[$lastClass] = $c;
				}
				continue;
			}
		}
		# Non-combining character: flush whatever has been collected.
		if( $lastClass ) {
			ksort( $combiners );
			$out .= implode( '', $combiners );
			$combiners = array();
		}
		$out .= $c;
		$lastClass = 0;
	}
	if( $lastClass ) {
		# The string ended inside a run of combining characters.
		ksort( $combiners );
		$out .= implode( '', $combiners );
	}
	return $out;
}
656
/**
 * Produces canonically composed sequences, i.e. normal form C or KC.
 *
 * The current starter is held back in $startChar, together with any
 * combining characters that could not be merged into it ($combining),
 * until the next starter shows up; pairs found in $utfCanonicalComp are
 * merged, and Hangul jamo are composed arithmetically.
 *
 * @private
 * @param string $string a valid UTF-8 string in sorted normal form D or KD. Input is not validated.
 * @return string a UTF-8 string with canonical precomposed characters used where possible
 * @static
 */
static function fastCompose( $string ) {
	UtfNormal::loadData();
	global $utfCanonicalComp, $utfCombiningClass;
	$len = strlen( $string );
	$out = '';
	$lastClass = -1;
	$lastHangul = 0;
	$startChar = '';
	$combining = '';
	# First bytes of the encoded jamo range, for a cheap pre-check
	# before the full string comparisons.
	$x1 = ord(substr(UTF8_HANGUL_VBASE,0,1));
	$x2 = ord(substr(UTF8_HANGUL_TEND,0,1));
	for( $i = 0; $i < $len; $i++ ) {
		# Square-bracket offsets throughout: the curly-brace form
		# ($string{$i}) is deprecated in PHP 7.4 and a parse error in PHP 8.
		$c = $string[$i];
		$n = ord( $c );
		if( $n < 0x80 ) {
			# No combining characters here...
			$out .= $startChar;
			$out .= $combining;
			$startChar = $c;
			$combining = '';
			$lastClass = 0;
			continue;
		} elseif( $n >= 0xf0 ) {
			$c = substr( $string, $i, 4 );
			$i += 3;
		} elseif( $n >= 0xe0 ) {
			$c = substr( $string, $i, 3 );
			$i += 2;
		} elseif( $n >= 0xc0 ) {
			$c = substr( $string, $i, 2 );
			$i++;
		}
		$pair = $startChar . $c;
		if( $n > 0x80 ) {
			if( isset( $utfCombiningClass[$c] ) ) {
				# A combining char; see what we can do with it
				$class = $utfCombiningClass[$c];
				# Compare against '' rather than using empty(), which
				# would also treat the starter "0" as missing.
				if( $startChar !== '' &&
					$lastClass < $class &&
					$class > 0 &&
					isset( $utfCanonicalComp[$pair] ) ) {
					$startChar = $utfCanonicalComp[$pair];
					$class = 0;
				} else {
					$combining .= $c;
				}
				$lastClass = $class;
				$lastHangul = 0;
				continue;
			}
		}
		# New start char
		if( $lastClass == 0 ) {
			if( isset( $utfCanonicalComp[$pair] ) ) {
				$startChar = $utfCanonicalComp[$pair];
				$lastHangul = 0;
				continue;
			}
			if( $n >= $x1 && $n <= $x2 ) {
				# WARNING: Hangul code is painfully slow.
				# I apologize for this ugly, ugly code; however
				# performance is even more teh suck if we call
				# out to nice clean functions. Lookup tables are
				# marginally faster, but require a lot of space.
				#
				if( $c >= UTF8_HANGUL_VBASE &&
					$c <= UTF8_HANGUL_VEND &&
					$startChar >= UTF8_HANGUL_LBASE &&
					$startChar <= UTF8_HANGUL_LEND ) {
					# Leading consonant + vowel -> LV syllable.
					#
					#$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
					#$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
					$lIndex = ord( $startChar[2] ) - 0x80;
					$vIndex = ord( $c[2] ) - 0xa1;

					$hangulPoint = UNICODE_HANGUL_FIRST +
						UNICODE_HANGUL_TCOUNT *
						(UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex);

					# Hardcode the limited-range UTF-8 conversion:
					$startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
								 chr( $hangulPoint >>  6 & 0x3f | 0x80 ) .
								 chr( $hangulPoint       & 0x3f | 0x80 );
					$lastHangul = 0;
					continue;
				} elseif( $c > UTF8_HANGUL_TBASE &&
						  $c <= UTF8_HANGUL_TEND &&
						  $startChar >= UTF8_HANGUL_FIRST &&
						  $startChar <= UTF8_HANGUL_LAST &&
						  !$lastHangul ) {
					# Syllable + trailing consonant.
					#
					# The lower bound is exclusive: TBASE itself (U+11A7)
					# has T index 0 and is not a trailing consonant. With
					# an inclusive bound it was consumed here and silently
					# dropped from the output.
					#
					# $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
					$tIndex = ord( $c[2] ) - 0xa7;
					# U+11C0 and up have a different middle byte, so the
					# subtraction above goes negative for them.
					if( $tIndex < 0 ) $tIndex = ord( $c[2] ) - 0x80 + (0x11c0 - 0x11a7);

					# Increment the code point by $tIndex, without
					# the function overhead of decoding and recoding UTF-8
					#
					$tail = ord( $startChar[2] ) + $tIndex;
					if( $tail > 0xbf ) {
						# Carry into the middle byte, and from there
						# into the first byte if that overflows as well.
						$tail -= 0x40;
						$mid = ord( $startChar[1] ) + 1;
						if( $mid > 0xbf ) {
							$startChar[0] = chr( ord( $startChar[0] ) + 1 );
							$mid -= 0x40;
						}
						$startChar[1] = chr( $mid );
					}
					$startChar[2] = chr( $tail );

					# If there's another jamo char after this, *don't* try to merge it.
					$lastHangul = 1;
					continue;
				}
			}
		}
		# Nothing to merge: flush the pending starter and its combining
		# characters, and make $c the new starter.
		$out .= $startChar;
		$out .= $combining;
		$startChar = $c;
		$combining = '';
		$lastClass = 0;
		$lastHangul = 0;
	}
	$out .= $startChar . $combining;
	return $out;
}
790
/**
 * This is just used for the benchmark, comparing how long it takes to
 * iterate through a string without really doing anything of substance.
 * The input is copied byte by byte and returned unchanged.
 *
 * @param string $string
 * @return string
 * @static
 */
static function placebo( $string ) {
	$len = strlen( $string );
	$out = '';
	for( $i = 0; $i < $len; $i++ ) {
		# Square-bracket offset: the curly-brace form ($string{$i}) is
		# deprecated in PHP 7.4 and a parse error in PHP 8.
		$out .= $string[$i];
	}
	return $out;
}
806 }
807
808 ?>