From: Brion Vibber Date: Sat, 30 Oct 2004 12:06:31 +0000 (+0000) Subject: More inlining; fastCompose() is now twice as fast on hangul chars, which cuts down... X-Git-Tag: 1.5.0alpha1~1416 X-Git-Url: http://git.cyclocoop.org//%27%40script%40/%27?a=commitdiff_plain;h=286dd13042947920a5de3d98035c077972225117;p=lhc%2Fweb%2Fwiklou.git More inlining; fastCompose() is now twice as fast on hangul chars, which cuts down the NFC() time on Korean text a fair chunk. --- diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php index 9ffd108aa9..b1664fa6dc 100644 --- a/includes/normal/UtfNormal.php +++ b/includes/normal/UtfNormal.php @@ -447,7 +447,24 @@ class UtfNormal { $out .= $map[$c]; } else { if( $c >= UTF8_HANGUL_FIRST && $c <= UTF8_HANGUL_LAST ) { - $out .= UtfNormal::decomposeHangul( $c ); + # Decompose a hangul syllable into jamo; + # hardcoded for three-byte UTF-8 sequence. + # A lookup table would be slightly faster, + # but adds a lot of memory & disk needs. + # + $index = ( (ord( $c{0} ) & 0x0f) << 12 + | (ord( $c{1} ) & 0x3f) << 6 + | (ord( $c{2} ) & 0x3f) ) + - UNICODE_HANGUL_FIRST; + $l = IntVal( $index / UNICODE_HANGUL_NCOUNT ); + $v = IntVal( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT); + $t = $index % UNICODE_HANGUL_TCOUNT; + $out .= "\xe1\x84" . chr( 0x80 + $l ) . "\xe1\x85" . chr( 0xa1 + $v ); + if( $t >= 25 ) { + $out .= "\xe1\x87" . chr( 0x80 + $t - 25 ); + } elseif( $t ) { + $out .= "\xe1\x86" . chr( 0xa7 + $t ); + } } else { $out .= $c; } @@ -456,24 +473,6 @@ class UtfNormal { return $out; } - /** - * Decompose a Hangul syllable character into its constituent jamo. - * @access private - * @param int $c Unicode code point of the character - * @return string a UTF-8 string containing a sequence of jamo - */ - function decomposeHangul( $c ) { - $codepoint = utf8ToCodepoint( $c ); - $index = $codepoint - UNICODE_HANGUL_FIRST; - $l = IntVal( $index / UNICODE_HANGUL_NCOUNT ); - $v = IntVal( ($index % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT); - $t = $index % UNICODE_HANGUL_TCOUNT; - $out = codepointToUtf8( $l + UNICODE_HANGUL_LBASE ); - $out .= codepointToUtf8( $v + UNICODE_HANGUL_VBASE ); - if( $t ) $out .= codepointToUtf8( $t + UNICODE_HANGUL_TBASE ); - return $out; - } - /** * Sorts combining characters into canonical order. This is the * final step in creating decomposed normal forms D and KD. @@ -484,23 +483,15 @@ class UtfNormal { function fastCombiningSort( $string ) { UtfNormal::loadData(); global $utfCombiningClass; - $replacedCount = 1; - while( $replacedCount > 0 ) { - $replacedCount = 0; - $len = strlen( $string ); - $out = ''; - $lastClass = -1; - $lastChar = ''; - for( $i = 0; $i < $len; $i++ ) { - $c = $string{$i}; - $n = ord( $c ); - if( $n < 0x80 ) { - # No combining characters in ASCII. - $out .= $lastChar; - $lastChar = $c; - $lastClass = 0; - continue; - } elseif( $n >= 0xf0 ) { + $len = strlen( $string ); + $out = ''; + $combiners = array(); + $lastClass = -1; + for( $i = 0; $i < $len; $i++ ) { + $c = $string{$i}; + $n = ord( $c ); + if( $n >= 0x80 ) { + if( $n >= 0xf0 ) { $c = substr( $string, $i, 4 ); $i += 3; } elseif( $n >= 0xe0 ) { @@ -510,31 +501,25 @@ class UtfNormal { $c = substr( $string, $i, 2 ); $i++; } - $class = 0; - if( $lastClass == -1 ) { - # First one - $lastChar = $c; - $class = isset( $utfCombiningClass[$c] ) ? $utfCombiningClass[$c] : 0; - $lastClass = $class; - continue; - } if( isset( $utfCombiningClass[$c] ) ) { - $class = $utfCombiningClass[$c]; - if( $lastClass > $class ) { - # Swap -- put this one on the stack - $out .= $c; - $replacedCount++; - continue; - } + $lastClass = $utfCombiningClass[$c]; + @$combiners[$lastClass] .= $c; + continue; } - $out .= $lastChar; - $lastChar = $c; - $lastClass = $class; } - $out .= $lastChar; - $string = $out; + if( $lastClass ) { + ksort( $combiners ); + $out .= implode( '', $combiners ); + $combiners = array(); + } + $out .= $c; + $lastClass = 0; + } + if( $lastClass ) { + ksort( $combiners ); + $out .= implode( '', $combiners ); } - return $string; + return $out; } /** @@ -601,25 +586,52 @@ class UtfNormal { } if( $n >= $x1 && $n <= $x2 ) { # WARNING: Hangul code is painfully slow. + # I apologize for this ugly, ugly code; however + # performance is even more teh suck if we call + # out to nice clean functions. Lookup tables are + # marginally faster, but require a lot of space. + # if( $c >= UTF8_HANGUL_VBASE && $c <= UTF8_HANGUL_VEND && $startChar >= UTF8_HANGUL_LBASE && $startChar <= UTF8_HANGUL_LEND ) { # - $lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE; - $vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE; + #$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE; + #$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE; + $lIndex = ord( $startChar{2} ) - 0x80; + $vIndex = ord( $c{2} ) - 0xa1; + $hangulPoint = UNICODE_HANGUL_FIRST + UNICODE_HANGUL_TCOUNT * (UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex); - $startChar = codepointToUtf8( $hangulPoint ); + + # Hardcode the limited-range UTF-8 conversion: + $startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) . + chr( $hangulPoint >> 6 & 0x3f | 0x80 ) . + chr( $hangulPoint & 0x3f | 0x80 ); continue; } elseif( $c >= UTF8_HANGUL_TBASE && $c <= UTF8_HANGUL_TEND && $startChar >= UTF8_HANGUL_FIRST && $startChar <= UTF8_HANGUL_LAST ) { - $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE; - $hangulPoint = utf8ToCodepoint( $startChar ) + $tIndex; - $startChar = codepointToUtf8( $hangulPoint ); + # $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE; + $tIndex = ord( $c{2} ) - 0xa7; + if( $tIndex < 0 ) $tIndex = ord( $c{2} ) - 0x80 + (0x11c0 - 0x11a7); + + # Increment the code point by $tIndex, without + # the function overhead of decoding and recoding UTF-8 + # + $tail = ord( $startChar{2} ) + $tIndex; + if( $tail > 0xbf ) { + $tail -= 0x40; + $mid = ord( $startChar{1} ) + 1; + if( $mid > 0xbf ) { + $startChar{0} = chr( ord( $startChar{0} ) + 1 ); + $mid -= 0x40; + } + $startChar{1} = chr( $mid ); + } + $startChar{2} = chr( $tail ); continue; } }