From: Brion Vibber Date: Wed, 26 Aug 2009 05:51:21 +0000 (+0000) Subject: Replace our mb_substr() fallback implementation with one which is not quite so horrib... X-Git-Tag: 1.31.0-rc.0~40056 X-Git-Url: https://git.cyclocoop.org/%27.WWW_URL.%27admin/?a=commitdiff_plain;h=1505db42a26f9a3e7b3934d8d48e190e39a06924;p=lhc%2Fweb%2Fwiklou.git Replace our mb_substr() fallback implementation with one which is not quite so horrible... While not too awful on smallish strings, the way it worked was *murder* on large input: the *entire string* would be broken up into an array of individual characters, sliced up, then merged back together. In my testing I couldn't even get the function to complete in a reasonable time for, say, 127k worth of text... not only did the regex split take forever, but it would eat an insane amount of memory, likely triggering memory_limit hits in a sane world. The new implementation counts characters from the beginning or end of a string to determine the byte-based offsets to use for substr() start and count parameters, and only uses a couple temporary dupes of the string in memory. For typical short offset/count cases (take or trim one or a few characters) this performs about 3-5x worse than native mb_substr() in my testing. Large offsets are optimized by first skipping the same number of bytes as characters, since all characters take at least one byte. On primarily Latin text this made some of my test cases actually *faster* than native mb_substr()! ;) For non-Latin texts this takes out a fair chunk of our work, but can still leave us with very slow execution -- e.g. ~30ms to get through a few dozen kilobytes' worth of offset on Japanese text. But at least it completes now! This could probably be optimized further, perhaps skipping progressively smaller chunks in binary-chop fashion.
:) For fun, my profiling results (profiling & test scripts are in a little git repo which I would push to gitorious to poke at, but gitorious hates me right now and won't finish my repo setup): strlen mb_strlen short ascii - 0.0019ms - 19 strlen xmb_strlen short ascii - 0.0672ms - 19 strlen mb_strlen short unicode - 0.0019ms - 19 strlen xmb_strlen short unicode - 0.0657ms - 19 strlen mb_strlen long ascii - 0.0826ms - 20000 strlen xmb_strlen long ascii - 0.1236ms - 20000 strlen mb_strlen long unicode - 0.0774ms - 20000 strlen xmb_strlen long unicode - 0.1901ms - 20000 strlen mb_strlen san francisco - 0.4775ms - 126700 strlen xmb_strlen san francisco - 0.4474ms - 126700 substr mb_substr short ascii first - 0.0022ms - 1-byte string ("s") <- native substr xmb_substr short ascii first - 0.0168ms - 1-byte string ("s") <- old fallback substr xmb_substr3 short ascii first - 0.0069ms - 1-byte string ("s") <- new fallback substr mb_substr short ascii last - 0.0023ms - 1-byte string ("s") substr xmb_substr short ascii last - 0.0171ms - 1-byte string ("s") substr xmb_substr3 short ascii last - 0.0113ms - 1-byte string ("s") substr mb_substr short ascii trim last 9 - 0.0023ms - 10-byte string ("short asci") substr xmb_substr short ascii trim last 9 - 0.0183ms - 10-byte string ("short asci") substr xmb_substr3 short ascii trim last 9 - 0.0119ms - 10-byte string ("short asci") substr mb_substr short ascii middle 3 - 0.0022ms - 3-byte string ("sci") substr xmb_substr short ascii middle 3 - 0.0171ms - 3-byte string ("sci") substr xmb_substr3 short ascii middle 3 - 0.0149ms - 3-byte string ("sci") substr mb_substr short unicode first - 0.0022ms - 1-byte string ("s") substr xmb_substr short unicode first - 0.0184ms - 1-byte string ("s") substr xmb_substr3 short unicode first - 0.0071ms - 1-byte string ("s") substr mb_substr short unicode last - 0.0026ms - 2-byte string ("ß") substr xmb_substr short unicode last - 0.0187ms - 2-byte string ("ß") substr xmb_substr3 short unicode last - 
0.0130ms - 2-byte string ("ß") substr mb_substr short unicode trim last 9 - 0.0024ms - 14-byte string ("short áéíó") substr xmb_substr short unicode trim last 9 - 0.0200ms - 14-byte string ("short áéíó") substr xmb_substr3 short unicode trim last 9 - 0.0137ms - 14-byte string ("short áéíó") substr mb_substr short unicode middle 3 - 0.0022ms - 6-byte string ("éíó") substr xmb_substr short unicode middle 3 - 0.0188ms - 6-byte string ("éíó") substr xmb_substr3 short unicode middle 3 - 0.0189ms - 6-byte string ("éíó") substr mb_substr san fran first - 0.0022ms - 1-byte string ("{") substr xmb_substr3 san fran first - 0.0069ms - 1-byte string ("{") substr mb_substr san fran last - 0.8914ms - 1-byte string ("\n") substr xmb_substr3 san fran last - 0.0109ms - 1-byte string ("\n") substr mb_substr san fran non-first - 0.5995ms - 127318-byte string (c00cabc812ac347bd2e81a3e3f04e23d) substr xmb_substr3 san fran non-first - 0.0213ms - 127318-byte string (c00cabc812ac347bd2e81a3e3f04e23d) substr mb_substr san fran middle 1k - 0.2218ms - 1025-byte string (c42eb5c511670f72ff4593a39219682c) substr xmb_substr3 san fran middle 1k - 0.3883ms - 1025-byte string (c42eb5c511670f72ff4593a39219682c) substr mb_substr boston-ja first - 0.0021ms - 1-byte string ("{") substr xmb_substr3 boston-ja first - 0.0068ms - 1-byte string ("{") substr mb_substr boston-ja last - 0.5497ms - 1-byte string ("\n") substr xmb_substr3 boston-ja last - 0.0110ms - 1-byte string ("\n") substr mb_substr boston-ja non-first - 0.4128ms - 127637-byte string (933e70d1d10f4d64cdfbd69b58592cd4) substr xmb_substr3 boston-ja non-first - 0.0216ms - 127637-byte string (933e70d1d10f4d64cdfbd69b58592cd4) substr mb_substr boston-ja middle 1k - 0.2237ms - 2006-byte string (1eaa8554ff4507109b1cba7a597d82bf) substr xmb_substr3 boston-ja middle 1k - 30.6811ms - 2006-byte string (1eaa8554ff4507109b1cba7a597d82bf) --- diff --git a/includes/GlobalFunctions.php b/includes/GlobalFunctions.php index dcf8e8d8ef..d59c7feeb4 100644 --- 
if ( !function_exists( 'mb_substr' ) ) {
	/**
	 * Fallback implementation for mb_substr, hardcoded to UTF-8.
	 * Attempts to be at least _moderately_ efficient; best optimized
	 * for relatively small offset and count values -- about 5x slower
	 * than native mb_substr in my testing.
	 *
	 * Larger offsets are still fairly efficient for Latin text, but
	 * can be up to 100x slower than native if the text is heavily
	 * multibyte and we have to slog through a few hundred kb.
	 *
	 * @param string $str UTF-8 encoded input
	 * @param int $start Character offset of the first character to keep;
	 *   a negative value counts back from the end of the string
	 * @param int|string|null $count Number of characters to keep; a negative
	 *   value trims that many characters off the end. The default sentinel
	 *   'end' (or null, as accepted by native mb_substr) keeps everything up
	 *   to the end of the string.
	 * @return string The selected substring; empty if the range is empty
	 */
	function mb_substr( $str, $start, $count = 'end' ) {
		if ( $start != 0 ) {
			$split = mb_substr_split_unicode( $str, intval( $start ) );
			// substr() yields false rather than '' for an out-of-range start
			// on PHP < 8; normalize so callers always receive a string.
			$str = (string)substr( $str, $split );
		}

		if ( $count !== 'end' && $count !== null ) {
			$split = mb_substr_split_unicode( $str, intval( $count ) );
			$str = (string)substr( $str, 0, $split );
		}

		return $str;
	}

	/**
	 * Translate a character offset within a UTF-8 string into a byte offset.
	 *
	 * @param string $str UTF-8 encoded input
	 * @param int $splitPos Character offset; positive values are counted from
	 *   the start of the string, negative values from the end
	 * @return int Byte offset of the split point, clamped to the range
	 *   0 .. strlen( $str )
	 */
	function mb_substr_split_unicode( $str, $splitPos ) {
		if ( $splitPos == 0 ) {
			return 0;
		}

		$byteLen = strlen( $str );

		if ( $splitPos > 0 ) {
			if ( $splitPos > 256 ) {
				// Optimize large string offsets by skipping ahead N bytes.
				// This will cut out most of our slow time on Latin-based text,
				// and 1/2 to 1/3 on East European and Asian scripts.
				// Never skip past the end of the string.
				$bytePos = min( $splitPos, $byteLen );
				// Do not land in the middle of a character: move past tail bytes
				while ( $bytePos < $byteLen && $str[$bytePos] >= "\x80" && $str[$bytePos] < "\xc0" ) {
					++$bytePos;
				}
				$charPos = mb_strlen( substr( $str, 0, $bytePos ) );
			} else {
				$charPos = 0;
				$bytePos = 0;
			}

			// Stop at the end of the string even if fewer than $splitPos
			// characters are available, so that oversized offsets neither
			// spin needlessly nor produce an out-of-range byte position.
			while ( $bytePos < $byteLen && $charPos++ < $splitPos ) {
				++$bytePos;
				// Move past any tail bytes
				while ( $bytePos < $byteLen && $str[$bytePos] >= "\x80" && $str[$bytePos] < "\xc0" ) {
					++$bytePos;
				}
			}
		} else {
			$splitPosX = $splitPos + 1;
			$charPos = 0; // relative to end of string; we don't care about the actual char position here
			$bytePos = $byteLen;
			while ( $bytePos > 0 && $charPos-- >= $splitPosX ) {
				--$bytePos;
				// Move back over any tail bytes to the character's lead byte
				while ( $bytePos > 0 && $str[$bytePos] >= "\x80" && $str[$bytePos] < "\xc0" ) {
					--$bytePos;
				}
			}
		}

		return $bytePos;
	}
}