From: Brion Vibber <brion@users.mediawiki.org>
Date: Wed, 26 Aug 2009 05:51:21 +0000 (+0000)
Subject: Replace our mb_substr() fallback implementation with one which is not quite so horrib... 
X-Git-Tag: 1.31.0-rc.0~40056
X-Git-Url: https://git.cyclocoop.org/%27.WWW_URL.%27admin/?a=commitdiff_plain;h=1505db42a26f9a3e7b3934d8d48e190e39a06924;p=lhc%2Fweb%2Fwiklou.git

Replace our mb_substr() fallback implementation with one which is not quite so horrible...

While not too awful on smallish strings, the way it worked was *murder* on large input:
the *entire string* would be broken up into an array of individual characters, sliced up,
then merged back together.

In my testing I couldn't even get the function to complete in a reasonable time for, say,
127k worth of text... not only did the regex split take forever, but it would eat an insane
amount of memory, likely triggering memory_limit hits in a sane world.

The new implementation counts characters from the beginning or end of a string to determine
the byte-based offsets to use for substr() start and count parameters, and only uses
a couple temporary dupes of the string in memory. For typical short offset/count cases
(take or trim one or a few characters) this performs about 3-5x worse than native mb_substr()
for in my testing.

Large offsets are optimized by first skipping the same number of bytes as characters, since all
characters take at least one byte. On primarily Latin text this made some of my test cases
actually *faster* than native mb_substr()! ;) For non-Latin texts this takes out a fair chunk
of our work, but can still leave us with very slow execution -- eg ~30ms to get through a few
dozens of kilobytes worth of offset on Japanese text. But at least it completes now!

This could probably be optimized further, perhaps skipping progressively smaller chunks in
binary-chop fashion. :)


For fun, my profiling results (profiling & test scripts are in a little git repo which I would
push to gitorious to poke at, but gitorious hates me right now and won't finish my repo setup):

strlen       mb_strlen short ascii - 0.0019ms - 19
strlen      xmb_strlen short ascii - 0.0672ms - 19

strlen       mb_strlen short unicode - 0.0019ms - 19
strlen      xmb_strlen short unicode - 0.0657ms - 19

strlen       mb_strlen long ascii - 0.0826ms - 20000
strlen      xmb_strlen long ascii - 0.1236ms - 20000

strlen       mb_strlen long unicode - 0.0774ms - 20000
strlen      xmb_strlen long unicode - 0.1901ms - 20000

strlen       mb_strlen san francisco - 0.4775ms - 126700
strlen      xmb_strlen san francisco - 0.4474ms - 126700


substr       mb_substr short ascii first - 0.0022ms - 1-byte string ("s") <- native
substr      xmb_substr short ascii first - 0.0168ms - 1-byte string ("s") <- old fallback
substr     xmb_substr3 short ascii first - 0.0069ms - 1-byte string ("s") <- new fallback

substr       mb_substr short ascii last - 0.0023ms - 1-byte string ("s")
substr      xmb_substr short ascii last - 0.0171ms - 1-byte string ("s")
substr     xmb_substr3 short ascii last - 0.0113ms - 1-byte string ("s")

substr       mb_substr short ascii trim last 9 - 0.0023ms - 10-byte string ("short asci")
substr      xmb_substr short ascii trim last 9 - 0.0183ms - 10-byte string ("short asci")
substr     xmb_substr3 short ascii trim last 9 - 0.0119ms - 10-byte string ("short asci")

substr       mb_substr short ascii middle 3 - 0.0022ms - 3-byte string ("sci")
substr      xmb_substr short ascii middle 3 - 0.0171ms - 3-byte string ("sci")
substr     xmb_substr3 short ascii middle 3 - 0.0149ms - 3-byte string ("sci")

substr       mb_substr short unicode first - 0.0022ms - 1-byte string ("s")
substr      xmb_substr short unicode first - 0.0184ms - 1-byte string ("s")
substr     xmb_substr3 short unicode first - 0.0071ms - 1-byte string ("s")

substr       mb_substr short unicode last - 0.0026ms - 2-byte string ("ß")
substr      xmb_substr short unicode last - 0.0187ms - 2-byte string ("ß")
substr     xmb_substr3 short unicode last - 0.0130ms - 2-byte string ("ß")

substr       mb_substr short unicode trim last 9 - 0.0024ms - 14-byte string ("short áéíó")
substr      xmb_substr short unicode trim last 9 - 0.0200ms - 14-byte string ("short áéíó")
substr     xmb_substr3 short unicode trim last 9 - 0.0137ms - 14-byte string ("short áéíó")

substr       mb_substr short unicode middle 3 - 0.0022ms - 6-byte string ("éíó")
substr      xmb_substr short unicode middle 3 - 0.0188ms - 6-byte string ("éíó")
substr     xmb_substr3 short unicode middle 3 - 0.0189ms - 6-byte string ("éíó")

substr       mb_substr san fran first - 0.0022ms - 1-byte string ("{")
substr     xmb_substr3 san fran first - 0.0069ms - 1-byte string ("{")

substr       mb_substr san fran last - 0.8914ms - 1-byte string ("\n")
substr     xmb_substr3 san fran last - 0.0109ms - 1-byte string ("\n")

substr       mb_substr san fran non-first - 0.5995ms - 127318-byte string (c00cabc812ac347bd2e81a3e3f04e23d)
substr     xmb_substr3 san fran non-first - 0.0213ms - 127318-byte string (c00cabc812ac347bd2e81a3e3f04e23d)

substr       mb_substr san fran middle 1k - 0.2218ms - 1025-byte string (c42eb5c511670f72ff4593a39219682c)
substr     xmb_substr3 san fran middle 1k - 0.3883ms - 1025-byte string (c42eb5c511670f72ff4593a39219682c)

substr       mb_substr boston-ja first - 0.0021ms - 1-byte string ("{")
substr     xmb_substr3 boston-ja first - 0.0068ms - 1-byte string ("{")

substr       mb_substr boston-ja last - 0.5497ms - 1-byte string ("\n")
substr     xmb_substr3 boston-ja last - 0.0110ms - 1-byte string ("\n")

substr       mb_substr boston-ja non-first - 0.4128ms - 127637-byte string (933e70d1d10f4d64cdfbd69b58592cd4)
substr     xmb_substr3 boston-ja non-first - 0.0216ms - 127637-byte string (933e70d1d10f4d64cdfbd69b58592cd4)

substr       mb_substr boston-ja middle 1k - 0.2237ms - 2006-byte string (1eaa8554ff4507109b1cba7a597d82bf)
substr     xmb_substr3 boston-ja middle 1k - 30.6811ms - 2006-byte string (1eaa8554ff4507109b1cba7a597d82bf)
---

diff --git a/includes/GlobalFunctions.php b/includes/GlobalFunctions.php
index dcf8e8d8ef..d59c7feeb4 100644
--- a/includes/GlobalFunctions.php
+++ b/includes/GlobalFunctions.php
@@ -33,18 +33,71 @@ if( !function_exists('iconv') ) {
 	}
 }
 
-# UTF-8 substr function based on a PHP manual comment
 if ( !function_exists( 'mb_substr' ) ) {
-	function mb_substr( $str, $start ) {
-		$ar = array();
-		preg_match_all( '/./us', $str, $ar );
-
-		if( func_num_args() >= 3 ) {
-			$end = func_get_arg( 2 );
-			return join( '', array_slice( $ar[0], $start, $end ) );
+	/**
+	 * Fallback implementation for mb_substr, hardcoded to UTF-8.
+	 * Attempts to be at least _moderately_ efficient; best optimized
+	 * for relatively small offset and count values -- about 5x slower
+	 * than native mb_string in my testing.
+	 *
+	 * Larger offsets are still fairly efficient for Latin text, but
+	 * can be up to 100x slower than native if the text is heavily
+	 * multibyte and we have to slog through a few hundred kb.
+	 */
+	function mb_substr( $str, $start, $count='end' ) {
+		if( $start != 0 ) {
+			$split = mb_substr_split_unicode( $str, intval( $start ) );
+			$str = substr( $str, $split );
+		}
+		
+		if( $count !== 'end' ) {
+			$split = mb_substr_split_unicode( $str, intval( $count ) );
+			$str = substr( $str, 0, $split );
+		}
+		
+		return $str;
+	}
+	
+	function mb_substr_split_unicode( $str, $splitPos ) {
+		if( $splitPos == 0 ) {
+			return 0;
+		}
+		
+		$byteLen = strlen( $str );
+		
+		if( $splitPos > 0 ) {
+			if( $splitPos > 256 ) {
+				// Optimize large string offsets by skipping ahead N bytes.
+				// This will cut out most of our slow time on Latin-based text,
+				// and 1/2 to 1/3 on East European and Asian scripts.
+				$bytePos = $splitPos;
+				while ($bytePos < $byteLen && $str{$bytePos} >= "\x80" && $str{$bytePos} < "\xc0")
+					++$bytePos;
+				$charPos = mb_strlen( substr( $str, 0, $bytePos ) );
+			} else {
+				$charPos = 0;
+				$bytePos = 0;
+			}
+			
+			while( $charPos++ < $splitPos ) {
+				++$bytePos;
+				// Move past any tail bytes
+				while ($bytePos < $byteLen && $str{$bytePos} >= "\x80" && $str{$bytePos} < "\xc0")
+					++$bytePos;
+			}
 		} else {
-			return join( '', array_slice( $ar[0], $start ) );
+			$splitPosX = $splitPos + 1;
+			$charPos = 0; // relative to end of string; we don't care about the actual char position here
+			$bytePos = $byteLen;
+			while( $bytePos > 0 && $charPos-- >= $splitPosX ) {
+				--$bytePos;
+				// Move past any tail bytes
+				while ($bytePos > 0 && $str{$bytePos} >= "\x80" && $str{$bytePos} < "\xc0")
+					--$bytePos;
+			}
 		}
+		
+		return $bytePos;
 	}
 }