'StoreFileOp' => __DIR__ . '/includes/filebackend/FileOp.php',
'StreamFile' => __DIR__ . '/includes/StreamFile.php',
'StringPrefixSearch' => __DIR__ . '/includes/PrefixSearch.php',
- 'StringUtils' => __DIR__ . '/includes/utils/StringUtils.php',
+ 'StringUtils' => __DIR__ . '/includes/libs/StringUtils.php',
'StripState' => __DIR__ . '/includes/parser/StripState.php',
'StubObject' => __DIR__ . '/includes/StubObject.php',
'StubUserLang' => __DIR__ . '/includes/StubObject.php',
--- /dev/null
+<?php
+/**
+ * Methods to play with strings.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ */
+
+/**
+ * A collection of static methods to play with strings.
+ */
+class StringUtils {
+ /**
+  * Determine whether a string is well-formed UTF-8.
+  *
+  * Invalid byte sequences and overlong encodings are rejected; different
+  * Unicode normalisations are not distinguished.
+  *
+  * When mb_check_encoding() is available it is used, hardcoded to UTF-8,
+  * with an additional regex guard on PHP builds whose mbstring accepts
+  * values above U+10FFFF. When it is unavailable, or when $disableMbstring
+  * is true, a pure PHP implementation built on PCRE is used instead. The
+  * $disableMbstring switch is mostly intended for unit testing that
+  * internal implementation.
+  *
+  * @since 1.21
+  * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
+  * In particular, the pure PHP code path did not in fact check for overlong forms.
+  * Beware of this when backporting code to that version of MediaWiki.
+  *
+  * @param string $value String to check (cast to string before checking)
+  * @param bool $disableMbstring Whether to use the pure PHP
+  *   implementation instead of trying mb_check_encoding. Intended for unit
+  *   testing. Default: false
+  *
+  * @return bool Whether the given $value is a valid UTF-8 encoded string
+  */
+ static function isUtf8( $value, $disableMbstring = false ) {
+     $value = (string)$value;
+
+     if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) {
+         // Before PHP 5.4, mbstring incorrectly allows values above U+10FFFF.
+         // Probe once whether this build already rejects them.
+         static $mbRejectsAboveMax;
+         if ( $mbRejectsAboveMax === null ) {
+             $mbRejectsAboveMax = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
+         }
+
+         if ( !mb_check_encoding( $value, 'UTF-8' ) ) {
+             return false;
+         }
+         if ( $mbRejectsAboveMax ) {
+             return true;
+         }
+
+         // Old mbstring: reject lead bytes that can only encode values above U+10FFFF
+         return preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0;
+     }
+
+     // A string without any high-bit byte is plain ASCII and therefore valid
+     if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) {
+         return true;
+     }
+
+     // PCRE implements repetition using recursion. Matching *valid* sequences would
+     // need a repetition as long as the input and could overflow the stack (segfault),
+     // so we search for *invalid* sequences instead, which are at most 5 bytes long.
+     // Several short patterns are used rather than one long one for performance.
+     static $invalidPatterns;
+     if ( $invalidPatterns === null ) {
+         $tail = "[\x80-\xbf]";
+         $noTail = "(?!$tail)"; // "(?:[^\x80-\xbf]|$)" would work here
+         $invalidPatterns = array(
+             // Continuation byte at the start
+             "/^$tail/",
+
+             // ASCII byte followed by a continuation byte
+             "/[\\x00-\x7f]$tail/S",
+
+             // Illegal byte
+             "/[\xc0\xc1\xf5-\xff]/S",
+
+             // Invalid 2-byte sequence, or valid one then an extra continuation byte
+             "/[\xc2-\xdf](?!$tail$noTail)/S",
+
+             // Invalid 3-byte sequence, or valid one then an extra continuation byte
+             "/\xe0(?![\xa0-\xbf]$tail$noTail)/",
+             "/[\xe1-\xec\xee\xef](?!{$tail}{2}$noTail)/S",
+             "/\xed(?![\x80-\x9f]$tail$noTail)/",
+
+             // Invalid 4-byte sequence, or valid one then an extra continuation byte
+             "/\xf0(?![\x90-\xbf]{$tail}{2}$noTail)/",
+             "/[\xf1-\xf3](?!{$tail}{3}$noTail)/S",
+             "/\xf4(?![\x80-\x8f]{$tail}{2}$noTail)/",
+         );
+     }
+
+     // Any hit (or a PCRE failure, which returns false) marks the input as invalid
+     foreach ( $invalidPatterns as $pattern ) {
+         if ( preg_match( $pattern, $value ) !== 0 ) {
+             return false;
+         }
+     }
+
+     return true;
+ }
+
+ /**
+  * Perform an operation equivalent to
+  *
+  *     preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
+  *
+  * except that it's worst-case O(N) instead of O(N^2)
+  *
+  * The subject is split on the start delimiter, and in every piece that
+  * contains the end delimiter everything up to and including its first
+  * occurrence is swapped for $replace. $replace is inserted verbatim.
+  * Pieces without an end delimiter are emitted unchanged, with their
+  * start delimiter restored.
+  *
+  * Compared to delimiterReplace(), this implementation is fast but memory-
+  * hungry and inflexible. The memory requirements are such that I don't
+  * recommend using it on anything but guaranteed small chunks of text.
+  *
+  * @param string $startDelim Literal start delimiter
+  * @param string $endDelim Literal end delimiter
+  * @param string $replace Text substituted for each delimited section
+  * @param string $subject Text to process
+  *
+  * @return string
+  */
+ static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
+     $pieces = explode( $startDelim, $subject );
+     // Whatever precedes the first start delimiter is copied through as-is
+     $result = array_shift( $pieces );
+
+     foreach ( $pieces as $piece ) {
+         $endAt = strpos( $piece, $endDelim );
+         if ( $endAt !== false ) {
+             // Drop the delimited content and keep what follows the end delimiter
+             $result .= $replace . substr( $piece, $endAt + strlen( $endDelim ) );
+             continue;
+         }
+
+         // Unterminated section: put the start delimiter back
+         $result .= $startDelim . $piece;
+     }
+
+     return $result;
+ }
+
+ /**
+  * Perform an operation equivalent to
+  *
+  *   preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject )
+  *
+  * This implementation is slower than hungryDelimiterReplace but uses far less
+  * memory. The delimiters are literal strings, not regular expressions.
+  *
+  * If the start delimiter ends with an initial substring of the end delimiter,
+  * e.g. in the case of C-style comments, the behavior differs from the model
+  * regex. In this implementation, the end must share no characters with the
+  * start, so e.g. /*\/ is not considered to be both the start and end of a
+  * comment. /*\/xy/*\/ is considered to be a single comment with contents /xy/.
+  *
+  * When several start delimiters precede an end delimiter, the outermost
+  * (first) start is the one that is paired with it. End delimiters with no
+  * preceding start, and a start with no following end, are copied to the
+  * output unchanged.
+  *
+  * @param string $startDelim Start delimiter
+  * @param string $endDelim End delimiter
+  * @param callable $callback Function to call on each match. It receives an
+  *   array whose element 0 is the whole match including both delimiters and
+  *   whose element 1 is the text between them; its return value is appended
+  *   to the output in place of the match.
+  * @param string $subject
+  * @param string $flags Regular expression flags. If they contain "i", the
+  *   end delimiter is also compared case-insensitively.
+  * @throws InvalidArgumentException If the scanner matches an empty token,
+  *   which happens when a delimiter is the empty string
+  * @return string
+  */
+ static function delimiterReplaceCallback( $startDelim, $endDelim, $callback,
+     $subject, $flags = ''
+ ) {
+     $inputPos = 0;
+     $outputPos = 0;
+     // Only meaningful while $foundStart is true; initialised for clarity
+     $contentPos = 0;
+     $output = '';
+     $foundStart = false;
+     $encStart = preg_quote( $startDelim, '!' );
+     $encEnd = preg_quote( $endDelim, '!' );
+     $strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
+     $endLength = strlen( $endDelim );
+     // Loop invariants: neither the subject nor the pattern changes while scanning
+     $subjectLength = strlen( $subject );
+     $regex = "!($encStart)|($encEnd)!S$flags";
+     $m = array();
+
+     while ( $inputPos < $subjectLength &&
+         preg_match( $regex, $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
+     ) {
+         $tokenOffset = $m[0][1];
+         if ( $m[1][0] != '' ) {
+             if ( $foundStart &&
+                 $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0
+             ) {
+                 # An end match is present at the same location
+                 $isStart = false;
+                 $tokenLength = $endLength;
+             } else {
+                 $isStart = true;
+                 $tokenLength = strlen( $m[0][0] );
+             }
+         } elseif ( isset( $m[2] ) && $m[2][0] != '' ) {
+             # preg_match() drops trailing unmatched groups, so $m[2] is absent when
+             # group 1 matched the empty string; isset() avoids a notice in that case
+             $isStart = false;
+             $tokenLength = strlen( $m[0][0] );
+         } else {
+             throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
+         }
+
+         if ( $isStart ) {
+             # Only move the start position if we haven't already found a start
+             # This means that START START END matches outer pair
+             if ( !$foundStart ) {
+                 # Found start
+                 $inputPos = $tokenOffset + $tokenLength;
+                 # Write out the non-matching section
+                 $output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
+                 $outputPos = $tokenOffset;
+                 $contentPos = $inputPos;
+                 $foundStart = true;
+             } else {
+                 # Move the input position past the *first character* of START,
+                 # to protect against missing END when it overlaps with START
+                 $inputPos = $tokenOffset + 1;
+             }
+         } else {
+             if ( $foundStart ) {
+                 # Found match
+                 $output .= call_user_func( $callback, array(
+                     substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
+                     substr( $subject, $contentPos, $tokenOffset - $contentPos )
+                 ) );
+                 $foundStart = false;
+             } else {
+                 # Non-matching end, write it out together with the pending text
+                 # before it. Offset and length are both relative to $outputPos.
+                 $output .= substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos );
+             }
+             $inputPos = $outputPos = $tokenOffset + $tokenLength;
+         }
+     }
+     # Flush the tail, including an unterminated START section if there is one
+     if ( $outputPos < $subjectLength ) {
+         $output .= substr( $subject, $outputPos );
+     }
+
+     return $output;
+ }
+
+ /**
+  * Perform an operation equivalent to
+  *
+  *   preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject )
+  *
+  * This is a thin wrapper: it wraps $replace in a RegexlikeReplacer and
+  * hands that object's callback to delimiterReplaceCallback(), which
+  * treats both delimiters as literal strings (it applies preg_quote() to
+  * them), not as regular expressions.
+  *
+  * @param string $startDelim Start delimiter
+  * @param string $endDelim End delimiter
+  * @param string $replace Replacement string. May contain $1, which will be
+  *   replaced by the text between the delimiters
+  * @param string $subject String to search
+  * @param string $flags Regular expression flags
+  * @return string The string with the matches replaced
+  */
+ static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
+     $regexlike = new RegexlikeReplacer( $replace );
+     $callback = $regexlike->cb();
+
+     return self::delimiterReplaceCallback( $startDelim, $endDelim, $callback, $subject, $flags );
+ }
+
+ /**
+  * More or less "markup-safe" explode()
+  * Ignores any instances of the separator inside <...>
+  *
+  * NUL bytes are used internally as a stand-in for protected separators,
+  * so any NUL byte already present in $text is removed from the result.
+  *
+  * @param string $separator
+  * @param string $text
+  * @return array
+  */
+ static function explodeMarkup( $separator, $text ) {
+     $marker = "\x00";
+
+     // Drop pre-existing markers so they cannot be mistaken for protected separators
+     $text = str_replace( $marker, '', $text );
+
+     // Within HTML-like tags, swap each separator for the marker
+     $swapper = new DoubleReplacer( $separator, $marker );
+     $masked = self::delimiterReplaceCallback( '<', '>', $swapper->cb(), $text );
+
+     // Split on the separators that remain, then restore the protected ones
+     $parts = explode( $separator, $masked );
+     foreach ( $parts as $index => $part ) {
+         $parts[$index] = str_replace( $marker, $separator, $part );
+     }
+
+     return $parts;
+ }
+
+ /**
+  * Escape a string to make it suitable for inclusion in a preg_replace()
+  * replacement parameter.
+  *
+  * Backslashes are doubled first and dollar signs are then prefixed with a
+  * backslash, so the backslash added in front of "$" is not doubled again.
+  *
+  * @param string $string
+  * @return string
+  */
+ static function escapeRegexReplacement( $string ) {
+     // str_replace() applies the pairs in order: "\" => "\\", then "$" => "\$"
+     return str_replace(
+         array( '\\', '$' ),
+         array( '\\\\', '\\$' ),
+         $string
+     );
+ }
+
+ /**
+  * Workalike for explode() with limited memory usage.
+  *
+  * Always returns an Iterator. When the separator occurs more than 1000
+  * times an ExplodeIterator is returned; otherwise the result of a plain
+  * explode() is wrapped in an ArrayIterator.
+  *
+  * @param string $separator
+  * @param string $subject
+  * @return ArrayIterator|ExplodeIterator
+  */
+ static function explode( $separator, $subject ) {
+     $manyPieces = substr_count( $subject, $separator ) > 1000;
+
+     if ( !$manyPieces ) {
+         return new ArrayIterator( explode( $separator, $subject ) );
+     }
+
+     return new ExplodeIterator( $separator, $subject );
+ }
+}
+++ /dev/null
-<?php
-/**
- * Methods to play with strings.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- * http://www.gnu.org/copyleft/gpl.html
- *
- * @file
- */
-
-/**
- * A collection of static methods to play with strings.
- */
-class StringUtils {
- /**
- * Test whether a string is valid UTF-8.
- *
- * The function check for invalid byte sequences, overlong encoding but
- * not for different normalisations.
- *
- * This relies internally on the mbstring function mb_check_encoding()
- * hardcoded to check against UTF-8. Whenever the function is not available
- * we fallback to a pure PHP implementation. Setting $disableMbstring to
- * true will skip the use of mb_check_encoding, this is mostly intended for
- * unit testing our internal implementation.
- *
- * @since 1.21
- * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
- * In particular, the pure PHP code path did not in fact check for overlong forms.
- * Beware of this when backporting code to that version of MediaWiki.
- *
- * @param string $value String to check
- * @param bool $disableMbstring Whether to use the pure PHP
- * implementation instead of trying mb_check_encoding. Intended for unit
- * testing. Default: false
- *
- * @return bool Whether the given $value is a valid UTF-8 encoded string
- */
- static function isUtf8( $value, $disableMbstring = false ) {
- $value = (string)$value;
-
- // If the mbstring extension is loaded, use it. However, before PHP 5.4, values above
- // U+10FFFF are incorrectly allowed, so we have to check for them separately.
- if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) {
- static $newPHP;
- if ( $newPHP === null ) {
- $newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
- }
-
- return mb_check_encoding( $value, 'UTF-8' ) &&
- ( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
- }
-
- if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) {
- // String contains only ASCII characters, has to be valid
- return true;
- }
-
- // PCRE implements repetition using recursion; to avoid a stack overflow (and segfault)
- // for large input, we check for invalid sequences (<= 5 bytes) rather than valid
- // sequences, which can be as long as the input string is. Multiple short regexes are
- // used rather than a single long regex for performance.
- static $regexes;
- if ( $regexes === null ) {
- $cont = "[\x80-\xbf]";
- $after = "(?!$cont)"; // "(?:[^\x80-\xbf]|$)" would work here
- $regexes = array(
- // Continuation byte at the start
- "/^$cont/",
-
- // ASCII byte followed by a continuation byte
- "/[\\x00-\x7f]$cont/S",
-
- // Illegal byte
- "/[\xc0\xc1\xf5-\xff]/S",
-
- // Invalid 2-byte sequence, or valid one then an extra continuation byte
- "/[\xc2-\xdf](?!$cont$after)/S",
-
- // Invalid 3-byte sequence, or valid one then an extra continuation byte
- "/\xe0(?![\xa0-\xbf]$cont$after)/",
- "/[\xe1-\xec\xee\xef](?!$cont{2}$after)/S",
- "/\xed(?![\x80-\x9f]$cont$after)/",
-
- // Invalid 4-byte sequence, or valid one then an extra continuation byte
- "/\xf0(?![\x90-\xbf]$cont{2}$after)/",
- "/[\xf1-\xf3](?!$cont{3}$after)/S",
- "/\xf4(?![\x80-\x8f]$cont{2}$after)/",
- );
- }
-
- foreach ( $regexes as $regex ) {
- if ( preg_match( $regex, $value ) !== 0 ) {
- return false;
- }
- }
-
- return true;
- }
-
- /**
- * Perform an operation equivalent to
- *
- * preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
- *
- * except that it's worst-case O(N) instead of O(N^2)
- *
- * Compared to delimiterReplace(), this implementation is fast but memory-
- * hungry and inflexible. The memory requirements are such that I don't
- * recommend using it on anything but guaranteed small chunks of text.
- *
- * @param string $startDelim
- * @param string $endDelim
- * @param string $replace
- * @param string $subject
- *
- * @return string
- */
- static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
- $segments = explode( $startDelim, $subject );
- $output = array_shift( $segments );
- foreach ( $segments as $s ) {
- $endDelimPos = strpos( $s, $endDelim );
- if ( $endDelimPos === false ) {
- $output .= $startDelim . $s;
- } else {
- $output .= $replace . substr( $s, $endDelimPos + strlen( $endDelim ) );
- }
- }
-
- return $output;
- }
-
- /**
- * Perform an operation equivalent to
- *
- * preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject )
- *
- * This implementation is slower than hungryDelimiterReplace but uses far less
- * memory. The delimiters are literal strings, not regular expressions.
- *
- * If the start delimiter ends with an initial substring of the end delimiter,
- * e.g. in the case of C-style comments, the behavior differs from the model
- * regex. In this implementation, the end must share no characters with the
- * start, so e.g. /*\/ is not considered to be both the start and end of a
- * comment. /*\/xy/*\/ is considered to be a single comment with contents /xy/.
- *
- * @param string $startDelim Start delimiter
- * @param string $endDelim End delimiter
- * @param callable $callback Function to call on each match
- * @param string $subject
- * @param string $flags Regular expression flags
- * @throws MWException
- * @return string
- */
- static function delimiterReplaceCallback( $startDelim, $endDelim, $callback,
- $subject, $flags = ''
- ) {
- $inputPos = 0;
- $outputPos = 0;
- $output = '';
- $foundStart = false;
- $encStart = preg_quote( $startDelim, '!' );
- $encEnd = preg_quote( $endDelim, '!' );
- $strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
- $endLength = strlen( $endDelim );
- $m = array();
-
- while ( $inputPos < strlen( $subject ) &&
- preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
- ) {
- $tokenOffset = $m[0][1];
- if ( $m[1][0] != '' ) {
- if ( $foundStart &&
- $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0
- ) {
- # An end match is present at the same location
- $tokenType = 'end';
- $tokenLength = $endLength;
- } else {
- $tokenType = 'start';
- $tokenLength = strlen( $m[0][0] );
- }
- } elseif ( $m[2][0] != '' ) {
- $tokenType = 'end';
- $tokenLength = strlen( $m[0][0] );
- } else {
- throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
- }
-
- if ( $tokenType == 'start' ) {
- # Only move the start position if we haven't already found a start
- # This means that START START END matches outer pair
- if ( !$foundStart ) {
- # Found start
- $inputPos = $tokenOffset + $tokenLength;
- # Write out the non-matching section
- $output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
- $outputPos = $tokenOffset;
- $contentPos = $inputPos;
- $foundStart = true;
- } else {
- # Move the input position past the *first character* of START,
- # to protect against missing END when it overlaps with START
- $inputPos = $tokenOffset + 1;
- }
- } elseif ( $tokenType == 'end' ) {
- if ( $foundStart ) {
- # Found match
- $output .= call_user_func( $callback, array(
- substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
- substr( $subject, $contentPos, $tokenOffset - $contentPos )
- ) );
- $foundStart = false;
- } else {
- # Non-matching end, write it out
- $output .= substr( $subject, $inputPos, $tokenOffset + $tokenLength - $outputPos );
- }
- $inputPos = $outputPos = $tokenOffset + $tokenLength;
- } else {
- throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
- }
- }
- if ( $outputPos < strlen( $subject ) ) {
- $output .= substr( $subject, $outputPos );
- }
-
- return $output;
- }
-
- /**
- * Perform an operation equivalent to
- *
- * preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject )
- *
- * @param string $startDelim Start delimiter regular expression
- * @param string $endDelim End delimiter regular expression
- * @param string $replace Replacement string. May contain $1, which will be
- * replaced by the text between the delimiters
- * @param string $subject String to search
- * @param string $flags Regular expression flags
- * @return string The string with the matches replaced
- */
- static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
- $replacer = new RegexlikeReplacer( $replace );
-
- return self::delimiterReplaceCallback( $startDelim, $endDelim,
- $replacer->cb(), $subject, $flags );
- }
-
- /**
- * More or less "markup-safe" explode()
- * Ignores any instances of the separator inside <...>
- * @param string $separator
- * @param string $text
- * @return array
- */
- static function explodeMarkup( $separator, $text ) {
- $placeholder = "\x00";
-
- // Remove placeholder instances
- $text = str_replace( $placeholder, '', $text );
-
- // Replace instances of the separator inside HTML-like tags with the placeholder
- $replacer = new DoubleReplacer( $separator, $placeholder );
- $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
-
- // Explode, then put the replaced separators back in
- $items = explode( $separator, $cleaned );
- foreach ( $items as $i => $str ) {
- $items[$i] = str_replace( $placeholder, $separator, $str );
- }
-
- return $items;
- }
-
- /**
- * Escape a string to make it suitable for inclusion in a preg_replace()
- * replacement parameter.
- *
- * @param string $string
- * @return string
- */
- static function escapeRegexReplacement( $string ) {
- $string = str_replace( '\\', '\\\\', $string );
- $string = str_replace( '$', '\\$', $string );
-
- return $string;
- }
-
- /**
- * Workalike for explode() with limited memory usage.
- * Returns an Iterator
- * @param string $separator
- * @param string $subject
- * @return ArrayIterator|ExplodeIterator
- */
- static function explode( $separator, $subject ) {
- if ( substr_count( $subject, $separator ) > 1000 ) {
- return new ExplodeIterator( $separator, $subject );
- } else {
- return new ArrayIterator( explode( $separator, $subject ) );
- }
- }
-}
--- /dev/null
+<?php
+
+class StringUtilsTest extends PHPUnit_Framework_TestCase {
+
+ /**
+  * Checks StringUtils::isUtf8 through its default code path, which uses
+  * mb_check_encoding(). Skipped when the mbstring extension is not loaded.
+  *
+  * @covers StringUtils::isUtf8
+  * @dataProvider provideStringsForIsUtf8Check
+  */
+ public function testIsUtf8WithMbstring( $expected, $string ) {
+     if ( !function_exists( 'mb_check_encoding' ) ) {
+         $this->markTestSkipped( 'Test requires the mbstring PHP extension' );
+     }
+
+     $actual = StringUtils::isUtf8( $string );
+     $message = 'Testing string "' . $this->escaped( $string ) . '" with mb_check_encoding';
+
+     $this->assertEquals( $expected, $actual, $message );
+ }
+
+ /**
+  * Checks StringUtils::isUtf8 with mbstring explicitly disabled, so that
+  * the pure PHP implementation (the fallback used when
+  * mb_check_encoding() is not available) is exercised.
+  *
+  * @covers StringUtils::isUtf8
+  * @dataProvider provideStringsForIsUtf8Check
+  */
+ public function testIsUtf8WithPhpFallbackImplementation( $expected, $string ) {
+     $actual = StringUtils::isUtf8( $string, /** disable mbstring: */true );
+     $message = 'Testing string "' . $this->escaped( $string ) . '" with pure PHP implementation';
+
+     $this->assertEquals( $expected, $actual, $message );
+ }
+
+ /**
+  * Render a byte string for use in assertion messages: bytes above 127
+  * are printed as a hexadecimal \x escape, all other bytes are copied
+  * unchanged.
+  *
+  * @param string $string
+  * @return string
+  */
+ function escaped( $string ) {
+     $rendered = '';
+
+     foreach ( str_split( $string ) as $byte ) {
+         $code = ord( $byte );
+         // High-bit bytes become a literal backslash, "x" and lowercase hex digits
+         $rendered .= $code > 127 ? sprintf( '\x%x', $code ) : $byte;
+     }
+
+     return $rendered;
+ }
+
+ /**
+  * Data provider for the isUtf8 tests.
+  *
+  * Each named dataset is array( expected result, byte string to check ).
+  *
+  * See also "UTF-8 decoder capability and stress test" by
+  * Markus Kuhn:
+  * http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
+  *
+  * @return array
+  */
+ public static function provideStringsForIsUtf8Check() {
+     // Expected return values for StringUtils::isUtf8()
+     $PASS = true;
+     $FAIL = false;
+
+     return array(
+         'some ASCII' => array( $PASS, 'Some ASCII' ),
+         'euro sign' => array( $PASS, "Euro sign €" ),
+
+         'first possible sequence 1 byte' => array( $PASS, "\x00" ),
+         'first possible sequence 2 bytes' => array( $PASS, "\xc2\x80" ),
+         'first possible sequence 3 bytes' => array( $PASS, "\xe0\xa0\x80" ),
+         'first possible sequence 4 bytes' => array( $PASS, "\xf0\x90\x80\x80" ),
+         'first possible sequence 5 bytes' => array( $FAIL, "\xf8\x88\x80\x80\x80" ),
+         'first possible sequence 6 bytes' => array( $FAIL, "\xfc\x84\x80\x80\x80\x80" ),
+
+         'last possible sequence 1 byte' => array( $PASS, "\x7f" ),
+         'last possible sequence 2 bytes' => array( $PASS, "\xdf\xbf" ),
+         'last possible sequence 3 bytes' => array( $PASS, "\xef\xbf\xbf" ),
+         'last possible sequence 4 bytes (U+1FFFFF)' => array( $FAIL, "\xf7\xbf\xbf\xbf" ),
+         'last possible sequence 5 bytes' => array( $FAIL, "\xfb\xbf\xbf\xbf\xbf" ),
+         'last possible sequence 6 bytes' => array( $FAIL, "\xfd\xbf\xbf\xbf\xbf\xbf" ),
+
+         'boundary 1' => array( $PASS, "\xed\x9f\xbf" ),
+         'boundary 2' => array( $PASS, "\xee\x80\x80" ),
+         'boundary 3' => array( $PASS, "\xef\xbf\xbd" ),
+         'boundary 4' => array( $PASS, "\xf2\x80\x80\x80" ),
+         'boundary 5 (U+FFFFF)' => array( $PASS, "\xf3\xbf\xbf\xbf" ),
+         'boundary 6 (U+100000)' => array( $PASS, "\xf4\x80\x80\x80" ),
+         'boundary 7 (U+10FFFF)' => array( $PASS, "\xf4\x8f\xbf\xbf" ),
+         'boundary 8 (U+110000)' => array( $FAIL, "\xf4\x90\x80\x80" ),
+
+         'malformed 1' => array( $FAIL, "\x80" ),
+         'malformed 2' => array( $FAIL, "\xbf" ),
+         'malformed 3' => array( $FAIL, "\x80\xbf" ),
+         'malformed 4' => array( $FAIL, "\x80\xbf\x80" ),
+         'malformed 5' => array( $FAIL, "\x80\xbf\x80\xbf" ),
+         'malformed 6' => array( $FAIL, "\x80\xbf\x80\xbf\x80" ),
+         'malformed 7' => array( $FAIL, "\x80\xbf\x80\xbf\x80\xbf" ),
+         'malformed 8' => array( $FAIL, "\x80\xbf\x80\xbf\x80\xbf\x80" ),
+
+         'last byte missing 1' => array( $FAIL, "\xc0" ),
+         'last byte missing 2' => array( $FAIL, "\xe0\x80" ),
+         'last byte missing 3' => array( $FAIL, "\xf0\x80\x80" ),
+         'last byte missing 4' => array( $FAIL, "\xf8\x80\x80\x80" ),
+         'last byte missing 5' => array( $FAIL, "\xfc\x80\x80\x80\x80" ),
+         'last byte missing 6' => array( $FAIL, "\xdf" ),
+         'last byte missing 7' => array( $FAIL, "\xef\xbf" ),
+         'last byte missing 8' => array( $FAIL, "\xf7\xbf\xbf" ),
+         'last byte missing 9' => array( $FAIL, "\xfb\xbf\xbf\xbf" ),
+         'last byte missing 10' => array( $FAIL, "\xfd\xbf\xbf\xbf\xbf" ),
+
+         'extra continuation byte 1' => array( $FAIL, "e\xaf" ),
+         'extra continuation byte 2' => array( $FAIL, "\xc3\x89\xaf" ),
+         'extra continuation byte 3' => array( $FAIL, "\xef\xbc\xa5\xaf" ),
+         'extra continuation byte 4' => array( $FAIL, "\xf0\x9d\x99\xb4\xaf" ),
+
+         'impossible bytes 1' => array( $FAIL, "\xfe" ),
+         'impossible bytes 2' => array( $FAIL, "\xff" ),
+         'impossible bytes 3' => array( $FAIL, "\xfe\xfe\xff\xff" ),
+
+         'overlong sequences 1' => array( $FAIL, "\xc0\xaf" ),
+         'overlong sequences 2' => array( $FAIL, "\xc1\xaf" ),
+         'overlong sequences 3' => array( $FAIL, "\xe0\x80\xaf" ),
+         'overlong sequences 4' => array( $FAIL, "\xf0\x80\x80\xaf" ),
+         'overlong sequences 5' => array( $FAIL, "\xf8\x80\x80\x80\xaf" ),
+         'overlong sequences 6' => array( $FAIL, "\xfc\x80\x80\x80\x80\xaf" ),
+
+         'maximum overlong sequences 1' => array( $FAIL, "\xc1\xbf" ),
+         'maximum overlong sequences 2' => array( $FAIL, "\xe0\x9f\xbf" ),
+         'maximum overlong sequences 3' => array( $FAIL, "\xf0\x8f\xbf\xbf" ),
+         // Overlong 5-byte form of U+1FFFFF: a lead byte plus four continuation bytes
+         'maximum overlong sequences 4' => array( $FAIL, "\xf8\x87\xbf\xbf\xbf" ),
+         'maximum overlong sequences 5' => array( $FAIL, "\xfc\x83\xbf\xbf\xbf\xbf" ),
+
+         // ED 9F BF is U+D7FF, the last code point before the surrogate range
+         'surrogates 1 (U+D7FF)' => array( $PASS, "\xed\x9f\xbf" ),
+         'surrogates 2 (U+E000)' => array( $PASS, "\xee\x80\x80" ),
+         'surrogates 3 (U+D800)' => array( $FAIL, "\xed\xa0\x80" ),
+         'surrogates 4 (U+DBFF)' => array( $FAIL, "\xed\xaf\xbf" ),
+         'surrogates 5 (U+DC00)' => array( $FAIL, "\xed\xb0\x80" ),
+         'surrogates 6 (U+DFFF)' => array( $FAIL, "\xed\xbf\xbf" ),
+         'surrogates 7 (U+D800 U+DC00)' => array( $FAIL, "\xed\xa0\x80\xed\xb0\x80" ),
+
+         'noncharacters 1' => array( $PASS, "\xef\xbf\xbe" ),
+         'noncharacters 2' => array( $PASS, "\xef\xbf\xbf" ),
+     );
+ }
+}
+++ /dev/null
-<?php
-
-class StringUtilsTest extends PHPUnit_Framework_TestCase {
-
- /**
- * This tests StringUtils::isUtf8 whenever we have the mbstring extension
- * loaded.
- *
- * @covers StringUtils::isUtf8
- * @dataProvider provideStringsForIsUtf8Check
- */
- public function testIsUtf8WithMbstring( $expected, $string ) {
- if ( !function_exists( 'mb_check_encoding' ) ) {
- $this->markTestSkipped( 'Test requires the mbstring PHP extension' );
- }
- $this->assertEquals( $expected,
- StringUtils::isUtf8( $string ),
- 'Testing string "' . $this->escaped( $string ) . '" with mb_check_encoding'
- );
- }
-
- /**
- * This tests StringUtils::isUtf8 making sure we use the pure PHP
- * implementation used as a fallback when mb_check_encoding() is
- * not available.
- *
- * @covers StringUtils::isUtf8
- * @dataProvider provideStringsForIsUtf8Check
- */
- public function testIsUtf8WithPhpFallbackImplementation( $expected, $string ) {
- $this->assertEquals( $expected,
- StringUtils::isUtf8( $string, /** disable mbstring: */true ),
- 'Testing string "' . $this->escaped( $string ) . '" with pure PHP implementation'
- );
- }
-
- /**
- * Print high range characters as a hexadecimal
- * @param string $string
- * @return string
- */
- function escaped( $string ) {
- $escaped = '';
- $length = strlen( $string );
- for ( $i = 0; $i < $length; $i++ ) {
- $char = $string[$i];
- $val = ord( $char );
- if ( $val > 127 ) {
- $escaped .= '\x' . dechex( $val );
- } else {
- $escaped .= $char;
- }
- }
-
- return $escaped;
- }
-
- /**
- * See also "UTF-8 decoder capability and stress test" by
- * Markus Kuhn:
- * http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
- */
- public static function provideStringsForIsUtf8Check() {
- // Expected return values for StringUtils::isUtf8()
- $PASS = true;
- $FAIL = false;
-
- return array(
- 'some ASCII' => array( $PASS, 'Some ASCII' ),
- 'euro sign' => array( $PASS, "Euro sign €" ),
-
- 'first possible sequence 1 byte' => array( $PASS, "\x00" ),
- 'first possible sequence 2 bytes' => array( $PASS, "\xc2\x80" ),
- 'first possible sequence 3 bytes' => array( $PASS, "\xe0\xa0\x80" ),
- 'first possible sequence 4 bytes' => array( $PASS, "\xf0\x90\x80\x80" ),
- 'first possible sequence 5 bytes' => array( $FAIL, "\xf8\x88\x80\x80\x80" ),
- 'first possible sequence 6 bytes' => array( $FAIL, "\xfc\x84\x80\x80\x80\x80" ),
-
- 'last possible sequence 1 byte' => array( $PASS, "\x7f" ),
- 'last possible sequence 2 bytes' => array( $PASS, "\xdf\xbf" ),
- 'last possible sequence 3 bytes' => array( $PASS, "\xef\xbf\xbf" ),
- 'last possible sequence 4 bytes (U+1FFFFF)' => array( $FAIL, "\xf7\xbf\xbf\xbf" ),
- 'last possible sequence 5 bytes' => array( $FAIL, "\xfb\xbf\xbf\xbf\xbf" ),
- 'last possible sequence 6 bytes' => array( $FAIL, "\xfd\xbf\xbf\xbf\xbf\xbf" ),
-
- 'boundary 1' => array( $PASS, "\xed\x9f\xbf" ),
- 'boundary 2' => array( $PASS, "\xee\x80\x80" ),
- 'boundary 3' => array( $PASS, "\xef\xbf\xbd" ),
- 'boundary 4' => array( $PASS, "\xf2\x80\x80\x80" ),
- 'boundary 5 (U+FFFFF)' => array( $PASS, "\xf3\xbf\xbf\xbf" ),
- 'boundary 6 (U+100000)' => array( $PASS, "\xf4\x80\x80\x80" ),
- 'boundary 7 (U+10FFFF)' => array( $PASS, "\xf4\x8f\xbf\xbf" ),
- 'boundary 8 (U+110000)' => array( $FAIL, "\xf4\x90\x80\x80" ),
-
- 'malformed 1' => array( $FAIL, "\x80" ),
- 'malformed 2' => array( $FAIL, "\xbf" ),
- 'malformed 3' => array( $FAIL, "\x80\xbf" ),
- 'malformed 4' => array( $FAIL, "\x80\xbf\x80" ),
- 'malformed 5' => array( $FAIL, "\x80\xbf\x80\xbf" ),
- 'malformed 6' => array( $FAIL, "\x80\xbf\x80\xbf\x80" ),
- 'malformed 7' => array( $FAIL, "\x80\xbf\x80\xbf\x80\xbf" ),
- 'malformed 8' => array( $FAIL, "\x80\xbf\x80\xbf\x80\xbf\x80" ),
-
- 'last byte missing 1' => array( $FAIL, "\xc0" ),
- 'last byte missing 2' => array( $FAIL, "\xe0\x80" ),
- 'last byte missing 3' => array( $FAIL, "\xf0\x80\x80" ),
- 'last byte missing 4' => array( $FAIL, "\xf8\x80\x80\x80" ),
- 'last byte missing 5' => array( $FAIL, "\xfc\x80\x80\x80\x80" ),
- 'last byte missing 6' => array( $FAIL, "\xdf" ),
- 'last byte missing 7' => array( $FAIL, "\xef\xbf" ),
- 'last byte missing 8' => array( $FAIL, "\xf7\xbf\xbf" ),
- 'last byte missing 9' => array( $FAIL, "\xfb\xbf\xbf\xbf" ),
- 'last byte missing 10' => array( $FAIL, "\xfd\xbf\xbf\xbf\xbf" ),
-
- 'extra continuation byte 1' => array( $FAIL, "e\xaf" ),
- 'extra continuation byte 2' => array( $FAIL, "\xc3\x89\xaf" ),
- 'extra continuation byte 3' => array( $FAIL, "\xef\xbc\xa5\xaf" ),
- 'extra continuation byte 4' => array( $FAIL, "\xf0\x9d\x99\xb4\xaf" ),
-
- 'impossible bytes 1' => array( $FAIL, "\xfe" ),
- 'impossible bytes 2' => array( $FAIL, "\xff" ),
- 'impossible bytes 3' => array( $FAIL, "\xfe\xfe\xff\xff" ),
-
- 'overlong sequences 1' => array( $FAIL, "\xc0\xaf" ),
- 'overlong sequences 2' => array( $FAIL, "\xc1\xaf" ),
- 'overlong sequences 3' => array( $FAIL, "\xe0\x80\xaf" ),
- 'overlong sequences 4' => array( $FAIL, "\xf0\x80\x80\xaf" ),
- 'overlong sequences 5' => array( $FAIL, "\xf8\x80\x80\x80\xaf" ),
- 'overlong sequences 6' => array( $FAIL, "\xfc\x80\x80\x80\x80\xaf" ),
-
- 'maximum overlong sequences 1' => array( $FAIL, "\xc1\xbf" ),
- 'maximum overlong sequences 2' => array( $FAIL, "\xe0\x9f\xbf" ),
- 'maximum overlong sequences 3' => array( $FAIL, "\xf0\x8f\xbf\xbf" ),
- 'maximum overlong sequences 4' => array( $FAIL, "\xf8\x87\xbf\xbf" ),
- 'maximum overlong sequences 5' => array( $FAIL, "\xfc\x83\xbf\xbf\xbf\xbf" ),
-
- 'surrogates 1 (U+D799)' => array( $PASS, "\xed\x9f\xbf" ),
- 'surrogates 2 (U+E000)' => array( $PASS, "\xee\x80\x80" ),
- 'surrogates 3 (U+D800)' => array( $FAIL, "\xed\xa0\x80" ),
- 'surrogates 4 (U+DBFF)' => array( $FAIL, "\xed\xaf\xbf" ),
- 'surrogates 5 (U+DC00)' => array( $FAIL, "\xed\xb0\x80" ),
- 'surrogates 6 (U+DFFF)' => array( $FAIL, "\xed\xbf\xbf" ),
- 'surrogates 7 (U+D800 U+DC00)' => array( $FAIL, "\xed\xa0\x80\xed\xb0\x80" ),
-
- 'noncharacters 1' => array( $PASS, "\xef\xbf\xbe" ),
- 'noncharacters 2' => array( $PASS, "\xef\xbf\xbf" ),
- );
- }
-}