From 5b9731e08ad1b373ebc329a7cf99de142a334181 Mon Sep 17 00:00:00 2001 From: lupo Date: Thu, 31 May 2012 14:21:36 +0200 Subject: [PATCH] (bug 36839) Use mb_check_encoding() if available. This is not a real fix for the cause of the bug (which is a pcre.recursion_limit that is far too low), but I do wonder about the efficiency of using a regexp to test for valid UTF-8 encoding. After all the regexp has to be compiled first into a state machine. Patch set 2: Php unit test for Language.checkTitleEncoding Patch set 3: benchmark Patch set 4: add benchmark for non-capturing subgroup in regexp, and since that's faster than a capturing subgroup, use it in checkTitleEncoding() in the regexp branch. Patch set 5: use Tim's suggestion (once-only pattern) in the regexp branch. Also add to benchmark. Change-Id: I551f096921d4c9c57cbcb091b80ab5970ca86a9b --- languages/Language.php | 8 +- .../benchmarks/bench_utf8_title_check.php | 107 ++++++++++++++++++ tests/phpunit/languages/LanguageTest.php | 64 +++++++++++ 3 files changed, 177 insertions(+), 2 deletions(-) create mode 100644 maintenance/benchmarks/bench_utf8_title_check.php diff --git a/languages/Language.php b/languages/Language.php index 9f00d040b6..fd156a41ea 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -2396,8 +2396,12 @@ class Language { return $s; } - $isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . - '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s ); + if ( function_exists( 'mb_check_encoding' ) ) { + $isutf8 = mb_check_encoding( $s, 'UTF-8' ); + } else { + $isutf8 = preg_match( '/^(?>[\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . + '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s ); + } if ( $isutf8 ) { return $s; } diff --git a/maintenance/benchmarks/bench_utf8_title_check.php b/maintenance/benchmarks/bench_utf8_title_check.php new file mode 100644 index 0000000000..2e4b90292d --- /dev/null +++ b/maintenance/benchmarks/bench_utf8_title_check.php @@ -0,0 +1,107 @@ +checkTitleEncoding() and compares its execution time + * against that of mb_check_encoding, if available. + */ +class bench_utf8_title_check extends Benchmarker { + + private $canRun; + + private $data; + + public function __construct() { + parent::__construct(); + + $this->data = array ( + "", + "United States of America", // 7bit ASCII + "S%C3%A9rie%20t%C3%A9l%C3%A9vis%C3%A9e", + "Acteur%7CAlbert%20Robbins%7CAnglais%7CAnn%20Donahue%7CAnthony%20E.%20Zuiker%7CCarol%20Mendelsohn", + // This comes from bug 36839 + "Acteur%7CAlbert%20Robbins%7CAnglais%7CAnn%20Donahue%7CAnthony%20E.%20Zuiker%7CCarol%20Mendelsohn%7C" + . "Catherine%20Willows%7CDavid%20Hodges%7CDavid%20Phillips%7CGil%20Grissom%7CGreg%20Sanders%7CHodges%7C" + . "Internet%20Movie%20Database%7CJim%20Brass%7CLady%20Heather%7C" + . "Les%20Experts%20(s%C3%A9rie%20t%C3%A9l%C3%A9vis%C3%A9e)%7CLes%20Experts%20:%20Manhattan%7C" + . "Les%20Experts%20:%20Miami%7CListe%20des%20personnages%20des%20Experts%7C" + . "Liste%20des%20%C3%A9pisodes%20des%20Experts%7CMod%C3%A8le%20discussion:Palette%20Les%20Experts%7C" + . "Nick%20Stokes%7CPersonnage%20de%20fiction%7CPersonnage%20fictif%7CPersonnage%20de%20fiction%7C" + . "Personnages%20r%C3%A9currents%20dans%20Les%20Experts%7CRaymond%20Langston%7CRiley%20Adams%7C" + . "Saison%201%20des%20Experts%7CSaison%2010%20des%20Experts%7CSaison%2011%20des%20Experts%7C" + . "Saison%2012%20des%20Experts%7CSaison%202%20des%20Experts%7CSaison%203%20des%20Experts%7C" + . "Saison%204%20des%20Experts%7CSaison%205%20des%20Experts%7CSaison%206%20des%20Experts%7C" + . "Saison%207%20des%20Experts%7CSaison%208%20des%20Experts%7CSaison%209%20des%20Experts%7C" + . "Sara%20Sidle%7CSofia%20Curtis%7CS%C3%A9rie%20t%C3%A9l%C3%A9vis%C3%A9e%7CWallace%20Langham%7C" + . "Warrick%20Brown%7CWendy%20Simms%7C%C3%89tats-Unis" + ); + + $this->canRun = function_exists ( 'mb_check_encoding' ); + + if ( $this->canRun ) { + $this->mDescription = "Benchmark for using a regexp vs. mb_check_encoding to check for UTF-8 encoding."; + mb_internal_encoding( 'UTF-8' ); + } else { + $this->mDescription = "CANNOT RUN benchmark using mb_check_encoding: function not available."; + } + } + + public function execute() { + if ( !$this->canRun ) { + return; + } + $benchmarks = array(); + foreach ($this->data as $val) { + $benchmarks[] = array( + 'function' => array( $this, 'use_regexp' ), + 'args' => array( rawurldecode ( $val ) ) + ); + $benchmarks[] = array( + 'function' => array( $this, 'use_regexp_non_capturing' ), + 'args' => array( rawurldecode ( $val ) ) + ); + $benchmarks[] = array( + 'function' => array( $this, 'use_regexp_once_only' ), + 'args' => array( rawurldecode ( $val ) ) + ); + $benchmarks[] = array( + 'function' => array( $this, 'use_mb_check_encoding' ), + 'args' => array( rawurldecode ( $val ) ) + ); + } + $this->bench( $benchmarks ); + print $this->getFormattedResults(); + } + + private $isutf8; + + function use_regexp( $s ) { + $this->isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . + '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s ); + } + + function use_regexp_non_capturing( $s ) { + // Same as above with a non-capturing subgroup. + $this->isutf8 = preg_match( '/^(?:[\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . + '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s ); + } + + function use_regexp_once_only( $s ) { + // Same as above with a once-only subgroup. + $this->isutf8 = preg_match( '/^(?>[\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . + '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s ); + } + + function use_mb_check_encoding( $s ) { + $this->isutf8 = mb_check_encoding( $s, 'UTF-8' ); + } + +} + +$maintClass = 'bench_utf8_title_check'; +require_once( RUN_MAINTENANCE_IF_MAIN ); diff --git a/tests/phpunit/languages/LanguageTest.php b/tests/phpunit/languages/LanguageTest.php index 42396708d3..a1a6d59294 100644 --- a/tests/phpunit/languages/LanguageTest.php +++ b/tests/phpunit/languages/LanguageTest.php @@ -791,4 +791,68 @@ class LanguageTest extends MediaWikiTestCase { ), ); } + + /** + * @dataProvider provideCheckTitleEncodingData + */ + function testCheckTitleEncoding( $s ) { + $this->assertEquals( + $s, + $this->lang->checkTitleEncoding($s), + "checkTitleEncoding('$s')" + ); + } + + function provideCheckTitleEncodingData() { + return array ( + array( "" ), + array( "United States of America" ), // 7bit ASCII + array( rawurldecode( "S%C3%A9rie%20t%C3%A9l%C3%A9vis%C3%A9e" ) ), + array( + rawurldecode( + "Acteur%7CAlbert%20Robbins%7CAnglais%7CAnn%20Donahue%7CAnthony%20E.%20Zuiker%7CCarol%20Mendelsohn" + ) + ), + // The following two data sets come from bug 36839. They fail if checkTitleEncoding uses a regexp to test for + // valid UTF-8 encoding and the pcre.recursion_limit is low (like, say, 1024). They succeed if checkTitleEncoding + // uses mb_check_encoding for its test. + array( + rawurldecode( + "Acteur%7CAlbert%20Robbins%7CAnglais%7CAnn%20Donahue%7CAnthony%20E.%20Zuiker%7CCarol%20Mendelsohn%7C" + . "Catherine%20Willows%7CDavid%20Hodges%7CDavid%20Phillips%7CGil%20Grissom%7CGreg%20Sanders%7CHodges%7C" + . "Internet%20Movie%20Database%7CJim%20Brass%7CLady%20Heather%7C" + . "Les%20Experts%20(s%C3%A9rie%20t%C3%A9l%C3%A9vis%C3%A9e)%7CLes%20Experts%20:%20Manhattan%7C" + . "Les%20Experts%20:%20Miami%7CListe%20des%20personnages%20des%20Experts%7C" + . "Liste%20des%20%C3%A9pisodes%20des%20Experts%7CMod%C3%A8le%20discussion:Palette%20Les%20Experts%7C" + . "Nick%20Stokes%7CPersonnage%20de%20fiction%7CPersonnage%20fictif%7CPersonnage%20de%20fiction%7C" + . "Personnages%20r%C3%A9currents%20dans%20Les%20Experts%7CRaymond%20Langston%7CRiley%20Adams%7C" + . "Saison%201%20des%20Experts%7CSaison%2010%20des%20Experts%7CSaison%2011%20des%20Experts%7C" + . "Saison%2012%20des%20Experts%7CSaison%202%20des%20Experts%7CSaison%203%20des%20Experts%7C" + . "Saison%204%20des%20Experts%7CSaison%205%20des%20Experts%7CSaison%206%20des%20Experts%7C" + . "Saison%207%20des%20Experts%7CSaison%208%20des%20Experts%7CSaison%209%20des%20Experts%7C" + . "Sara%20Sidle%7CSofia%20Curtis%7CS%C3%A9rie%20t%C3%A9l%C3%A9vis%C3%A9e%7CWallace%20Langham%7C" + . "Warrick%20Brown%7CWendy%20Simms%7C%C3%89tats-Unis" + ), + ), + array( + rawurldecode( + "Mod%C3%A8le%3AArrondissements%20homonymes%7CMod%C3%A8le%3ABandeau%20standard%20pour%20page%20d'homonymie%7C" + . "Mod%C3%A8le%3ABatailles%20homonymes%7CMod%C3%A8le%3ACantons%20homonymes%7C" + . "Mod%C3%A8le%3ACommunes%20fran%C3%A7aises%20homonymes%7CMod%C3%A8le%3AFilms%20homonymes%7C" + . "Mod%C3%A8le%3AGouvernements%20homonymes%7CMod%C3%A8le%3AGuerres%20homonymes%7CMod%C3%A8le%3AHomonymie%7C" + . "Mod%C3%A8le%3AHomonymie%20bateau%7CMod%C3%A8le%3AHomonymie%20d'%C3%A9tablissements%20scolaires%20ou" + . "%20universitaires%7CMod%C3%A8le%3AHomonymie%20d'%C3%AEles%7CMod%C3%A8le%3AHomonymie%20de%20clubs%20sportifs%7C" + . "Mod%C3%A8le%3AHomonymie%20de%20comt%C3%A9s%7CMod%C3%A8le%3AHomonymie%20de%20monument%7C" + . "Mod%C3%A8le%3AHomonymie%20de%20nom%20romain%7CMod%C3%A8le%3AHomonymie%20de%20parti%20politique%7C" + . "Mod%C3%A8le%3AHomonymie%20de%20route%7CMod%C3%A8le%3AHomonymie%20dynastique%7C" + . "Mod%C3%A8le%3AHomonymie%20vid%C3%A9oludique%7CMod%C3%A8le%3AHomonymie%20%C3%A9difice%20religieux%7C" + . "Mod%C3%A8le%3AInternationalisation%7CMod%C3%A8le%3AIsom%C3%A9rie%7CMod%C3%A8le%3AParonymie%7C" + . "Mod%C3%A8le%3APatronyme%7CMod%C3%A8le%3APatronyme%20basque%7CMod%C3%A8le%3APatronyme%20italien%7C" + . "Mod%C3%A8le%3APatronymie%7CMod%C3%A8le%3APersonnes%20homonymes%7CMod%C3%A8le%3ASaints%20homonymes%7C" + . "Mod%C3%A8le%3ATitres%20homonymes%7CMod%C3%A8le%3AToponymie%7CMod%C3%A8le%3AUnit%C3%A9s%20homonymes%7C" + . "Mod%C3%A8le%3AVilles%20homonymes%7CMod%C3%A8le%3A%C3%89difices%20religieux%20homonymes" + ) + ) + ); + } } -- 2.20.1