(bug 36839) Use mb_check_encoding() if available.
authorlupo <lupo.bugzilla@gmail.com>
Thu, 31 May 2012 12:21:36 +0000 (14:21 +0200)
committerlupo <lupo.bugzilla@gmail.com>
Thu, 31 May 2012 12:21:36 +0000 (14:21 +0200)
This is not a real fix for the cause of the bug (which is a
pcre.recursion_limit that is far too low), but I do wonder
about the efficiency of using a regexp to test for valid
UTF-8 encoding. After all the regexp has to be compiled first
into a state machine.

Patch set 2: Php unit test for Language.checkTitleEncoding
Patch set 3: benchmark
Patch set 4: add benchmark for non-capturing subgroup in regexp, and
since that's faster than a capturing subgroup, use it in
checkTitleEncoding() in the regexp branch.
Patch set 5: use Tim's suggestion (once-only pattern) in the regexp
branch. Also add to benchmark.

Change-Id: I551f096921d4c9c57cbcb091b80ab5970ca86a9b

languages/Language.php
maintenance/benchmarks/bench_utf8_title_check.php [new file with mode: 0644]
tests/phpunit/languages/LanguageTest.php

index 9f00d04..fd156a4 100644 (file)
@@ -2396,8 +2396,12 @@ class Language {
                        return $s;
                }
 
-               $isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
-                               '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
+               if ( function_exists( 'mb_check_encoding' ) ) {
+                       $isutf8 = mb_check_encoding( $s, 'UTF-8' );
+               } else {
+                       $isutf8 = preg_match( '/^(?>[\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
+                                       '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
+               }
                if ( $isutf8 ) {
                        return $s;
                }
diff --git a/maintenance/benchmarks/bench_utf8_title_check.php b/maintenance/benchmarks/bench_utf8_title_check.php
new file mode 100644 (file)
index 0000000..2e4b902
--- /dev/null
@@ -0,0 +1,107 @@
+<?php
+/**\r
+ * @file\r
+ * @ingroup Benchmark\r
+ */\r
+\r
+require_once( dirname( __FILE__ ) . '/Benchmarker.php' );\r
+
+/**
+ * This little benchmark executes the regexp used in Language->checkTitleEncoding() and compares its execution time
+ * against that of mb_check_encoding, if available.
+ */\r
+class bench_utf8_title_check extends Benchmarker {
+
+       private $canRun;
+
+       private $data;
+
+       public function __construct() {\r
+               parent::__construct();
+
+               $this->data = array (
+                       "",
+                       "United States of America", // 7bit ASCII
+                       "S%C3%A9rie%20t%C3%A9l%C3%A9vis%C3%A9e",
+                       "Acteur%7CAlbert%20Robbins%7CAnglais%7CAnn%20Donahue%7CAnthony%20E.%20Zuiker%7CCarol%20Mendelsohn",
+                       // This comes from bug 36839
+                       "Acteur%7CAlbert%20Robbins%7CAnglais%7CAnn%20Donahue%7CAnthony%20E.%20Zuiker%7CCarol%20Mendelsohn%7C"
+                       . "Catherine%20Willows%7CDavid%20Hodges%7CDavid%20Phillips%7CGil%20Grissom%7CGreg%20Sanders%7CHodges%7C"
+                       . "Internet%20Movie%20Database%7CJim%20Brass%7CLady%20Heather%7C"
+                       . "Les%20Experts%20(s%C3%A9rie%20t%C3%A9l%C3%A9vis%C3%A9e)%7CLes%20Experts%20:%20Manhattan%7C"
+                       . "Les%20Experts%20:%20Miami%7CListe%20des%20personnages%20des%20Experts%7C"
+                       . "Liste%20des%20%C3%A9pisodes%20des%20Experts%7CMod%C3%A8le%20discussion:Palette%20Les%20Experts%7C"
+                       . "Nick%20Stokes%7CPersonnage%20de%20fiction%7CPersonnage%20fictif%7CPersonnage%20de%20fiction%7C"
+                       . "Personnages%20r%C3%A9currents%20dans%20Les%20Experts%7CRaymond%20Langston%7CRiley%20Adams%7C"
+                       . "Saison%201%20des%20Experts%7CSaison%2010%20des%20Experts%7CSaison%2011%20des%20Experts%7C"
+                       . "Saison%2012%20des%20Experts%7CSaison%202%20des%20Experts%7CSaison%203%20des%20Experts%7C"
+                       . "Saison%204%20des%20Experts%7CSaison%205%20des%20Experts%7CSaison%206%20des%20Experts%7C"
+                       . "Saison%207%20des%20Experts%7CSaison%208%20des%20Experts%7CSaison%209%20des%20Experts%7C"
+                       . "Sara%20Sidle%7CSofia%20Curtis%7CS%C3%A9rie%20t%C3%A9l%C3%A9vis%C3%A9e%7CWallace%20Langham%7C"
+                       . "Warrick%20Brown%7CWendy%20Simms%7C%C3%89tats-Unis"
+               );
+
+               $this->canRun = function_exists ( 'mb_check_encoding' );
+
+               if ( $this->canRun ) {\r
+                       $this->mDescription = "Benchmark for using a regexp vs. mb_check_encoding to check for UTF-8 encoding.";
+                       mb_internal_encoding( 'UTF-8' );
+               } else {
+                       $this->mDescription = "CANNOT RUN benchmark using mb_check_encoding: function not available.";
+               }\r
+       }\r
+\r
+       public function execute() {
+               if ( !$this->canRun ) {
+                       return;
+               }
+               $benchmarks = array();
+               foreach ($this->data as $val) {
+                       $benchmarks[] = array(
+                               'function' => array( $this, 'use_regexp' ),
+                               'args' => array( rawurldecode ( $val ) )
+                       );
+                       $benchmarks[] = array(
+                               'function' => array( $this, 'use_regexp_non_capturing' ),
+                               'args' => array( rawurldecode ( $val ) )
+                       );
+                       $benchmarks[] = array(
+                               'function' => array( $this, 'use_regexp_once_only' ),
+                               'args' => array( rawurldecode ( $val ) )
+                       );
+                       $benchmarks[] = array(\r
+                               'function' => array( $this, 'use_mb_check_encoding' ),\r
+                               'args' => array( rawurldecode ( $val ) )\r
+                       );
+               }\r
+               $this->bench( $benchmarks );
+               print $this->getFormattedResults();\r
+       }\r
+
+       private $isutf8;
+\r
+       function use_regexp( $s ) {
+               $this->isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
+                               '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );\r
+       }\r
+\r
+       function use_regexp_non_capturing( $s ) {
+               // Same as above with a non-capturing subgroup.
+               $this->isutf8 = preg_match( '/^(?:[\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
+                               '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
+       }
+
+       function use_regexp_once_only( $s ) {
+               // Same as above with a once-only subgroup.
+               $this->isutf8 = preg_match( '/^(?>[\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
+                               '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
+       }
+
+       function use_mb_check_encoding( $s ) {\r
+               $this->isutf8 = mb_check_encoding( $s, 'UTF-8' );\r
+       }\r
+\r
+}\r
+\r
+$maintClass = 'bench_utf8_title_check';\r
+require_once( RUN_MAINTENANCE_IF_MAIN );\r
index 4239670..a1a6d59 100644 (file)
@@ -791,4 +791,68 @@ class LanguageTest extends MediaWikiTestCase {
                        ),
                );
        }
+
+       /**\r
+        * @dataProvider provideCheckTitleEncodingData\r
+        */\r
+       function testCheckTitleEncoding( $s ) {\r
+               $this->assertEquals(\r
+                       $s,\r
+                       $this->lang->checkTitleEncoding($s),\r
+                       "checkTitleEncoding('$s')"\r
+               );\r
+       }\r
+
+       function provideCheckTitleEncodingData() {
+               return array (
+                       array( "" ),
+                       array( "United States of America" ), // 7bit ASCII
+                       array( rawurldecode( "S%C3%A9rie%20t%C3%A9l%C3%A9vis%C3%A9e" ) ),
+                       array(
+                               rawurldecode(
+                                       "Acteur%7CAlbert%20Robbins%7CAnglais%7CAnn%20Donahue%7CAnthony%20E.%20Zuiker%7CCarol%20Mendelsohn"
+                               )
+                       ),
+                       // The following two data sets come from bug 36839. They fail if checkTitleEncoding uses a regexp to test for
+                       // valid UTF-8 encoding and the pcre.recursion_limit is low (like, say, 1024). They succeed if checkTitleEncoding
+                   // uses mb_check_encoding for its test.
+                       array(
+                               rawurldecode(
+                                       "Acteur%7CAlbert%20Robbins%7CAnglais%7CAnn%20Donahue%7CAnthony%20E.%20Zuiker%7CCarol%20Mendelsohn%7C"
+                                       . "Catherine%20Willows%7CDavid%20Hodges%7CDavid%20Phillips%7CGil%20Grissom%7CGreg%20Sanders%7CHodges%7C"
+                                       . "Internet%20Movie%20Database%7CJim%20Brass%7CLady%20Heather%7C"
+                                       . "Les%20Experts%20(s%C3%A9rie%20t%C3%A9l%C3%A9vis%C3%A9e)%7CLes%20Experts%20:%20Manhattan%7C"
+                                       . "Les%20Experts%20:%20Miami%7CListe%20des%20personnages%20des%20Experts%7C"
+                                       . "Liste%20des%20%C3%A9pisodes%20des%20Experts%7CMod%C3%A8le%20discussion:Palette%20Les%20Experts%7C"
+                                       . "Nick%20Stokes%7CPersonnage%20de%20fiction%7CPersonnage%20fictif%7CPersonnage%20de%20fiction%7C"
+                                       . "Personnages%20r%C3%A9currents%20dans%20Les%20Experts%7CRaymond%20Langston%7CRiley%20Adams%7C"
+                                       . "Saison%201%20des%20Experts%7CSaison%2010%20des%20Experts%7CSaison%2011%20des%20Experts%7C"
+                                       . "Saison%2012%20des%20Experts%7CSaison%202%20des%20Experts%7CSaison%203%20des%20Experts%7C"
+                                       . "Saison%204%20des%20Experts%7CSaison%205%20des%20Experts%7CSaison%206%20des%20Experts%7C"
+                                       . "Saison%207%20des%20Experts%7CSaison%208%20des%20Experts%7CSaison%209%20des%20Experts%7C"
+                                       . "Sara%20Sidle%7CSofia%20Curtis%7CS%C3%A9rie%20t%C3%A9l%C3%A9vis%C3%A9e%7CWallace%20Langham%7C"
+                                       . "Warrick%20Brown%7CWendy%20Simms%7C%C3%89tats-Unis"
+                               ),
+                       ),
+                       array(
+                               rawurldecode(
+                                       "Mod%C3%A8le%3AArrondissements%20homonymes%7CMod%C3%A8le%3ABandeau%20standard%20pour%20page%20d'homonymie%7C"
+                                       . "Mod%C3%A8le%3ABatailles%20homonymes%7CMod%C3%A8le%3ACantons%20homonymes%7C"
+                                       . "Mod%C3%A8le%3ACommunes%20fran%C3%A7aises%20homonymes%7CMod%C3%A8le%3AFilms%20homonymes%7C"
+                                       . "Mod%C3%A8le%3AGouvernements%20homonymes%7CMod%C3%A8le%3AGuerres%20homonymes%7CMod%C3%A8le%3AHomonymie%7C"
+                                       . "Mod%C3%A8le%3AHomonymie%20bateau%7CMod%C3%A8le%3AHomonymie%20d'%C3%A9tablissements%20scolaires%20ou"
+                                       . "%20universitaires%7CMod%C3%A8le%3AHomonymie%20d'%C3%AEles%7CMod%C3%A8le%3AHomonymie%20de%20clubs%20sportifs%7C"
+                                       . "Mod%C3%A8le%3AHomonymie%20de%20comt%C3%A9s%7CMod%C3%A8le%3AHomonymie%20de%20monument%7C"
+                                       . "Mod%C3%A8le%3AHomonymie%20de%20nom%20romain%7CMod%C3%A8le%3AHomonymie%20de%20parti%20politique%7C"
+                                       . "Mod%C3%A8le%3AHomonymie%20de%20route%7CMod%C3%A8le%3AHomonymie%20dynastique%7C"
+                                       . "Mod%C3%A8le%3AHomonymie%20vid%C3%A9oludique%7CMod%C3%A8le%3AHomonymie%20%C3%A9difice%20religieux%7C"
+                                       . "Mod%C3%A8le%3AInternationalisation%7CMod%C3%A8le%3AIsom%C3%A9rie%7CMod%C3%A8le%3AParonymie%7C"
+                                       . "Mod%C3%A8le%3APatronyme%7CMod%C3%A8le%3APatronyme%20basque%7CMod%C3%A8le%3APatronyme%20italien%7C"
+                                       . "Mod%C3%A8le%3APatronymie%7CMod%C3%A8le%3APersonnes%20homonymes%7CMod%C3%A8le%3ASaints%20homonymes%7C"
+                                       . "Mod%C3%A8le%3ATitres%20homonymes%7CMod%C3%A8le%3AToponymie%7CMod%C3%A8le%3AUnit%C3%A9s%20homonymes%7C"
+                                       . "Mod%C3%A8le%3AVilles%20homonymes%7CMod%C3%A8le%3A%C3%89difices%20religieux%20homonymes"
+                               )
+                       )
+               );
+       }
 }