* (bug 31187) Fix for user JavaScript validation to allow identifiers with valid...

author Brion Vibber <brion@users.mediawiki.org>

Tue, 27 Sep 2011 22:51:36 +0000 (22:51 +0000)

committer Brion Vibber <brion@users.mediawiki.org>

Tue, 27 Sep 2011 22:51:36 +0000 (22:51 +0000)
author Brion Vibber <brion@users.mediawiki.org>
Tue, 27 Sep 2011 22:51:36 +0000 (22:51 +0000)
committer Brion Vibber <brion@users.mediawiki.org>
Tue, 27 Sep 2011 22:51:36 +0000 (22:51 +0000)
diff --git a/includes/libs/jsminplus.php b/includes/libs/jsminplus.php

index 5a3c5bd..8ed08d7 100644 (file)
--- a/includes/libs/jsminplus.php
+++ b/includes/libs/jsminplus.php
@@ -2029,13 +2029,55 @@ class JSTokenizer
                                 break;
  
                                 default:
-                                       // FIXME: add support for unicode and unicode escape sequence \uHHHH
-                                       if (preg_match('/^[$\w]+/', $input, $match))
+                                       // Fast path for identifiers: word chars followed by whitespace or various other tokens.
+                                       // Note we don't need to exclude digits in the first char, as they've already been found
+                                       // above.
+                                       if (!preg_match('/^[$\w]+(?=[\s\/\|\^\&<>\+\-\*%=!.;,\?:~\[\]\{\}\(\)@])/', $input, $match))
                                         {
-                                               $tt = in_array($match[0], $this->keywords) ? $match[0] : TOKEN_IDENTIFIER;
+                                               // Character classes per ECMA-262 edition 5.1 section 7.6
+                                               // Per spec, must accept Unicode 3.0, *may* accept later versions.
+                                               // We'll take whatever PCRE understands, which should be more recent.
+                                               $identifierStartChars = "\\p{L}\\p{Nl}" .  # UnicodeLetter
+                                                                       "\$" .
+                                                                       "_";
+                                               $identifierPartChars  = $identifierStartChars .
+                                                                       "\\p{Mn}\\p{Mc}" . # UnicodeCombiningMark
+                                                                       "\\p{Nd}" .        # UnicodeDigit
+                                                                       "\\p{Pc}";         # UnicodeConnectorPunctuation
+                                               $unicodeEscape = "\\\\u[0-9A-F-a-f]{4}";
+                                               $identifierRegex = "/^" .
+                                                                  "(?:[$identifierStartChars]|$unicodeEscape)" .
+                                                                  "(?:[$identifierPartChars]|$unicodeEscape)*" .
+                                                                  "/uS";
+                                               if (preg_match($identifierRegex, $input, $match))
+                                               {
+                                                       if (strpos($match[0], '\\') !== false) {
+                                                               // Per ECMA-262 edition 5.1, section 7.6 escape sequences should behave as if they were
+                                                               // the original chars, but only within the boundaries of the identifier.
+                                                               $decoded = preg_replace_callback('/\\\\u([0-9A-Fa-f]{4})/',
+                                                                               array(__CLASS__, 'unicodeEscapeCallback'),
+                                                                               $match[0]);
+
+                                                               // Since our original regex didn't de-escape the originals, we need to check for validity again.
+                                                               // No need to worry about token boundaries, as anything outside the identifier is illegal!
+                                                               if (!preg_match("/^[$identifierStartChars][$identifierPartChars]*$/u", $decoded)) {
+                                                                       throw $this->newSyntaxError('Illegal token');
+                                                               }
+
+                                                               // Per spec it _ought_ to work to use these escapes for keywords words as well...
+                                                               // but IE rejects them as invalid, while Firefox and Chrome treat them as identifiers
+                                                               // that don't match the keyword.
+                                                               if (in_array($decoded, $this->keywords)) {
+                                                                       throw $this->newSyntaxError('Illegal token');
+                                                               }
+
+                                                               // TODO: save the decoded form for output?
+                                                       }
+                                               }
+                                               else
+                                                       throw $this->newSyntaxError('Illegal token');
                                         }
-                                       else
-                                               throw $this->newSyntaxError('Illegal token');
+                                       $tt = in_array($match[0], $this->keywords) ? $match[0] : TOKEN_IDENTIFIER;
                         }
                 }
  
@@ -2073,6 +2115,11 @@ class JSTokenizer
         {
                 return new Exception('Parse error: ' . $m . ' in file \'' . $this->filename . '\' on line ' . $this->lineno);
         }
+
+       public static function unicodeEscapeCallback($m)
+       {
+               return html_entity_decode('&#x' . $m[1]. ';', ENT_QUOTES, 'UTF-8');
+       }
  }
  
  class JSToken
diff --git a/tests/phpunit/includes/libs/JavaScriptMinifierTest.php b/tests/phpunit/includes/libs/JavaScriptMinifierTest.php

index a0847b2..aa05500 100644 (file)
--- a/tests/phpunit/includes/libs/JavaScriptMinifierTest.php
+++ b/tests/phpunit/includes/libs/JavaScriptMinifierTest.php
@@ -78,6 +78,12 @@ class JavaScriptMinifierTest extends MediaWikiTestCase {
                         
                         // newline insertion after 1000 chars: break after the "++", not before
                         array( str_repeat( ';', 996 ) . "if(x++);", str_repeat( ';', 996 ) . "if(x++\n);" ),
+
+                       // Unicode letter characters should pass through ok in identifiers (bug 31187)
+                       array( "var KaŝSkatolVal = {}", 'var KaŝSkatolVal={}'),
+                       // And also per spec unicode char escape values should work in identifiers,
+                       // as long as it's a valid char. In future it might get normalized.
+                       array( "var Ka\\u015dSkatolVal = {}", 'var Ka\\u015dSkatolVal={}'),
                 );
         }
  
diff --git a/tests/qunit/index.html b/tests/qunit/index.html

index 25806e0..9187186 100644 (file)
--- a/tests/qunit/index.html
+++ b/tests/qunit/index.html
@@ -79,6 +79,7 @@
         <script src="data/testrunner.js"></script>
  
         <!-- QUnit: Load test suites (maintain the same order as above please) -->
+       <script src="suites/resources/mediawiki/mediawiki.jscompat.test.js"></script>
         <script src="suites/resources/mediawiki/mediawiki.test.js"></script>
         <script src="suites/resources/mediawiki/mediawiki.user.test.js"></script>
  
diff --git a/tests/qunit/suites/resources/mediawiki/mediawiki.jscompat.test.js b/tests/qunit/suites/resources/mediawiki/mediawiki.jscompat.test.js

new file mode 100644 (file)

index 0000000..52cd32c
--- /dev/null
+++ b/tests/qunit/suites/resources/mediawiki/mediawiki.jscompat.test.js
@@ -0,0 +1,35 @@
+/* Some misc JavaScript compatibility tests, just to make sure the environments we run in are consistent */
+
+module( 'mediawiki.jscompat' );
+
+test( 'Variable with Unicode letter in name', function() {
+       expect(3);
+       var orig = "some token";
+       var ŝablono = orig;
+       deepEqual( ŝablono, orig, 'ŝablono' );
+       deepEqual( \u015dablono, orig, '\\u015dablono' );
+       deepEqual( \u015Dablono, orig, '\\u015Dablono' );
+});
+
+/*
+// Not that we need this. ;)
+// This fails on IE 6-8
+// Works on IE 9, Firefox 6, Chrome 14
+test( 'Keyword workaround: "if" as variable name using Unicode escapes', function() {
+       var orig = "another token";
+       \u0069\u0066 = orig;
+       deepEqual( \u0069\u0066, orig, '\\u0069\\u0066' );
+});
+*/
+
+/*
+// Not that we need this. ;)
+// This fails on IE 6-9
+// Works on Firefox 6, Chrome 14
+test( 'Keyword workaround: "if" as member variable name using Unicode escapes', function() {
+       var orig = "another token";
+       var foo = {};
+       foo.\u0069\u0066 = orig;
+       deepEqual( foo.\u0069\u0066, orig, 'foo.\\u0069\\u0066' );
+});
+*/
author	Brion Vibber <brion@users.mediawiki.org>
	Tue, 27 Sep 2011 22:51:36 +0000 (22:51 +0000)
committer	Brion Vibber <brion@users.mediawiki.org>
	Tue, 27 Sep 2011 22:51:36 +0000 (22:51 +0000)
includes/libs/jsminplus.php		patch \| blob \| history
tests/phpunit/includes/libs/JavaScriptMinifierTest.php		patch \| blob \| history
tests/qunit/index.html		patch \| blob \| history
tests/qunit/suites/resources/mediawiki/mediawiki.jscompat.test.js	[new file with mode: 0644]	patch \| blob