* (bug 31187) Fix for user JavaScript validation to allow identifiers with valid...
authorBrion Vibber <brion@users.mediawiki.org>
Tue, 27 Sep 2011 22:51:36 +0000 (22:51 +0000)
committerBrion Vibber <brion@users.mediawiki.org>
Tue, 27 Sep 2011 22:51:36 +0000 (22:51 +0000)
Followup r91591, r93020: patch to jsminplus to support Unicode chars and char escapes in identifiers

Fast-path check keeps runtime about the same on most scripts (eg jquery.js parsing was abround 4100ms both before and after on my test machine)
Slow-path code kicks in if plain ASCII word chars don't extend all the way to the next whitespace or punctuation char.
Using PCRE's Unicode properties magic to ensure that we're catching everything, following ECMA-262 edition 5.1 spec.

Note that identifiers using escapes don't get normalized to their UTF-8 form; this might be a nice thing to do as it saves a couple bytes, but currently there's no change made to output.

Added QUnit tests to verify that unicode letter & escapes work in identifiers in all supported browsers (ok back to IE 6, yay)

includes/libs/jsminplus.php
tests/phpunit/includes/libs/JavaScriptMinifierTest.php
tests/qunit/index.html
tests/qunit/suites/resources/mediawiki/mediawiki.jscompat.test.js [new file with mode: 0644]

index 5a3c5bd..8ed08d7 100644 (file)
@@ -2029,13 +2029,55 @@ class JSTokenizer
                                break;
 
                                default:
-                                       // FIXME: add support for unicode and unicode escape sequence \uHHHH
-                                       if (preg_match('/^[$\w]+/', $input, $match))
+                                       // Fast path for identifiers: word chars followed by whitespace or various other tokens.
+                                       // Note we don't need to exclude digits in the first char, as they've already been found
+                                       // above.
+                                       if (!preg_match('/^[$\w]+(?=[\s\/\|\^\&<>\+\-\*%=!.;,\?:~\[\]\{\}\(\)@])/', $input, $match))
                                        {
-                                               $tt = in_array($match[0], $this->keywords) ? $match[0] : TOKEN_IDENTIFIER;
+                                               // Character classes per ECMA-262 edition 5.1 section 7.6
+                                               // Per spec, must accept Unicode 3.0, *may* accept later versions.
+                                               // We'll take whatever PCRE understands, which should be more recent.
+                                               $identifierStartChars = "\\p{L}\\p{Nl}" .  # UnicodeLetter
+                                                                       "\$" .
+                                                                       "_";
+                                               $identifierPartChars  = $identifierStartChars .
+                                                                       "\\p{Mn}\\p{Mc}" . # UnicodeCombiningMark
+                                                                       "\\p{Nd}" .        # UnicodeDigit
+                                                                       "\\p{Pc}";         # UnicodeConnectorPunctuation
+                                               $unicodeEscape = "\\\\u[0-9A-F-a-f]{4}";
+                                               $identifierRegex = "/^" .
+                                                                  "(?:[$identifierStartChars]|$unicodeEscape)" .
+                                                                  "(?:[$identifierPartChars]|$unicodeEscape)*" .
+                                                                  "/uS";
+                                               if (preg_match($identifierRegex, $input, $match))
+                                               {
+                                                       if (strpos($match[0], '\\') !== false) {
+                                                               // Per ECMA-262 edition 5.1, section 7.6 escape sequences should behave as if they were
+                                                               // the original chars, but only within the boundaries of the identifier.
+                                                               $decoded = preg_replace_callback('/\\\\u([0-9A-Fa-f]{4})/',
+                                                                               array(__CLASS__, 'unicodeEscapeCallback'),
+                                                                               $match[0]);
+
+                                                               // Since our original regex didn't de-escape the originals, we need to check for validity again.
+                                                               // No need to worry about token boundaries, as anything outside the identifier is illegal!
+                                                               if (!preg_match("/^[$identifierStartChars][$identifierPartChars]*$/u", $decoded)) {
+                                                                       throw $this->newSyntaxError('Illegal token');
+                                                               }
+
+                                                               // Per spec it _ought_ to work to use these escapes for keywords words as well...
+                                                               // but IE rejects them as invalid, while Firefox and Chrome treat them as identifiers
+                                                               // that don't match the keyword.
+                                                               if (in_array($decoded, $this->keywords)) {
+                                                                       throw $this->newSyntaxError('Illegal token');
+                                                               }
+
+                                                               // TODO: save the decoded form for output?
+                                                       }
+                                               }
+                                               else
+                                                       throw $this->newSyntaxError('Illegal token');
                                        }
-                                       else
-                                               throw $this->newSyntaxError('Illegal token');
+                                       $tt = in_array($match[0], $this->keywords) ? $match[0] : TOKEN_IDENTIFIER;
                        }
                }
 
@@ -2073,6 +2115,11 @@ class JSTokenizer
        {
                return new Exception('Parse error: ' . $m . ' in file \'' . $this->filename . '\' on line ' . $this->lineno);
        }
+
+       public static function unicodeEscapeCallback($m)
+       {
+               return html_entity_decode('&#x' . $m[1]. ';', ENT_QUOTES, 'UTF-8');
+       }
 }
 
 class JSToken
index a0847b2..aa05500 100644 (file)
@@ -78,6 +78,12 @@ class JavaScriptMinifierTest extends MediaWikiTestCase {
                        
                        // newline insertion after 1000 chars: break after the "++", not before
                        array( str_repeat( ';', 996 ) . "if(x++);", str_repeat( ';', 996 ) . "if(x++\n);" ),
+
+                       // Unicode letter characters should pass through ok in identifiers (bug 31187)
+                       array( "var KaŝSkatolVal = {}", 'var KaŝSkatolVal={}'),
+                       // And also per spec unicode char escape values should work in identifiers,
+                       // as long as it's a valid char. In future it might get normalized.
+                       array( "var Ka\\u015dSkatolVal = {}", 'var Ka\\u015dSkatolVal={}'),
                );
        }
 
index 25806e0..9187186 100644 (file)
@@ -79,6 +79,7 @@
        <script src="data/testrunner.js"></script>
 
        <!-- QUnit: Load test suites (maintain the same order as above please) -->
+       <script src="suites/resources/mediawiki/mediawiki.jscompat.test.js"></script>
        <script src="suites/resources/mediawiki/mediawiki.test.js"></script>
        <script src="suites/resources/mediawiki/mediawiki.user.test.js"></script>
 
diff --git a/tests/qunit/suites/resources/mediawiki/mediawiki.jscompat.test.js b/tests/qunit/suites/resources/mediawiki/mediawiki.jscompat.test.js
new file mode 100644 (file)
index 0000000..52cd32c
--- /dev/null
@@ -0,0 +1,35 @@
+/* Some misc JavaScript compatibility tests, just to make sure the environments we run in are consistent */
+
+module( 'mediawiki.jscompat' );
+
+test( 'Variable with Unicode letter in name', function() {
+       expect(3);
+       var orig = "some token";
+       var ŝablono = orig;
+       deepEqual( ŝablono, orig, 'ŝablono' );
+       deepEqual( \u015dablono, orig, '\\u015dablono' );
+       deepEqual( \u015Dablono, orig, '\\u015Dablono' );
+});
+
+/*
+// Not that we need this. ;)
+// This fails on IE 6-8
+// Works on IE 9, Firefox 6, Chrome 14
+test( 'Keyword workaround: "if" as variable name using Unicode escapes', function() {
+       var orig = "another token";
+       \u0069\u0066 = orig;
+       deepEqual( \u0069\u0066, orig, '\\u0069\\u0066' );
+});
+*/
+
+/*
+// Not that we need this. ;)
+// This fails on IE 6-9
+// Works on Firefox 6, Chrome 14
+test( 'Keyword workaround: "if" as member variable name using Unicode escapes', function() {
+       var orig = "another token";
+       var foo = {};
+       foo.\u0069\u0066 = orig;
+       deepEqual( foo.\u0069\u0066, orig, 'foo.\\u0069\\u0066' );
+});
+*/