Merge "Quoted attributes don't need to be followed by a space"
authorjenkins-bot <jenkins-bot@gerrit.wikimedia.org>
Tue, 27 Nov 2018 16:21:41 +0000 (16:21 +0000)
committerGerrit Code Review <gerrit@wikimedia.org>
Tue, 27 Nov 2018 16:21:41 +0000 (16:21 +0000)
includes/parser/Sanitizer.php
tests/parser/parserTests.txt

index 84f8083..f8c3bc2 100644 (file)
@@ -349,18 +349,18 @@ class Sanitizer {
 
        /**
         * Regular expression to match HTML/XML attribute pairs within a tag.
-        * Allows some... latitude. Based on,
-        * https://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
-        * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
+        * Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
+        * Used in Sanitizer::decodeTagAttributes
         * @return string
         */
        static function getAttribsRegex() {
                if ( self::$attribsRegex === null ) {
-                       $attribFirst = "[:_\p{L}\p{N}]";
-                       $attrib = "[:_\.\-\p{L}\p{N}]";
-                       $space = '[\x09\x0a\x0c\x0d\x20]';
+                       $spaceChars = '\x09\x0a\x0c\x0d\x20';
+                       $space = "[{$spaceChars}]";
+                       $attrib = "[^{$spaceChars}\/>=]";
+                       $attribFirst = "(?:{$attrib}|=)";
                        self::$attribsRegex =
-                               "/(?:^|$space)({$attribFirst}{$attrib}*)
+                               "/({$attribFirst}{$attrib}*)
                                        ($space*=$space*
                                        (?:
                                                # The attribute value: quoted or alone
@@ -368,11 +368,29 @@ class Sanitizer {
                                                | '([^']*)(?:'|\$)
                                                | (((?!$space|>).)*)
                                        )
-                               )?(?=$space|\$)/sxu";
+                               )?/sxu";
                }
                return self::$attribsRegex;
        }
 
+       /**
+        * Lazy-initialised attribute name regex, see getAttribNameRegex()
+        */
+       private static $attribNameRegex;
+
+       /**
+        * Used in Sanitizer::decodeTagAttributes to filter attributes.
+        * @return string
+        */
+       static function getAttribNameRegex() {
+               if ( self::$attribNameRegex === null ) {
+                       $attribFirst = "[:_\p{L}\p{N}]";
+                       $attrib = "[:_\.\-\p{L}\p{N}]";
+                       self::$attribNameRegex = "/^({$attribFirst}{$attrib}*)$/sxu";
+               }
+               return self::$attribNameRegex;
+       }
+
        /**
         * Return the various lists of recognized tags
         * @param array $extratags For any extra tags to include
@@ -1434,18 +1452,24 @@ class Sanitizer {
                        return [];
                }
 
-               $attribs = [];
                $pairs = [];
                if ( !preg_match_all(
                        self::getAttribsRegex(),
                        $text,
                        $pairs,
                        PREG_SET_ORDER ) ) {
-                       return $attribs;
+                       return [];
                }
 
+               $attribs = [];
                foreach ( $pairs as $set ) {
                        $attribute = strtolower( $set[1] );
+
+                       // Filter attribute names with unacceptable characters
+                       if ( !preg_match( self::getAttribNameRegex(), $attribute ) ) {
+                               continue;
+                       }
+
                        $value = self::getTagAttributeCallback( $set );
 
                        // Normalize whitespace
index c4789e2..fdf1d5b 100644 (file)
@@ -6273,8 +6273,6 @@ parsoid=wt2html
 
 !! end
 
-# Note that the PHP parser output appears to be broken when the table
-# end tag is not separated by a space from the style attribute
 !! test
 A table with stray table end tags on start tag line (wt2html)
 !! options
@@ -6294,13 +6292,13 @@ parsoid=wt2html
 |foo
 |}
 !! html/php+tidy
-<table style="&quot;color:">
+<table style="color: red;">
 
 </table><table style="color: red;">
 <tbody><tr>
 <td>foo
 </td></tr></tbody></table>
-<table style="&quot;color:" id="foo">
+<table style="color: red;" id="foo">
 <tbody><tr>
 <td>foo
 </td></tr></tbody></table>
@@ -9652,17 +9650,14 @@ Handling html with a div self-closing tag
 <div title=bar />
 <div title=bar/>
 <div title=bar/ >
-!! html/php
-<p>&lt;div title /&gt;
-&lt;div title/&gt;
-</p>
-<div>
-<p>&lt;div title=bar /&gt;
-&lt;div title=bar/&gt;
-</p>
-<div title="bar/"></div>
-</div>
-
+!! html/php+tidy
+<div title=""></div>
+<div title=""></div>
+<div title="">
+<div title="bar"></div>
+<div title="bar"></div>
+<div title="bar/">
+</div></div>
 !! html/parsoid
 <div title="" data-parsoid='{"stx":"html","selfClose":true}'></div>
 <div title="" data-parsoid='{"stx":"html","selfClose":true}'></div>
@@ -9703,10 +9698,10 @@ Handling html with a br self-closing tag
 <br title=bar />
 <br title=bar/>
 <br title=bar/ >
-!! html/php
+!! html/php+tidy
 <p><br title="" />
 <br title="" />
-<br />
+<br title="" />
 <br title="bar" />
 <br title="bar" />
 <br title="bar/" />
@@ -9721,6 +9716,18 @@ Handling html with a br self-closing tag
 </p>
 !! end
 
+!! test
+Quoted attributes without spaces
+!! options
+parsoid=wt2html
+!! wikitext
+<div class="foo"style="color:red">red</div>
+!! html/php+tidy
+<div class="foo" style="color:red">red</div>
+!! html/parsoid
+<div class="foo" style="color:red">red</div>
+!! end
+
 !! test
 Horizontal ruler (should it add that extra space?)
 !! wikitext
@@ -18213,8 +18220,7 @@ HTML tag with leading space is parsed as text
 </p>
 !! end
 
-## Don't expect Parsoid and PHP to match, since PHP isn't exactly following
-## the HTML5 parsing spec.
+## FIXME: The untrimmed attribute in Parsoid is T205737
 !! test
 Element with broken attribute syntax
 !! options
@@ -18223,7 +18229,7 @@ parsoid=wt2html
 <div style=" style="123">hi</div>
 <div =>ho</div>
 !! html/php
-<div style="123">hi</div>
+<div style="style=">hi</div>
 <div>ho</div>
 
 !! html/parsoid