Quoted attributes don't need to be followed by a space
author Arlo Breault <abreault@wikimedia.org>
Fri, 2 Nov 2018 23:20:52 +0000 (19:20 -0400)
committer Arlo Breault <abreault@wikimedia.org>
Fri, 9 Nov 2018 21:00:18 +0000 (16:00 -0500)
Further, this splits up attribute parsing from filtering.

Change-Id: Ib4e0a808a6ca2ba032873e885837233e2f2feefe

includes/parser/Sanitizer.php
tests/parser/parserTests.txt

index 85c71ee..1c8a797 100644 (file)
@@ -349,18 +349,18 @@ class Sanitizer {
 
        /**
         * Regular expression to match HTML/XML attribute pairs within a tag.
-        * Allows some... latitude. Based on,
-        * https://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
-        * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
+        * Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
+        * Used in Sanitizer::decodeTagAttributes
         * @return string
         */
        static function getAttribsRegex() {
                if ( self::$attribsRegex === null ) {
-                       $attribFirst = "[:_\p{L}\p{N}]";
-                       $attrib = "[:_\.\-\p{L}\p{N}]";
-                       $space = '[\x09\x0a\x0c\x0d\x20]';
+                       $spaceChars = '\x09\x0a\x0c\x0d\x20';
+                       $space = "[{$spaceChars}]";
+                       $attrib = "[^{$spaceChars}\/>=]";
+                       $attribFirst = "(?:{$attrib}|=)";
                        self::$attribsRegex =
-                               "/(?:^|$space)({$attribFirst}{$attrib}*)
+                               "/({$attribFirst}{$attrib}*)
                                        ($space*=$space*
                                        (?:
                                                # The attribute value: quoted or alone
@@ -368,11 +368,29 @@ class Sanitizer {
                                                | '([^']*)(?:'|\$)
                                                | (((?!$space|>).)*)
                                        )
-                               )?(?=$space|\$)/sxu";
+                               )?/sxu";
                }
                return self::$attribsRegex;
        }
 
+       /**
+        * Lazy-initialised attribute name regex, see getAttribNameRegex()
+        */
+       private static $attribNameRegex;
+
+       /**
+        * Used in Sanitizer::decodeTagAttributes to filter attributes.
+        * @return string
+        */
+       static function getAttribNameRegex() {
+               if ( self::$attribNameRegex === null ) {
+                       $attribFirst = "[:_\p{L}\p{N}]";
+                       $attrib = "[:_\.\-\p{L}\p{N}]";
+                       self::$attribNameRegex = "/^({$attribFirst}{$attrib}*)$/sxu";
+               }
+               return self::$attribNameRegex;
+       }
+
        /**
         * Return the various lists of recognized tags
         * @param array $extratags For any extra tags to include
@@ -1433,18 +1451,24 @@ class Sanitizer {
                        return [];
                }
 
-               $attribs = [];
                $pairs = [];
                if ( !preg_match_all(
                        self::getAttribsRegex(),
                        $text,
                        $pairs,
                        PREG_SET_ORDER ) ) {
-                       return $attribs;
+                       return [];
                }
 
+               $attribs = [];
                foreach ( $pairs as $set ) {
                        $attribute = strtolower( $set[1] );
+
+                       // Filter attribute names with unacceptable characters
+                       if ( !preg_match( self::getAttribNameRegex(), $attribute ) ) {
+                               continue;
+                       }
+
                        $value = self::getTagAttributeCallback( $set );
 
                        // Normalize whitespace
index 6d5054e..62983f5 100644 (file)
@@ -6273,8 +6273,6 @@ parsoid=wt2html
 
 !! end
 
-# Note that the PHP parser output appears to be broken when the table
-# end tag is not separated by a space from the style attribute
 !! test
 A table with stray table end tags on start tag line (wt2html)
 !! options
@@ -6294,13 +6292,13 @@ parsoid=wt2html
 |foo
 |}
 !! html/php+tidy
-<table style="&quot;color:">
+<table style="color: red;">
 
 </table><table style="color: red;">
 <tbody><tr>
 <td>foo
 </td></tr></tbody></table>
-<table style="&quot;color:" id="foo">
+<table style="color: red;" id="foo">
 <tbody><tr>
 <td>foo
 </td></tr></tbody></table>
@@ -9648,17 +9646,14 @@ Handling html with a div self-closing tag
 <div title=bar />
 <div title=bar/>
 <div title=bar/ >
-!! html/php
-<p>&lt;div title /&gt;
-&lt;div title/&gt;
-</p>
-<div>
-<p>&lt;div title=bar /&gt;
-&lt;div title=bar/&gt;
-</p>
-<div title="bar/"></div>
-</div>
-
+!! html/php+tidy
+<div title=""></div>
+<div title=""></div>
+<div title="">
+<div title="bar"></div>
+<div title="bar"></div>
+<div title="bar/">
+</div></div>
 !! html/parsoid
 <div title="" data-parsoid='{"stx":"html","selfClose":true}'></div>
 <div title="" data-parsoid='{"stx":"html","selfClose":true}'></div>
@@ -9699,10 +9694,10 @@ Handling html with a br self-closing tag
 <br title=bar />
 <br title=bar/>
 <br title=bar/ >
-!! html/php
+!! html/php+tidy
 <p><br title="" />
 <br title="" />
-<br />
+<br title="" />
 <br title="bar" />
 <br title="bar" />
 <br title="bar/" />
@@ -9717,6 +9712,18 @@ Handling html with a br self-closing tag
 </p>
 !! end
 
+!! test
+Quoted attributes without spaces
+!! options
+parsoid=wt2html
+!! wikitext
+<div class="foo"style="color:red">red</div>
+!! html/php+tidy
+<div class="foo" style="color:red">red</div>
+!! html/parsoid
+<div class="foo" style="color:red">red</div>
+!! end
+
 !! test
 Horizontal ruler (should it add that extra space?)
 !! wikitext
@@ -18177,8 +18184,7 @@ HTML tag with leading space is parsed as text
 </p>
 !! end
 
-## Don't expect Parsoid and PHP to match, since PHP isn't exactly following
-## the HTML5 parsing spec.
+## FIXME: The untrimmed attribute in Parsoid is T205737
 !! test
 Element with broken attribute syntax
 !! options
@@ -18187,7 +18193,7 @@ parsoid=wt2html
 <div style=" style="123">hi</div>
 <div =>ho</div>
 !! html/php
-<div style="123">hi</div>
+<div style="style=">hi</div>
 <div>ho</div>
 
 !! html/parsoid