Match html5 unquoted attribute parsing

author Arlo Breault <abreault@wikimedia.org>

Wed, 5 Aug 2015 18:40:02 +0000 (11:40 -0700)

committer Tim Starling <tstarling@wikimedia.org>

Mon, 22 Feb 2016 00:50:06 +0000 (00:50 +0000)
author Arlo Breault <abreault@wikimedia.org>
Wed, 5 Aug 2015 18:40:02 +0000 (11:40 -0700)
committer Tim Starling <tstarling@wikimedia.org>
Mon, 22 Feb 2016 00:50:06 +0000 (00:50 +0000)
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php

index b84adc3..d52bc07 100644 (file)
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -332,7 +332,8 @@ class Sanitizer {
  
         /**
          * Regular expression to match HTML/XML attribute pairs within a tag.
-        * Allows some... latitude.
+        * Allows some... latitude. Based on,
+        * http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
          * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
          * @return string
          */
@@ -340,15 +341,15 @@ class Sanitizer {
                 if ( self::$attribsRegex === null ) {
                         $attribFirst = '[:A-Z_a-z0-9]';
                         $attrib = '[:A-Z_a-z-.0-9]';
-                       $space = '[\x09\x0a\x0d\x20]';
+                       $space = '[\x09\x0a\x0c\x0d\x20]';
                         self::$attribsRegex =
                                 "/(?:^|$space)({$attribFirst}{$attrib}*)
                                   ($space*=$space*
                                         (?:
                                          # The attribute value: quoted or alone
-                                         \"([^<\"]*)(?:\"|\$)
-                                        | '([^<']*)(?:'|\$)
-                                        |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
+                                         \"([^\"]*)(?:\"|\$)
+                                        | '([^']*)(?:'|\$)
+                                        |  (((?!$space|>).)*)
                                         )
                                 )?(?=$space|\$)/sx";
                 }
diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt

index d4e7119..9ad7bd5 100644 (file)
--- a/tests/parser/parserTests.txt
+++ b/tests/parser/parserTests.txt
@@ -6552,14 +6552,18 @@ Table with empty line following the start tag
  
  !! end
  
-# FIXME: Preserve the attribute properly (with an empty string as value) in
-# the PHP parser. Parsoid implements the behavior below.
  !! test
  Table attributes with empty value
  !! wikitext
  {|
  | style=| hello
  |}
+!! html/php
+<table>
+<tr>
+<td style=""> hello
+</td></tr></table>
+
  !! html/parsoid
  <table>
  <tbody>
@@ -15451,12 +15455,8 @@ div with illegal double attributes
  
  !!end
  
-# FIXME: produce empty string instead of "class" in the PHP parser, following
-# the HTML5 spec.
  !! test
  div with empty attribute value, space before equals
-!! options
-parsoid
  !! wikitext
  <div class =>HTML rocks</div>
  !! html
@@ -15464,26 +15464,36 @@ parsoid
  
  !! end
  
+# FIXME: Parsoid doesn't match the html5 spec
  !! test
  div with multiple empty attribute values
  !! options
-parsoid
+parsoid=wt2html,html2html
  !! wikitext
  <div id= title=>HTML rocks</div>
-!! html
-<div id="" title="">HTML rocks</div>
+!! html/php
+<div id="title.3D">HTML rocks</div>
  
+!! html/parsoid
+<div id="" title="">HTML rocks</div>
  !! end
  
+# FIXME: Parsoid doesn't match the html5 spec
  !! test
  table with multiple empty attribute values
  !! options
-parsoid
+parsoid=wt2html,html2html
  !! wikitext
  {| title= id=
  | hi
  |}
-!! html
+!! html/php
+<table title="id=">
+<tr>
+<td> hi
+</td></tr></table>
+
+!! html/parsoid
  <table title="" id="">
  <tbody><tr><td> hi</td></tr>
  </tbody></table>
@@ -15500,13 +15510,6 @@ div with braces in attribute value
  <div title="{}">Foo</div>
  !! end
  
-# This it very inconsistent in the PHP parser: it returns 
-# class="class" if there is a space between the name and the equal sign (see
-# 'div with empty attribute value, space before equals'), but strips the
-# attribute completely if the space is missing. We hope that not much content
-# depends on this, so are implementing the behavior below in Parsoid for
-# consistencies' sake.
-# FIXME: fix this behavior in the PHP parser?
  !! test
  div with empty attribute value, no space before equals
  !! options
@@ -15514,7 +15517,7 @@ parsoid=wt2html,html2html
  !! wikitext
  <div class=>HTML rocks</div>
  !! html/php
-<div>HTML rocks</div>
+<div class="">HTML rocks</div>
  
  !! html/parsoid
  <div class="">HTML rocks</div>
@@ -15836,7 +15839,7 @@ Attribute test: equals, then nothing
  !! wikitext
  <font color=>foo</font>
  !! html
-<p><font>foo</font>
+<p><font color="">foo</font>
  </p>
  !! end
  
@@ -23909,14 +23912,15 @@ HTML tag with 'unnecessary' entity encoding in attributes
  
  !! test
  HTML tag with broken attribute value quoting
+!! options
+parsoid=wt2html,html2html
  !! wikitext
  <span title="Hello world>Foo</span>
  !! html/php
  <p><span title="Hello world">Foo</span>
  </p>
  !! html/parsoid
-<p><span title="Hello world">Foo</span>
-</p>
+<p><span title="Hello world">Foo</span></p>
  !! end
  
  !! test
@@ -23934,6 +23938,8 @@ parsoid=wt2html,html2html
  
  !! test
  Table with broken attribute value quoting
+!! options
+parsoid=wt2html,html2html
  !! wikitext
  {|
  | title="Hello world|Foo
@@ -23954,6 +23960,8 @@ Table with broken attribute value quoting
  
  !! test
  Table with broken attribute value quoting on consecutive lines
+!! options
+parsoid=wt2html,html2html
  !! wikitext
  {|
  | title="Hello world|Foo
author	Arlo Breault <abreault@wikimedia.org>
	Wed, 5 Aug 2015 18:40:02 +0000 (11:40 -0700)
committer	Tim Starling <tstarling@wikimedia.org>
	Mon, 22 Feb 2016 00:50:06 +0000 (00:50 +0000)
includes/Sanitizer.php		patch | blob | history
tests/parser/parserTests.txt		patch | blob | history