Match HTML5 unquoted attribute value parsing
[lhc/web/wiklou.git] / includes / Sanitizer.php
index b84adc3..d52bc07 100644 (file)
@@ -332,7 +332,8 @@ class Sanitizer {
 
        /**
         * Regular expression to match HTML/XML attribute pairs within a tag.
-        * Allows some... latitude.
+        * Allows some... latitude. Based on the HTML5 tokenizer rules at
+        * http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
         * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
         * @return string
         */
@@ -340,15 +341,15 @@ class Sanitizer {
                if ( self::$attribsRegex === null ) {
                        $attribFirst = '[:A-Z_a-z0-9]';
                        $attrib = '[:A-Z_a-z-.0-9]';
-                       $space = '[\x09\x0a\x0d\x20]';
+                       $space = '[\x09\x0a\x0c\x0d\x20]';
                        self::$attribsRegex =
                                "/(?:^|$space)({$attribFirst}{$attrib}*)
                                  ($space*=$space*
                                        (?:
                                         # The attribute value: quoted or alone
-                                         \"([^<\"]*)(?:\"|\$)
-                                        | '([^<']*)(?:'|\$)
-                                        |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
+                                         \"([^\"]*)(?:\"|\$)
+                                        | '([^']*)(?:'|\$)
+                                        |  (((?!$space|>).)*)
                                        )
                                )?(?=$space|\$)/sx";
                }