Commenting and doc fixes around the spam regexes
[lhc/web/wiklou.git] / includes / Sanitizer.php
index b4a1c62..d09e8d9 100644 (file)
@@ -56,7 +56,7 @@ class Sanitizer {
         * As well as &apos; which is only defined starting in XHTML1.
         * @private
         */
-       static $htmlEntities = array(
+       private static $htmlEntities = array(
                'Aacute'   => 193,
                'aacute'   => 225,
                'Acirc'    => 194,
@@ -315,15 +315,16 @@ class Sanitizer {
        /**
         * Character entity aliases accepted by MediaWiki
         */
-       static $htmlEntityAliases = array(
+       private static $htmlEntityAliases = array(
                'רלמ' => 'rlm',
                'رلم' => 'rlm',
        );
 
        /**
         * Lazy-initialised attributes regex, see getAttribsRegex()
+        * @private
         */
-       static $attribsRegex;
+       private static $attribsRegex;
 
        /**
         * Regular expression to match HTML/XML attribute pairs within a tag.
@@ -357,13 +358,16 @@ class Sanitizer {
         * removes HTML comments
         * @private
         * @param $text String
-        * @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values
+        * @param $processCallback Callback to do any variable or parameter
+        *        replacements in HTML attribute values
         * @param array $args for the processing callback
         * @param array $extratags for any extra tags to include
         * @param array $removetags for any tags (default or extra) to exclude
         * @return string
         */
-       static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
+       static function removeHTMLtags( $text, $processCallback = null,
+               $args = array(), $extratags = array(), $removetags = array()
+       ) {
                global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;
 
                static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
@@ -385,10 +389,10 @@ class Sanitizer {
                                'kbd', 'samp', 'data', 'time', 'mark'
                        );
                        $htmlsingle = array(
-                               'br', 'hr', 'li', 'dt', 'dd'
+                               'br', 'wbr', 'hr', 'li', 'dt', 'dd'
                        );
                        $htmlsingleonly = array( # Elements that cannot have close tags
-                               'br', 'hr'
+                               'br', 'wbr', 'hr'
                        );
                        if ( $wgAllowMicrodataAttributes ) {
                                $htmlsingle[] = $htmlsingleonly[] = 'meta';
@@ -397,7 +401,7 @@ class Sanitizer {
                        $htmlnest = array( # Tags that can be nested--??
                                'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
                                'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
-                               'var', 'kbd', 'samp'
+                               'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
                        );
                        $tabletags = array( # Can only appear inside table, we will close them
                                'td', 'th', 'tr',
@@ -444,7 +448,7 @@ class Sanitizer {
                                # $params: String between element name and >
                                # $brace: Ending '>' or '/>'
                                # $rest: Everything until the next element of $bits
-                               if ( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
+                               if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
                                        list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
                                } else {
                                        $slash = $t = $params = $brace = $rest = null;
@@ -624,7 +628,8 @@ class Sanitizer {
                        while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
                                $spaceLen++;
                        }
-                       if ( substr( $text, $spaceStart, 1 ) === "\n" and substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
+                       if ( substr( $text, $spaceStart, 1 ) === "\n"
+                               && substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
                                # Remove the comment, leading and trailing
                                # spaces, and leave only one newline.
                                $text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
@@ -740,7 +745,7 @@ class Sanitizer {
 
                        # WAI-ARIA
                        # http://www.w3.org/TR/wai-aria/
-                       # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#wai-aria
+                       # http://www.whatwg.org/html/elements.html#wai-aria
                        # For now we only support role="presentation" until we work out what roles should be
                        # usable by content and we ensure that our code explicitly rejects patterns that
                        # violate HTML5's ARIA restrictions.
@@ -748,13 +753,18 @@ class Sanitizer {
                                continue;
                        }
 
-                       //RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity
-                       if ( $attribute === 'rel' || $attribute === 'rev' ||
-                               $attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa
-                               $attribute === 'datatype' || $attribute === 'typeof' ||                             #RDFa
-                               $attribute === 'itemid' || $attribute === 'itemprop' || $attribute === 'itemref' || #HTML5 microdata
-                               $attribute === 'itemscope' || $attribute === 'itemtype' ) {                         #HTML5 microdata
-
+                       // RDFa and microdata properties allow URLs, URIs and/or CURIs.
+                       // Check them for sanity.
+                       if ( $attribute === 'rel' || $attribute === 'rev'
+                               # RDFa
+                               || $attribute === 'about' || $attribute === 'property'
+                               || $attribute === 'resource' || $attribute === 'datatype'
+                               || $attribute === 'typeof'
+                               # HTML5 microdata
+                               || $attribute === 'itemid' || $attribute === 'itemprop'
+                               || $attribute === 'itemref' || $attribute === 'itemscope'
+                               || $attribute === 'itemtype'
+                       ) {
                                //Paranoia. Allow "simple" values but suppress javascript
                                if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
                                        continue;
@@ -766,7 +776,7 @@ class Sanitizer {
                        if ( $attribute === 'href' || $attribute === 'src' ) {
                                if ( !preg_match( $hrefExp, $value ) ) {
                                        continue; //drop any href or src attributes not using an allowed protocol.
-                                                 //NOTE: this also drops all relative URLs
+                                       // NOTE: this also drops all relative URLs
                                }
                        }
 
@@ -1011,7 +1021,7 @@ class Sanitizer {
         *                                                          in the id and
         *                                                          name attributes
         * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
-        * @see http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#the-id-attribute
+        * @see http://www.whatwg.org/html/elements.html#the-id-attribute
         *   HTML5 definition of id attribute
         *
         * @param string $id id to escape
@@ -1460,14 +1470,16 @@ class Sanitizer {
                );
 
                if ( $wgAllowRdfaAttributes ) {
-                       #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
+                       # RDFa attributes as specified in section 9 of
+                       # http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
                        $common = array_merge( $common, array(
                                'about', 'property', 'resource', 'datatype', 'typeof',
                        ) );
                }
 
                if ( $wgAllowMicrodataAttributes ) {
-                       # add HTML5 microdata tags as specified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
+                       # add HTML5 microdata tags as specified by
+                       # http://www.whatwg.org/html/microdata.html#the-microdata-model
                        $common = array_merge( $common, array(
                                'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
                        ) );
@@ -1536,6 +1548,9 @@ class Sanitizer {
                        # 9.3.2
                        'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
 
+                       # http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element
+                       'wbr'        => array( 'id', 'class', 'title', 'style' ),
+
                        # 9.3.4
                        'pre'        => array_merge( $common, array( 'width' ) ),
 
@@ -1579,7 +1594,9 @@ class Sanitizer {
                        'td'         => array_merge( $common, $tablecell, $tablealign ),
                        'th'         => array_merge( $common, $tablecell, $tablealign ),
 
-                       # 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object
+                       # 12.2
+                       # NOTE: <a> is not allowed directly, but the attrib
+                       # whitelist is used from the Parser object
                        'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa
 
                        # 13.2
@@ -1606,7 +1623,7 @@ class Sanitizer {
                        'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
 
                        # HTML Ruby annotation text module, simple ruby only.
-                       # http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#the-ruby-element
+                       # http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
                        'ruby'       => $common,
                        # rbc
                        # rtc
@@ -1623,7 +1640,7 @@ class Sanitizer {
                        'bdi' => $common,
 
                        # HTML5 elements, defined by:
-                       # http://www.whatwg.org/specs/web-apps/current-work/multipage/
+                       # http://www.whatwg.org/html/
                        'data' => array_merge( $common, array( 'value' ) ),
                        'time' => array_merge( $common, array( 'datetime' ) ),
                        'mark' => $common,
@@ -1740,7 +1757,7 @@ class Sanitizer {
         * Does a string look like an e-mail address?
         *
         * This validates an email address using an HTML5 specification found at:
-        * http://www.whatwg.org/specs/web-apps/current-work/multipage/states-of-the-type-attribute.html#valid-e-mail-address
+        * http://www.whatwg.org/html/states-of-the-type-attribute.html#valid-e-mail-address
         * Which as of 2011-01-24 says:
         *
         *   A valid e-mail address is a string that matches the ABNF production