* (bug 2067) Fixed crash on empty quoted HTML attribute
[lhc/web/wiklou.git] / includes / Sanitizer.php
index 757b46c..faac940 100644 (file)
@@ -22,6 +22,7 @@
  * http://www.gnu.org/copyleft/gpl.html
  *
  * @package MediaWiki
+ * @subpackage Parser
  */
 
 class Sanitizer {
@@ -29,6 +30,8 @@ class Sanitizer {
         * Cleans up HTML, removes dangerous tags and attributes, and
         * removes HTML comments
         * @access private
+        * @param string $text
+        * @return string
         */
        function removeHTMLtags( $text ) {
                global $wgUseTidy, $wgUserHtml;
@@ -149,6 +152,8 @@ class Sanitizer {
         * trailing spaces and one of the newlines.
         * 
         * @access private
+        * @param string $text
+        * @return string
         */
        function removeHTMLcomments( $text ) {
                $fname='Parser::removeHTMLcomments';
@@ -211,7 +216,10 @@ class Sanitizer {
                        return '';
                }
                
-               $attrib = '[A-Za-z0-9]'; #FIXME
+               # Unquoted attribute
+               # Since we quote this later, this can be anything distinguishable 
+               # from the end of the attribute
+               $attrib = '[A-Za-z0-9]'; 
                $space = '[\x09\x0a\x0d\x20]';
                if( !preg_match_all(
                        "/(?:^|$space)($attrib+)
@@ -220,7 +228,7 @@ class Sanitizer {
                             # The attribute value: quoted or alone
                              \"([^<\"]*)\"
                             | '([^<']*)'
-                            |  ([a-zA-Z0-9._:-]+)
+                            |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                             |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                                 # colors are specified like this.
                                                 # We'll be normalizing it.
@@ -238,25 +246,24 @@ class Sanitizer {
                        $attribute = strtolower( $set[1] );
                        if( !isset( $whitelist[$attribute] ) ) {
                                continue;
-                       }
-                       if( $set[2] == '' ) {
-                               # In XHTML, attributes must have a value.
-                               $value = $set[1];
-                       } elseif( $set[3] != '' ) {
-                               # Double-quoted
-                               $value = Sanitizer::normalizeAttributeValue( $set[3] );
-                       } elseif( $set[4] != '' ) {
+                       } elseif( isset( $set[6] ) ) {
+                               # Illegal #XXXXXX color with no quotes.
+                               $value = Sanitizer::normalizeAttributeValue( $set[6] );
+                       } elseif( isset( $set[5] ) ) {
+                               # No quotes.
+                               $value = Sanitizer::normalizeAttributeValue( $set[5] );
+                       } elseif( isset( $set[4] ) ) {
                                # Single-quoted
                                $value = str_replace( '"', '&quot;',
                                        Sanitizer::normalizeAttributeValue( $set[4] ) );
-                       } elseif( $set[5] != '' ) {
-                               # No quotes.
-                               $value = Sanitizer::normalizeAttributeValue( $set[5] );
-                       } elseif( $set[6] != '' ) {
-                               # Illegal #XXXXXX color with no quotes.
-                               $value = Sanitizer::normalizeAttributeValue( $set[6] );
+                       } elseif( isset( $set[3] ) ) {
+                               # Double-quoted
+                               $value = Sanitizer::normalizeAttributeValue( $set[3] );
+                       } elseif( !isset( $set[2] ) ) {
+                               # In XHTML, attributes must have a value.
+                               $value = $set[1];
                        } else {
-                               wfDebugDieBacktrace( "Tag conditions not met. Something's very odd." );
+                               wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
                        }
                        
                        # Strip javascript "expression" from stylesheets.
@@ -322,7 +329,10 @@ class Sanitizer {
                        array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
                        $text );
        }
-       
+       /**
+        * @param array $matches
+        * @return string
+        */
        function normalizeCharReferencesCallback( $matches ) {
                $ret = null;
                if( $matches[1] != '' ) {
@@ -346,6 +356,7 @@ class Sanitizer {
         * return the named entity reference as is. Otherwise, returns
         * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
         *
+        * @param string $name
         * @return string
         */
        function normalizeEntity( $name ) {
@@ -784,14 +795,43 @@ class Sanitizer {
                        # 15.3
                        'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
                        
+                       # XHTML Ruby annotation text module, simple ruby only.
+                       # http://www.w3c.org/TR/ruby/
+                       'ruby'       => $common,
+                       # rbc
+                       # rtc
                        'rb'         => $common,
+                       'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
                        'rp'         => $common,
-                       'rt'         => $common,
-                       'ruby'       => $common,
                        );
                return $whitelist;
        }
+       
+       /**
+        * Reduce a fragment of (possibly malformed) HTML to tag-free
+        * text, escaped so that it can be dropped verbatim into an
+        * attribute value.
+        *
+        * @param string $text HTML fragment
+        * @return string
+        */
+       function stripAllTags( $text ) {
+               # Drop anything that looks like a <tag>
+               $untagged = preg_replace( '/<[^>]*>/', '', $text );
+               
+               # Canonicalize &entities and whitespace
+               $normalized = Sanitizer::normalizeAttributeValue( $untagged );
+               
+               # The result ends up inside a "double-quoted" attribute,
+               # so escape whatever markup characters are left over.
+               $unsafe = array('<', '>', '"');
+               $escaped = array('&lt;', '&gt;', '&quot;');
+               $result = str_replace( $unsafe, $escaped, $normalized );
+               
+               # Tag-free, attribute-safe text
+               return $result;
+       }
 
 }
 
-?>
\ No newline at end of file
+?>