From 45b6f3ca352fcea546e73b70b299590dccc51ae2 Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Sun, 6 Feb 2005 06:44:48 +0000 Subject: [PATCH] Split the HTML sanitizer functions from the Parser monolith --- includes/Parser.php | 228 ++------------------------------------ includes/Sanitizer.php | 245 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 252 insertions(+), 221 deletions(-) create mode 100644 includes/Sanitizer.php diff --git a/includes/Parser.php b/includes/Parser.php index dbc770f703..855429c05a 100644 --- a/includes/Parser.php +++ b/includes/Parser.php @@ -6,6 +6,8 @@ * @package MediaWiki */ +require_once( 'Sanitizer.php' ); + /** * Update this version number when the ParserOutput format * changes in an incompatible way, so the parser cache @@ -467,57 +469,6 @@ class Parser return $rnd; } - /** - * Return allowed HTML attributes - * - * @access private - */ - function getHTMLattrs () { - $htmlattrs = array( # Allowed attributes--no scripting, etc. - 'title', 'align', 'lang', 'dir', 'width', 'height', - 'bgcolor', 'clear', /* BR */ 'noshade', /* HR */ - 'cite', /* BLOCKQUOTE, Q */ 'size', 'face', 'color', - /* FONT */ 'type', 'start', 'value', 'compact', - /* For various lists, mostly deprecated but safe */ - 'summary', 'width', 'border', 'frame', 'rules', - 'cellspacing', 'cellpadding', 'valign', 'char', - 'charoff', 'colgroup', 'col', 'span', 'abbr', 'axis', - 'headers', 'scope', 'rowspan', 'colspan', /* Tables */ - 'id', 'class', 'name', 'style' /* For CSS */ - ); - return $htmlattrs ; - } - - /** - * Remove non approved attributes and javascript in css - * - * @access private - */ - function fixTagAttributes ( $t ) { - if ( trim ( $t ) == '' ) return '' ; # Saves runtime ;-) - $htmlattrs = $this->getHTMLattrs() ; - - # Strip non-approved attributes from the tag - $t = preg_replace( - '/(\\w+)(\\s*=\\s*([^\\s\">]+|\"[^\">]*\"))?/e', - "(in_array(strtolower(\"\$1\"),\$htmlattrs)?(\"\$1\".((\"x\$3\" != \"x\")?\"=\$3\":'')):'')", - $t); - - $t = str_replace ( '<>' , '' , $t ) ; # This should fix bug 980557 - - # Strip javascript "expression" from stylesheets. Brute force approach: - # If anythin offensive is found, all attributes of the HTML tag are dropped - - if( preg_match( - '/style\\s*=.*(expression|tps*:\/\/|url\\s*\().*/is', - wfMungeToUtf8( $t ) ) ) - { - $t=''; - } - - return trim ( $t ) ; - } - /** * interface with html tidy, used if $wgUseTidy = true * @@ -595,7 +546,7 @@ class Parser $indent_level = strlen( $matches[1] ); $t[$k] = "\n" . str_repeat( '
', $indent_level ) . - 'fixTagAttributes ( $matches[2] ) . '>' ; + '
' ; array_push ( $td , false ) ; array_push ( $ltd , '' ) ; array_push ( $tr , false ) ; @@ -622,7 +573,7 @@ class Parser array_push ( $tr , false ) ; array_push ( $td , false ) ; array_push ( $ltd , '' ) ; - array_push ( $ltr , $this->fixTagAttributes ( $x ) ) ; + array_push ( $ltr , Sanitizer::fixTagAttributes ( $x ) ) ; } else if ( '|' == $fc || '!' == $fc || '|+' == substr ( $x , 0 , 2 ) ) { # Caption # $x is a table row @@ -664,7 +615,7 @@ class Parser } if ( count ( $y ) == 1 ) $y = "{$z}<{$l}>{$y[0]}" ; - else $y = $y = "{$z}<{$l} ".$this->fixTagAttributes($y[0]).">{$y[1]}" ; + else $y = $y = "{$z}<{$l} ".Sanitizer::fixTagAttributes($y[0]).">{$y[1]}" ; $t[$k] .= $y ; array_push ( $td , true ) ; } @@ -697,7 +648,7 @@ class Parser $fname = 'Parser::internalParse'; wfProfileIn( $fname ); - $text = $this->removeHTMLtags( $text ); + $text = Sanitizer::removeHTMLtags( $text ); $text = $this->replaceVariables( $text, $args ); $text = preg_replace( '/(^|\n)-----*/', '\\1
', $text ); @@ -2118,7 +2069,7 @@ class Parser $this->mTemplatePath[$part1] = 1; $text = $this->strip( $text, $this->mStripState ); - $text = $this->removeHTMLtags( $text ); + $text = Sanitizer::removeHTMLtags( $text ); $text = $this->replaceVariables( $text, $assocArgs ); # Resume the link cache and register the inclusion as a link @@ -2210,171 +2161,6 @@ class Parser } } - - /** - * Cleans up HTML, removes dangerous tags and attributes, and - * removes HTML comments - * @access private - */ - function removeHTMLtags( $text ) { - global $wgUseTidy, $wgUserHtml; - $fname = 'Parser::removeHTMLtags'; - wfProfileIn( $fname ); - - if( $wgUserHtml ) { - $htmlpairs = array( # Tags that must be closed - 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', - 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', - 'strike', 'strong', 'tt', 'var', 'div', 'center', - 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', - 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span' - ); - $htmlsingle = array( - 'br', 'hr', 'li', 'dt', 'dd' - ); - $htmlnest = array( # Tags that can be nested--?? - 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', - 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span' - ); - $tabletags = array( # Can only appear inside table - 'td', 'th', 'tr' - ); - } else { - $htmlpairs = array(); - $htmlsingle = array(); - $htmlnest = array(); - $tabletags = array(); - } - - $htmlsingle = array_merge( $tabletags, $htmlsingle ); - $htmlelements = array_merge( $htmlsingle, $htmlpairs ); - - $htmlattrs = $this->getHTMLattrs () ; - - # Remove HTML comments - $text = $this->removeHTMLcomments( $text ); - - $bits = explode( '<', $text ); - $text = array_shift( $bits ); - if(!$wgUseTidy) { - $tagstack = array(); $tablestack = array(); - foreach ( $bits as $x ) { - $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) ); - preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/', - $x, $regs ); - list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs; - error_reporting( $prev ); - - $badtag = 0 ; - if ( in_array( $t = strtolower( $t ), $htmlelements ) ) { - # Check our stack - if ( $slash ) { - # Closing a tag... - if ( ! in_array( $t, $htmlsingle ) && - ( $ot = @array_pop( $tagstack ) ) != $t ) { - @array_push( $tagstack, $ot ); - $badtag = 1; - } else { - if ( $t == 'table' ) { - $tagstack = array_pop( $tablestack ); - } - $newparams = ''; - } - } else { - # Keep track for later - if ( in_array( $t, $tabletags ) && - ! in_array( 'table', $tagstack ) ) { - $badtag = 1; - } else if ( in_array( $t, $tagstack ) && - ! in_array ( $t , $htmlnest ) ) { - $badtag = 1 ; - } else if ( ! in_array( $t, $htmlsingle ) ) { - if ( $t == 'table' ) { - array_push( $tablestack, $tagstack ); - $tagstack = array(); - } - array_push( $tagstack, $t ); - } - # Strip non-approved attributes from the tag - $newparams = $this->fixTagAttributes($params); - - } - if ( ! $badtag ) { - $rest = str_replace( '>', '>', $rest ); - $text .= "<$slash$t $newparams$brace$rest"; - continue; - } - } - $text .= '<' . str_replace( '>', '>', $x); - } - # Close off any remaining tags - while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) { - $text .= "\n"; - if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); } - } - } else { - # this might be possible using tidy itself - foreach ( $bits as $x ) { - preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/', - $x, $regs ); - @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs; - if ( in_array( $t = strtolower( $t ), $htmlelements ) ) { - $newparams = $this->fixTagAttributes($params); - $rest = str_replace( '>', '>', $rest ); - $text .= "<$slash$t $newparams$brace$rest"; - } else { - $text .= '<' . str_replace( '>', '>', $x); - } - } - } - wfProfileOut( $fname ); - return $text; - } - - /** - * Remove '', and everything between. - * To avoid leaving blank lines, when a comment is both preceded - * and followed by a newline (ignoring spaces), trim leading and - * trailing spaces and one of the newlines. - * - * @access private - */ - function removeHTMLcomments( $text ) { - $fname='Parser::removeHTMLcomments'; - wfProfileIn( $fname ); - while (($start = strpos($text, '', $start + 4); - if ($end === false) { - # Unterminated comment; bail out - break; - } - - $end += 3; - - # Trim space and newline if the comment is both - # preceded and followed by a newline - $spaceStart = max($start - 1, 0); - $spaceLen = $end - $spaceStart; - while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) { - $spaceStart--; - $spaceLen++; - } - while (substr($text, $spaceStart + $spaceLen, 1) === ' ') - $spaceLen++; - if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") { - # Remove the comment, leading and trailing - # spaces, and leave only one newline. - $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1); - } - else { - # Remove just the comment. - $text = substr_replace($text, '', $start, $end - $start); - } - } - wfProfileOut( $fname ); - return $text; - } - /** * This function accomplishes several tasks: * 1) Auto-number headings if that option is enabled diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php new file mode 100644 index 0000000000..dab3c8ce4f --- /dev/null +++ b/includes/Sanitizer.php @@ -0,0 +1,245 @@ + et al + * http://www.mediawiki.org/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @package MediaWiki + */ + +class Sanitizer { + /** + * Cleans up HTML, removes dangerous tags and attributes, and + * removes HTML comments + * @access private + */ + function removeHTMLtags( $text ) { + global $wgUseTidy, $wgUserHtml; + $fname = 'Parser::removeHTMLtags'; + wfProfileIn( $fname ); + + if( $wgUserHtml ) { + $htmlpairs = array( # Tags that must be closed + 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', + 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', + 'strike', 'strong', 'tt', 'var', 'div', 'center', + 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', + 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span' + ); + $htmlsingle = array( + 'br', 'hr', 'li', 'dt', 'dd' + ); + $htmlnest = array( # Tags that can be nested--?? + 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', + 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span' + ); + $tabletags = array( # Can only appear inside table + 'td', 'th', 'tr' + ); + } else { + $htmlpairs = array(); + $htmlsingle = array(); + $htmlnest = array(); + $tabletags = array(); + } + + $htmlsingle = array_merge( $tabletags, $htmlsingle ); + $htmlelements = array_merge( $htmlsingle, $htmlpairs ); + + $htmlattrs = Sanitizer::getHTMLattrs () ; + + # Remove HTML comments + $text = Sanitizer::removeHTMLcomments( $text ); + + $bits = explode( '<', $text ); + $text = array_shift( $bits ); + if(!$wgUseTidy) { + $tagstack = array(); $tablestack = array(); + foreach ( $bits as $x ) { + $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) ); + preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/', + $x, $regs ); + list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs; + error_reporting( $prev ); + + $badtag = 0 ; + if ( in_array( $t = strtolower( $t ), $htmlelements ) ) { + # Check our stack + if ( $slash ) { + # Closing a tag... + if ( ! in_array( $t, $htmlsingle ) && + ( $ot = @array_pop( $tagstack ) ) != $t ) { + @array_push( $tagstack, $ot ); + $badtag = 1; + } else { + if ( $t == 'table' ) { + $tagstack = array_pop( $tablestack ); + } + $newparams = ''; + } + } else { + # Keep track for later + if ( in_array( $t, $tabletags ) && + ! in_array( 'table', $tagstack ) ) { + $badtag = 1; + } else if ( in_array( $t, $tagstack ) && + ! in_array ( $t , $htmlnest ) ) { + $badtag = 1 ; + } else if ( ! in_array( $t, $htmlsingle ) ) { + if ( $t == 'table' ) { + array_push( $tablestack, $tagstack ); + $tagstack = array(); + } + array_push( $tagstack, $t ); + } + # Strip non-approved attributes from the tag + $newparams = Sanitizer::fixTagAttributes($params); + + } + if ( ! $badtag ) { + $rest = str_replace( '>', '>', $rest ); + $text .= "<$slash$t $newparams$brace$rest"; + continue; + } + } + $text .= '<' . str_replace( '>', '>', $x); + } + # Close off any remaining tags + while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) { + $text .= "\n"; + if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); } + } + } else { + # this might be possible using tidy itself + foreach ( $bits as $x ) { + preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/', + $x, $regs ); + @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs; + if ( in_array( $t = strtolower( $t ), $htmlelements ) ) { + $newparams = Sanitizer::fixTagAttributes($params); + $rest = str_replace( '>', '>', $rest ); + $text .= "<$slash$t $newparams$brace$rest"; + } else { + $text .= '<' . str_replace( '>', '>', $x); + } + } + } + wfProfileOut( $fname ); + return $text; + } + + /** + * Remove '', and everything between. + * To avoid leaving blank lines, when a comment is both preceded + * and followed by a newline (ignoring spaces), trim leading and + * trailing spaces and one of the newlines. + * + * @access private + */ + function removeHTMLcomments( $text ) { + $fname='Parser::removeHTMLcomments'; + wfProfileIn( $fname ); + while (($start = strpos($text, '', $start + 4); + if ($end === false) { + # Unterminated comment; bail out + break; + } + + $end += 3; + + # Trim space and newline if the comment is both + # preceded and followed by a newline + $spaceStart = max($start - 1, 0); + $spaceLen = $end - $spaceStart; + while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) { + $spaceStart--; + $spaceLen++; + } + while (substr($text, $spaceStart + $spaceLen, 1) === ' ') + $spaceLen++; + if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") { + # Remove the comment, leading and trailing + # spaces, and leave only one newline. + $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1); + } + else { + # Remove just the comment. + $text = substr_replace($text, '', $start, $end - $start); + } + } + wfProfileOut( $fname ); + return $text; + } + + /** + * Return allowed HTML attributes + * + * @access private + */ + function getHTMLattrs () { + $htmlattrs = array( # Allowed attributes--no scripting, etc. + 'title', 'align', 'lang', 'dir', 'width', 'height', + 'bgcolor', 'clear', /* BR */ 'noshade', /* HR */ + 'cite', /* BLOCKQUOTE, Q */ 'size', 'face', 'color', + /* FONT */ 'type', 'start', 'value', 'compact', + /* For various lists, mostly deprecated but safe */ + 'summary', 'width', 'border', 'frame', 'rules', + 'cellspacing', 'cellpadding', 'valign', 'char', + 'charoff', 'colgroup', 'col', 'span', 'abbr', 'axis', + 'headers', 'scope', 'rowspan', 'colspan', /* Tables */ + 'id', 'class', 'name', 'style' /* For CSS */ + ); + return $htmlattrs ; + } + + /** + * Remove non approved attributes and javascript in css + * + * @access private + */ + function fixTagAttributes ( $t ) { + if ( trim ( $t ) == '' ) return '' ; # Saves runtime ;-) + $htmlattrs = Sanitizer::getHTMLattrs() ; + + # Strip non-approved attributes from the tag + $t = preg_replace( + '/(\\w+)(\\s*=\\s*([^\\s\">]+|\"[^\">]*\"))?/e', + "(in_array(strtolower(\"\$1\"),\$htmlattrs)?(\"\$1\".((\"x\$3\" != \"x\")?\"=\$3\":'')):'')", + $t); + + $t = str_replace ( '<>' , '' , $t ) ; # This should fix bug 980557 + + # Strip javascript "expression" from stylesheets. Brute force approach: + # If anythin offensive is found, all attributes of the HTML tag are dropped + + if( preg_match( + '/style\\s*=.*(expression|tps*:\/\/|url\\s*\().*/is', + wfMungeToUtf8( $t ) ) ) + { + $t=''; + } + + return trim ( $t ) ; + } + +} + +?> \ No newline at end of file -- 2.20.1