includes/Sanitizer.php

   1 <?php
   2
   3 /**
   4  * (X)HTML sanitizer for MediaWiki
   5  *
   6  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   7  * http://www.mediawiki.org/
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with this program; if not, write to the Free Software Foundation, Inc.,
  21  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  22  * http://www.gnu.org/copyleft/gpl.html
  23  *
  24  * @package MediaWiki
  25  */
  26
  27 class Sanitizer {
  28         /**
  29          * Cleans up HTML, removes dangerous tags and attributes, and
  30          * removes HTML comments
  31          * @access private
  32          */
  33         function removeHTMLtags( $text ) {
  34                 global $wgUseTidy, $wgUserHtml;
  35                 $fname = 'Parser::removeHTMLtags';
  36                 wfProfileIn( $fname );
  37
  38                 if( $wgUserHtml ) {
  39                         $htmlpairs = array( # Tags that must be closed
  40                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
  41                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
  42                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
  43                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
  44                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
  45                         );
  46                         $htmlsingle = array(
  47                                 'br', 'hr', 'li', 'dt', 'dd'
  48                         );
  49                         $htmlnest = array( # Tags that can be nested--??
  50                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
  51                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
  52                         );
  53                         $tabletags = array( # Can only appear inside table
  54                                 'td', 'th', 'tr'
  55                         );
  56                 } else {
  57                         $htmlpairs = array();
  58                         $htmlsingle = array();
  59                         $htmlnest = array();
  60                         $tabletags = array();
  61                 }
  62
  63                 $htmlsingle = array_merge( $tabletags, $htmlsingle );
  64                 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
  65
  66                 $htmlattrs = Sanitizer::getHTMLattrs () ;
  67
  68                 # Remove HTML comments
  69                 $text = Sanitizer::removeHTMLcomments( $text );
  70
  71                 $bits = explode( '<', $text );
  72                 $text = array_shift( $bits );
  73                 if(!$wgUseTidy) {
  74                         $tagstack = array(); $tablestack = array();
  75                         foreach ( $bits as $x ) {
  76                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
  77                                 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
  78                                 $x, $regs );
  79                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
  80                                 error_reporting( $prev );
  81
  82                                 $badtag = 0 ;
  83                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
  84                                         # Check our stack
  85                                         if ( $slash ) {
  86                                                 # Closing a tag...
  87                                                 if ( ! in_array( $t, $htmlsingle ) &&
  88                                                 ( $ot = @array_pop( $tagstack ) ) != $t ) {
  89                                                         @array_push( $tagstack, $ot );
  90                                                         $badtag = 1;
  91                                                 } else {
  92                                                         if ( $t == 'table' ) {
  93                                                                 $tagstack = array_pop( $tablestack );
  94                                                         }
  95                                                         $newparams = '';
  96                                                 }
  97                                         } else {
  98                                                 # Keep track for later
  99                                                 if ( in_array( $t, $tabletags ) &&
 100                                                 ! in_array( 'table', $tagstack ) ) {
 101                                                         $badtag = 1;
 102                                                 } else if ( in_array( $t, $tagstack ) &&
 103                                                 ! in_array ( $t , $htmlnest ) ) {
 104                                                         $badtag = 1 ;
 105                                                 } else if ( ! in_array( $t, $htmlsingle ) ) {
 106                                                         if ( $t == 'table' ) {
 107                                                                 array_push( $tablestack, $tagstack );
 108                                                                 $tagstack = array();
 109                                                         }
 110                                                         array_push( $tagstack, $t );
 111                                                 }
 112                                                 # Strip non-approved attributes from the tag
 113                                                 $newparams = Sanitizer::fixTagAttributes($params);
 114
 115                                         }
 116                                         if ( ! $badtag ) {
 117                                                 $rest = str_replace( '>', '&gt;', $rest );
 118                                                 $text .= "<$slash$t $newparams$brace$rest";
 119                                                 continue;
 120                                         }
 121                                 }
 122                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 123                         }
 124                         # Close off any remaining tags
 125                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 126                                 $text .= "</$t>\n";
 127                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 128                         }
 129                 } else {
 130                         # this might be possible using tidy itself
 131                         foreach ( $bits as $x ) {
 132                                 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
 133                                 $x, $regs );
 134                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 135                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 136                                         $newparams = Sanitizer::fixTagAttributes($params);
 137                                         $rest = str_replace( '>', '&gt;', $rest );
 138                                         $text .= "<$slash$t $newparams$brace$rest";
 139                                 } else {
 140                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 141                                 }
 142                         }
 143                 }
 144                 wfProfileOut( $fname );
 145                 return $text;
 146         }
 147
 148         /**
 149          * Remove '<!--', '-->', and everything between.
 150          * To avoid leaving blank lines, when a comment is both preceded
 151          * and followed by a newline (ignoring spaces), trim leading and
 152          * trailing spaces and one of the newlines.
 153          *
 154          * @access private
 155          */
 156         function removeHTMLcomments( $text ) {
 157                 $fname='Parser::removeHTMLcomments';
 158                 wfProfileIn( $fname );
 159                 while (($start = strpos($text, '<!--')) !== false) {
 160                         $end = strpos($text, '-->', $start + 4);
 161                         if ($end === false) {
 162                                 # Unterminated comment; bail out
 163                                 break;
 164                         }
 165
 166                         $end += 3;
 167
 168                         # Trim space and newline if the comment is both
 169                         # preceded and followed by a newline
 170                         $spaceStart = max($start - 1, 0);
 171                         $spaceLen = $end - $spaceStart;
 172                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 173                                 $spaceStart--;
 174                                 $spaceLen++;
 175                         }
 176                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 177                                 $spaceLen++;
 178                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 179                                 # Remove the comment, leading and trailing
 180                                 # spaces, and leave only one newline.
 181                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 182                         }
 183                         else {
 184                                 # Remove just the comment.
 185                                 $text = substr_replace($text, '', $start, $end - $start);
 186                         }
 187                 }
 188                 wfProfileOut( $fname );
 189                 return $text;
 190         }
 191
 192         /**
 193          * Return allowed HTML attributes
 194          *
 195          * @access private
 196          */
 197         function getHTMLattrs () {
 198                 $htmlattrs = array( # Allowed attributes--no scripting, etc.
 199                                 'title', 'align', 'lang', 'dir', 'width', 'height',
 200                                 'bgcolor', 'clear', /* BR */ 'noshade', /* HR */
 201                                 'cite', /* BLOCKQUOTE, Q */ 'size', 'face', 'color',
 202                                 /* FONT */ 'type', 'start', 'value', 'compact',
 203                                 /* For various lists, mostly deprecated but safe */
 204                                 'summary', 'width', 'border', 'frame', 'rules',
 205                                 'cellspacing', 'cellpadding', 'valign', 'char',
 206                                 'charoff', 'colgroup', 'col', 'span', 'abbr', 'axis',
 207                                 'headers', 'scope', 'rowspan', 'colspan', /* Tables */
 208                                 'id', 'class', 'name', 'style' /* For CSS */
 209                                 );
 210                 return $htmlattrs ;
 211         }
 212
 213         /**
 214          * Remove non approved attributes and javascript in css
 215          *
 216          * @access private
 217          */
 218         function fixTagAttributes ( $t ) {
 219                 if ( trim ( $t ) == '' ) return '' ; # Saves runtime ;-)
 220                 $htmlattrs = Sanitizer::getHTMLattrs() ;
 221
 222                 # Strip non-approved attributes from the tag
 223                 $t = preg_replace(
 224                         '/(\\w+)(\\s*=\\s*([^\\s\">]+|\"[^\">]*\"))?/e',
 225                         "(in_array(strtolower(\"\$1\"),\$htmlattrs)?(\"\$1\".((\"x\$3\" != \"x\")?\"=\$3\":'')):'')",
 226                         $t);
 227
 228                 $t = str_replace ( '<></>' , '' , $t ) ; # This should fix bug 980557
 229
 230                 # Strip javascript "expression" from stylesheets. Brute force approach:
 231                 # If anythin offensive is found, all attributes of the HTML tag are dropped
 232
 233                 if( preg_match(
 234                         '/style\\s*=.*(expression|tps*:\/\/|url\\s*\().*/is',
 235                         wfMungeToUtf8( $t ) ) )
 236                 {
 237                         $t='';
 238                 }
 239
 240                 return trim ( $t ) ;
 241         }
 242
 243 }
 244
 245 ?>