Split the HTML sanitizer functions from the Parser monolith
[lhc/web/wiklou.git] / includes / Sanitizer.php
1 <?php
2
3 /**
4 * (X)HTML sanitizer for MediaWiki
5 *
6 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
7 * http://www.mediawiki.org/
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22 * http://www.gnu.org/copyleft/gpl.html
23 *
24 * @package MediaWiki
25 */
26
class Sanitizer {
	/**
	 * Cleans up HTML, removes dangerous tags and attributes, and
	 * removes HTML comments.
	 *
	 * The text is split on '<'. Every chunk that looks like a tag whose
	 * name is on the whitelist is re-emitted with its attributes passed
	 * through fixTagAttributes(); every other chunk has its '<' and '>'
	 * turned into '&lt;' / '&gt;'. The whitelist is empty unless the
	 * global $wgUserHtml is true.
	 *
	 * When the global $wgUseTidy is false, tag nesting is also checked:
	 * mismatched close tags are escaped, table cells/rows outside a table
	 * are escaped, and tags still open at the end of the text are closed.
	 * When $wgUseTidy is true no nesting check is done here (presumably
	 * Tidy repairs nesting at a later stage -- confirm against the caller).
	 *
	 * Methods of this class had no visibility keyword originally, i.e.
	 * they were callable from anywhere; they are declared "public static"
	 * so that the Sanitizer::method() call style keeps working on PHP
	 * versions that reject static calls to instance methods.
	 *
	 * @param string $text Text that may contain HTML
	 * @return string Sanitized text
	 */
	public static function removeHTMLtags( $text ) {
		global $wgUseTidy, $wgUserHtml;
		$fname = 'Sanitizer::removeHTMLtags';
		wfProfileIn( $fname );

		if ( $wgUserHtml ) {
			$htmlpairs = array( # Tags that must be closed
				'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
				'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
				'strike', 'strong', 'tt', 'var', 'div', 'center',
				'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
				'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
			);
			$htmlsingle = array(
				'br', 'hr', 'li', 'dt', 'dd'
			);
			$htmlnest = array( # Tags that can be nested--??
				'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
				'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
			);
			$tabletags = array( # Can only appear inside table
				'td', 'th', 'tr'
			);
		} else {
			$htmlpairs = array();
			$htmlsingle = array();
			$htmlnest = array();
			$tabletags = array();
		}

		# Table cells/rows are treated like single tags: they are never
		# pushed on the tag stack and do not need a matching close tag.
		$htmlsingle = array_merge( $tabletags, $htmlsingle );
		$htmlelements = array_merge( $htmlsingle, $htmlpairs );

		# Remove HTML comments
		$text = self::removeHTMLcomments( $text );

		# Groups: 1 = optional leading slash, 2 = tag name, 3 = attributes,
		# 4 = closing brace, 5 = text following the tag.
		$tagPattern = '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/';

		$bits = explode( '<', $text );
		$text = array_shift( $bits );
		if ( !$wgUseTidy ) {
			$tagstack = array();
			$tablestack = array();
			foreach ( $bits as $x ) {
				$regs = array();
				if ( !preg_match( $tagPattern, $x, $regs ) ) {
					# Does not look like a tag at all: escape it
					$text .= '&lt;' . str_replace( '>', '&gt;', $x );
					continue;
				}
				list( , $slash, $t, $params, $brace, $rest ) = $regs;
				$t = strtolower( $t );

				$badtag = !in_array( $t, $htmlelements, true );
				$newparams = '';
				if ( !$badtag && $slash !== '' ) {
					# Closing a tag: it must match the innermost open tag,
					# unless it is a single tag, which is not tracked.
					if ( !in_array( $t, $htmlsingle, true ) ) {
						$ot = array_pop( $tagstack );
						if ( $ot !== $t ) {
							# Mismatch: put the popped tag back (if there was one)
							if ( $ot !== null ) {
								array_push( $tagstack, $ot );
							}
							$badtag = true;
						} elseif ( $t === 'table' ) {
							# Leaving a table: restore the stack saved when it was opened
							$outer = array_pop( $tablestack );
							$tagstack = is_array( $outer ) ? $outer : array();
						}
					}
				} elseif ( !$badtag ) {
					# Opening a tag: keep track for later
					if ( in_array( $t, $tabletags, true ) &&
						!in_array( 'table', $tagstack, true ) ) {
						$badtag = true;
					} elseif ( in_array( $t, $tagstack, true ) &&
						!in_array( $t, $htmlnest, true ) ) {
						$badtag = true;
					} elseif ( !in_array( $t, $htmlsingle, true ) ) {
						if ( $t === 'table' ) {
							# Each table gets a fresh stack; the outer one is saved
							array_push( $tablestack, $tagstack );
							$tagstack = array();
						}
						array_push( $tagstack, $t );
					}
					if ( !$badtag ) {
						# Strip non-approved attributes from the tag
						$newparams = self::fixTagAttributes( $params );
					}
				}

				if ( $badtag ) {
					$text .= '&lt;' . str_replace( '>', '&gt;', $x );
				} else {
					$rest = str_replace( '>', '&gt;', $rest );
					$text .= "<$slash$t $newparams$brace$rest";
				}
			}
			# Close off any remaining tags
			while ( ( $t = array_pop( $tagstack ) ) !== null ) {
				$text .= "</$t>\n";
				if ( $t === 'table' ) {
					$outer = array_pop( $tablestack );
					$tagstack = is_array( $outer ) ? $outer : array();
				}
			}
		} else {
			# this might be possible using tidy itself
			foreach ( $bits as $x ) {
				$regs = array();
				if ( preg_match( $tagPattern, $x, $regs ) &&
					in_array( strtolower( $regs[2] ), $htmlelements, true ) ) {
					list( , $slash, $t, $params, $brace, $rest ) = $regs;
					$t = strtolower( $t );
					$newparams = self::fixTagAttributes( $params );
					$rest = str_replace( '>', '&gt;', $rest );
					$text .= "<$slash$t $newparams$brace$rest";
				} else {
					$text .= '&lt;' . str_replace( '>', '&gt;', $x );
				}
			}
		}
		wfProfileOut( $fname );
		return $text;
	}

	/**
	 * Remove '<!--', '-->', and everything between.
	 * To avoid leaving blank lines, when a comment is both preceded
	 * and followed by a newline (ignoring spaces), trim leading and
	 * trailing spaces and one of the newlines.
	 *
	 * An unterminated comment stops processing; it and everything after
	 * it are left in the text unchanged.
	 *
	 * @param string $text
	 * @return string Text without complete HTML comments
	 */
	public static function removeHTMLcomments( $text ) {
		$fname = 'Sanitizer::removeHTMLcomments';
		wfProfileIn( $fname );
		while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
			$end = strpos( $text, '-->', $start + 4 );
			if ( $end === false ) {
				# Unterminated comment; bail out
				break;
			}

			# $end now points just past the closing '-->'
			$end += 3;

			# Trim space and newline if the comment is both
			# preceded and followed by a newline.
			# $spaceStart is the character before the comment and
			# $spaceStart + $spaceLen the character after it.
			$spaceStart = max( $start - 1, 0 );
			$spaceLen = $end - $spaceStart;
			while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
				$spaceStart--;
				$spaceLen++;
			}
			while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
				$spaceLen++;
			}
			if ( substr( $text, $spaceStart, 1 ) === "\n"
				&& substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
				# Remove the comment, leading and trailing
				# spaces, and leave only one newline.
				$text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
			} else {
				# Remove just the comment.
				$text = substr_replace( $text, '', $start, $end - $start );
			}
		}
		wfProfileOut( $fname );
		return $text;
	}

	/**
	 * Return allowed HTML attributes
	 *
	 * The list is compared against lower-cased attribute names by
	 * fixTagAttributes().
	 *
	 * @return array List of lower-case attribute names
	 */
	public static function getHTMLattrs() {
		$htmlattrs = array( # Allowed attributes--no scripting, etc.
			'title', 'align', 'lang', 'dir', 'width', 'height',
			'bgcolor', 'clear', /* BR */ 'noshade', /* HR */
			'cite', /* BLOCKQUOTE, Q */ 'size', 'face', 'color',
			/* FONT */ 'type', 'start', 'value', 'compact',
			/* For various lists, mostly deprecated but safe */
			'summary', 'border', 'frame', 'rules',
			'cellspacing', 'cellpadding', 'valign', 'char',
			'charoff', 'colgroup', 'col', 'span', 'abbr', 'axis',
			'headers', 'scope', 'rowspan', 'colspan', /* Tables */
			'id', 'class', 'name', 'style' /* For CSS */
		);
		return $htmlattrs;
	}

	/**
	 * Remove non approved attributes and javascript in css
	 *
	 * Every "name" or "name=value" token whose lower-cased name is not
	 * returned by getHTMLattrs() is deleted; characters between tokens
	 * are left as they are. Whitespace around '=' is dropped from the
	 * tokens that are kept. If the result still contains a style
	 * attribute that looks like it carries an expression, a URL scheme
	 * or url(), the whole attribute string is discarded.
	 *
	 * @param string $t Raw attribute text of a tag
	 * @return string Filtered attribute text, trimmed
	 */
	public static function fixTagAttributes( $t ) {
		$t = (string)$t;
		if ( trim( $t ) === '' ) {
			return ''; # Saves runtime ;-)
		}
		$htmlattrs = self::getHTMLattrs();

		# Strip non-approved attributes from the tag.
		# A callback is used instead of the /e (eval) modifier: attribute
		# values come from user input and must never reach evaluated code.
		$t = preg_replace_callback(
			'/(\\w+)(\\s*=\\s*([^\\s\">]+|\"[^\">]*\"))?/',
			function ( $m ) use ( $htmlattrs ) {
				if ( !in_array( strtolower( $m[1] ), $htmlattrs, true ) ) {
					return '';
				}
				# Group 3 is absent when the attribute has no value
				$value = isset( $m[3] ) ? $m[3] : '';
				return ( $value !== '' ) ? $m[1] . '=' . $value : $m[1];
			},
			$t
		);
		if ( $t === null ) {
			# PCRE failure: fail closed and drop all attributes
			return '';
		}

		$t = str_replace( '<></>', '', $t ); # This should fix bug 980557

		# Strip javascript "expression" from stylesheets. Brute force approach:
		# If anything offensive is found, all attributes of the HTML tag are dropped

		if ( preg_match(
			'/style\\s*=.*(expression|tps*:\/\/|url\\s*\().*/is',
			wfMungeToUtf8( $t ) ) )
		{
			$t = '';
		}

		return trim( $t );
	}

}
244
245 ?>