Fix bug 5497: Regression in HTML normalization in 1.6 (unclosed <li>)
[lhc/web/wiklou.git] / includes / Sanitizer.php
1 <?php
2 /**
3 * XHTML sanitizer for MediaWiki
4 *
5 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
6 * http://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @package MediaWiki
24 * @subpackage Parser
25 */
26
/**
 * Regular expression to match the character references handled by
 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
 *
 * Capture groups, in order:
 *  1. name of a named entity            (&name;)
 *  2. digits of a decimal reference     (&#123;)
 *  3. digits of a hex reference         (&#x7b;)
 *  4. digits of an upper-case X hex ref (&#X7B;)
 *  5. a bare ampersand that starts no reference
 *
 * The hex alternatives only accept real hexadecimal digits. Anything else
 * (e.g. "&#x4g1;") is not a character reference; it falls through to the
 * bare-ampersand alternative instead of being handed to hexdec(), which
 * would silently skip the invalid characters.
 */
define( 'MW_CHAR_REFS_REGEX',
'/&([A-Za-z0-9]+);
|&\#([0-9]+);
|&\#x([0-9A-Fa-f]+);
|&\#X([0-9A-Fa-f]+);
|(&)/x' );
37
/**
 * Regular expression to match name[=value] attribute pairs inside a tag.
 * Deliberately lenient about what it accepts.
 * Used by Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes.
 *
 * Capture groups, in order:
 *  1. attribute name
 *  2. the whole "= value" part, absent for valueless attributes
 *  3. value when double-quoted
 *  4. value when single-quoted
 *  5. value when unquoted
 *  6. unquoted #hex colour value
 */
$attrib = '[A-Za-z0-9]';
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
"/(?:^|{$space})({$attrib}+)
({$space}*={$space}*
(?:
# The attribute value: quoted or alone
\"([^<\"]*)\"
| '([^<']*)'
| ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
| (\#[0-9a-fA-F]+) # Technically wrong, but lots of
# colors are specified like this.
# We'll be normalizing it.
)
)?(?={$space}|\$)/sx" );
58
/**
 * Map of every named character entity defined in HTML 4.01 to its
 * Unicode code point.
 * http://www.w3.org/TR/html4/sgml/entities.html
 *
 * Entries are grouped by initial letter; the order is significant to
 * code that iterates over the table.
 * @private
 */
global $wgHtmlEntities;
$wgHtmlEntities = array(
	# A
	'Aacute' => 193, 'aacute' => 225, 'Acirc' => 194, 'acirc' => 226,
	'acute' => 180, 'AElig' => 198, 'aelig' => 230, 'Agrave' => 192,
	'agrave' => 224, 'alefsym' => 8501, 'Alpha' => 913, 'alpha' => 945,
	'amp' => 38, 'and' => 8743, 'ang' => 8736, 'Aring' => 197,
	'aring' => 229, 'asymp' => 8776, 'Atilde' => 195, 'atilde' => 227,
	'Auml' => 196, 'auml' => 228,
	# B
	'bdquo' => 8222, 'Beta' => 914, 'beta' => 946, 'brvbar' => 166,
	'bull' => 8226,
	# C
	'cap' => 8745, 'Ccedil' => 199, 'ccedil' => 231, 'cedil' => 184,
	'cent' => 162, 'Chi' => 935, 'chi' => 967, 'circ' => 710,
	'clubs' => 9827, 'cong' => 8773, 'copy' => 169, 'crarr' => 8629,
	'cup' => 8746, 'curren' => 164,
	# D
	'dagger' => 8224, 'Dagger' => 8225, 'darr' => 8595, 'dArr' => 8659,
	'deg' => 176, 'Delta' => 916, 'delta' => 948, 'diams' => 9830,
	'divide' => 247,
	# E
	'Eacute' => 201, 'eacute' => 233, 'Ecirc' => 202, 'ecirc' => 234,
	'Egrave' => 200, 'egrave' => 232, 'empty' => 8709, 'emsp' => 8195,
	'ensp' => 8194, 'Epsilon' => 917, 'epsilon' => 949, 'equiv' => 8801,
	'Eta' => 919, 'eta' => 951, 'ETH' => 208, 'eth' => 240,
	'Euml' => 203, 'euml' => 235, 'euro' => 8364, 'exist' => 8707,
	# F
	'fnof' => 402, 'forall' => 8704, 'frac12' => 189, 'frac14' => 188,
	'frac34' => 190, 'frasl' => 8260,
	# G
	'Gamma' => 915, 'gamma' => 947, 'ge' => 8805, 'gt' => 62,
	# H
	'harr' => 8596, 'hArr' => 8660, 'hearts' => 9829, 'hellip' => 8230,
	# I
	'Iacute' => 205, 'iacute' => 237, 'Icirc' => 206, 'icirc' => 238,
	'iexcl' => 161, 'Igrave' => 204, 'igrave' => 236, 'image' => 8465,
	'infin' => 8734, 'int' => 8747, 'Iota' => 921, 'iota' => 953,
	'iquest' => 191, 'isin' => 8712, 'Iuml' => 207, 'iuml' => 239,
	# K
	'Kappa' => 922, 'kappa' => 954,
	# L
	'Lambda' => 923, 'lambda' => 955, 'lang' => 9001, 'laquo' => 171,
	'larr' => 8592, 'lArr' => 8656, 'lceil' => 8968, 'ldquo' => 8220,
	'le' => 8804, 'lfloor' => 8970, 'lowast' => 8727, 'loz' => 9674,
	'lrm' => 8206, 'lsaquo' => 8249, 'lsquo' => 8216, 'lt' => 60,
	# M
	'macr' => 175, 'mdash' => 8212, 'micro' => 181, 'middot' => 183,
	'minus' => 8722, 'Mu' => 924, 'mu' => 956,
	# N
	'nabla' => 8711, 'nbsp' => 160, 'ndash' => 8211, 'ne' => 8800,
	'ni' => 8715, 'not' => 172, 'notin' => 8713, 'nsub' => 8836,
	'Ntilde' => 209, 'ntilde' => 241, 'Nu' => 925, 'nu' => 957,
	# O
	'Oacute' => 211, 'oacute' => 243, 'Ocirc' => 212, 'ocirc' => 244,
	'OElig' => 338, 'oelig' => 339, 'Ograve' => 210, 'ograve' => 242,
	'oline' => 8254, 'Omega' => 937, 'omega' => 969, 'Omicron' => 927,
	'omicron' => 959, 'oplus' => 8853, 'or' => 8744, 'ordf' => 170,
	'ordm' => 186, 'Oslash' => 216, 'oslash' => 248, 'Otilde' => 213,
	'otilde' => 245, 'otimes' => 8855, 'Ouml' => 214, 'ouml' => 246,
	# P
	'para' => 182, 'part' => 8706, 'permil' => 8240, 'perp' => 8869,
	'Phi' => 934, 'phi' => 966, 'Pi' => 928, 'pi' => 960,
	'piv' => 982, 'plusmn' => 177, 'pound' => 163, 'prime' => 8242,
	'Prime' => 8243, 'prod' => 8719, 'prop' => 8733, 'Psi' => 936,
	'psi' => 968,
	# Q
	'quot' => 34,
	# R
	'radic' => 8730, 'rang' => 9002, 'raquo' => 187, 'rarr' => 8594,
	'rArr' => 8658, 'rceil' => 8969, 'rdquo' => 8221, 'real' => 8476,
	'reg' => 174, 'rfloor' => 8971, 'Rho' => 929, 'rho' => 961,
	'rlm' => 8207, 'rsaquo' => 8250, 'rsquo' => 8217,
	# S
	'sbquo' => 8218, 'Scaron' => 352, 'scaron' => 353, 'sdot' => 8901,
	'sect' => 167, 'shy' => 173, 'Sigma' => 931, 'sigma' => 963,
	'sigmaf' => 962, 'sim' => 8764, 'spades' => 9824, 'sub' => 8834,
	'sube' => 8838, 'sum' => 8721, 'sup' => 8835, 'sup1' => 185,
	'sup2' => 178, 'sup3' => 179, 'supe' => 8839, 'szlig' => 223,
	# T
	'Tau' => 932, 'tau' => 964, 'there4' => 8756, 'Theta' => 920,
	'theta' => 952, 'thetasym' => 977, 'thinsp' => 8201, 'THORN' => 222,
	'thorn' => 254, 'tilde' => 732, 'times' => 215, 'trade' => 8482,
	# U
	'Uacute' => 218, 'uacute' => 250, 'uarr' => 8593, 'uArr' => 8657,
	'Ucirc' => 219, 'ucirc' => 251, 'Ugrave' => 217, 'ugrave' => 249,
	'uml' => 168, 'upsih' => 978, 'Upsilon' => 933, 'upsilon' => 965,
	'Uuml' => 220, 'uuml' => 252,
	# W
	'weierp' => 8472,
	# X
	'Xi' => 926, 'xi' => 958,
	# Y
	'Yacute' => 221, 'yacute' => 253, 'yen' => 165, 'Yuml' => 376,
	'yuml' => 255,
	# Z
	'Zeta' => 918, 'zeta' => 950, 'zwj' => 8205, 'zwnj' => 8204 );
318
319 /** @package MediaWiki */
320 class Sanitizer {
/**
 * Cleans up HTML: removes HTML comments, escapes tags that are not on
 * the allowed list, and strips non-approved attributes from the tags
 * that are kept.
 *
 * The text is split on '<'; each fragment is either recognised as an
 * allowed tag (and re-emitted in sanitized form) or has its '<' and
 * '>' escaped so it shows up as plain text. When $wgUseTidy is off,
 * a tag stack is kept so that badly nested closing tags are escaped
 * and any elements still open at the end are closed.
 *
 * @private
 * @param string $text
 * @param callback $processCallback to do any variable or parameter
 *   replacements in HTML attribute values; it is called with the
 *   attribute string by reference and $args
 * @param array $args for the processing callback
 * @return string
 */
function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
	global $wgUseTidy, $wgUserHtml;
	$fname = 'Parser::removeHTMLtags';
	wfProfileIn( $fname );

	if( $wgUserHtml ) {
		$htmlpairs = array( # Tags that must be closed
			'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
		);
		$htmlsingle = array(
			'br', 'hr', 'li', 'dt', 'dd'
		);
		$htmlsingleonly = array( # Elements that cannot have close tags
			'br', 'hr'
		);
		$htmlnest = array( # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
		);
		$tabletags = array( # Can only appear inside table
			'td', 'th', 'tr'
		);
	} else {
		# No user HTML allowed: every list is empty, so every tag is escaped.
		$htmlpairs = array();
		$htmlsingle = array();
		$htmlsingleonly = array();
		$htmlnest = array();
		$tabletags = array();
	}

	$htmlsingle = array_merge( $tabletags, $htmlsingle );
	$htmlelements = array_merge( $htmlsingle, $htmlpairs );

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );

	# Splits a fragment into: optional slash, tag name, attribute text,
	# closing brace (with optional slash) and the text after the tag.
	$tagRegex = '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/';

	$bits = explode( '<', $text );
	$text = array_shift( $bits );
	if( !$wgUseTidy ) {
		$tagstack = array();
		$tablestack = array();
		foreach ( $bits as $x ) {
			$regs = array();
			if ( !preg_match( $tagRegex, $x, $regs ) ) {
				# Not something that looks like a tag; show it literally
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
				continue;
			}
			list( , $slash, $t, $params, $brace, $rest ) = $regs;
			$t = strtolower( $t );

			$badtag = 0;
			$newparams = '';
			if ( in_array( $t, $htmlelements ) ) {
				# Check our stack
				if ( $slash ) {
					# Closing a tag...
					if( in_array( $t, $htmlsingleonly ) ) {
						$badtag = 1;
					} elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
						# Does not match the innermost open element;
						# put that element back and reject the close tag
						@array_push( $tagstack, $ot );
						$badtag = 1;
					} else {
						if ( $t == 'table' ) {
							# Leaving a table: restore the stack that was
							# active before the table was opened
							$tagstack = array_pop( $tablestack );
						}
						$newparams = '';
					}
				} else {
					# Keep track for later
					if ( in_array( $t, $tabletags ) &&
					! in_array( 'table', $tagstack ) ) {
						$badtag = 1;
					} else if ( in_array( $t, $tagstack ) &&
					! in_array ( $t , $htmlnest ) ) {
						$badtag = 1 ;
					} elseif( in_array( $t, $htmlsingleonly ) ) {
						# Hack to force empty tag for uncloseable elements
						$brace = '/>';
					} else if( in_array( $t, $htmlsingle ) ) {
						# Hack to not close $htmlsingle tags
						$brace = NULL;
					} else {
						if ( $t == 'table' ) {
							# Entering a table: start a fresh stack and
							# remember the outer one
							array_push( $tablestack, $tagstack );
							$tagstack = array();
						}
						array_push( $tagstack, $t );
					}

					# Replace any variables or template parameters with
					# plaintext results.
					if( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					# Strip non-approved attributes from the tag
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
				}
				if ( ! $badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$close = ( $brace == '/>' ) ? ' /' : '';
					$text .= "<$slash$t$newparams$close>$rest";
					continue;
				}
			}
			$text .= '&lt;' . str_replace( '>', '&gt;', $x);
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			$regs = array();
			if ( preg_match( $tagRegex, $x, $regs ) ) {
				list( , $slash, $t, $params, $brace, $rest ) = $regs;
				$t = strtolower( $t );
				if ( in_array( $t, $htmlelements ) ) {
					if( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
					$rest = str_replace( '>', '&gt;', $rest );
					$text .= "<$slash$t$newparams$brace$rest";
					continue;
				}
			}
			$text .= '&lt;' . str_replace( '>', '&gt;', $x);
		}
	}
	wfProfileOut( $fname );
	return $text;
}
462
/**
 * Strip every '<!-- ... -->' comment from the text.
 *
 * When a comment sits alone on its line (only spaces between it and
 * the newline on either side), the surrounding spaces and one of the
 * two newlines are removed as well, so no blank line is left behind.
 * Processing stops at the first comment that is never terminated.
 *
 * @private
 * @param string $text
 * @return string
 */
function removeHTMLcomments( $text ) {
	$fname='Parser::removeHTMLcomments';
	wfProfileIn( $fname );
	for ( ;; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		$close += 3;

		# Widen the span over the spaces before and after the comment
		$from = max( $open - 1, 0 );
		$span = $close - $from;
		while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
			$from--;
			$span++;
		}
		while ( substr( $text, $from + $span, 1 ) === ' ' ) {
			$span++;
		}

		$newlineBefore = ( substr( $text, $from, 1 ) === "\n" );
		$newlineAfter = ( substr( $text, $from + $span, 1 ) === "\n" );
		if ( $newlineBefore && $newlineAfter ) {
			# Drop the comment with its padding and collapse the two
			# newlines into one.
			$text = substr_replace( $text, "\n", $from, $span + 1 );
		} else {
			# Drop only the comment itself.
			$text = substr_replace( $text, '', $open, $close - $open );
		}
	}
	wfProfileOut( $fname );
	return $text;
}
508
/**
 * Take a tag soup fragment listing an HTML element's attributes
 * and normalize it to well-formed XML, discarding unwanted attributes.
 *
 * - Normalizes attribute names to lowercase
 * - Discards attributes not on a whitelist for the given element
 * - Turns broken or invalid entities into plaintext
 * - Double-quotes all attribute values
 * - Attributes without values are given the name as attribute
 * - Double attributes are discarded (the last occurrence wins)
 * - Unsafe style attributes are discarded
 * - Prepends space if there are attributes.
 *
 * @param string $text
 * @param string $element
 * @return string
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function fixTagAttributes( $text, $element ) {
	if( trim( $text ) == '' ) {
		return '';
	}

	# Unquoted attribute
	# Since we quote this later, this can be anything distinguishable
	# from the end of the attribute
	$pairs = array();
	if( !preg_match_all(
		MW_ATTRIBS_REGEX,
		$text,
		$pairs,
		PREG_SET_ORDER ) ) {
		return '';
	}

	$whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
	$attribs = array();
	foreach( $pairs as $set ) {
		$attribute = strtolower( $set[1] );
		if( !isset( $whitelist[$attribute] ) ) {
			continue;
		}

		$raw = Sanitizer::getTagAttributeCallback( $set );
		$value = Sanitizer::normalizeAttributeValue( $raw );

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if( $attribute == 'style' ) {
			$stripped = Sanitizer::decodeCharReferences( $value );

			// Remove any comments; IE gets token splitting wrong.
			// The 's' modifier is required: decoding can put a raw
			// newline inside a comment (e.g. from &#10;), and without
			// it such a comment would be left in place and could be
			// used to split a forbidden keyword past the check below.
			$stripped = preg_replace( '!/\\*.*?\\*/!sS', ' ', $stripped );
			$value = htmlspecialchars( $stripped );

			// ... and continue checks: undo CSS backslash escapes
			// so escaped spellings are caught too
			$stripped = preg_replace_callback(
				'!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
				array( 'Sanitizer', 'cssDecodeCallback' ),
				$stripped );
			$stripped = str_replace( '\\', '', $stripped );
			if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
				$stripped ) ) {
				# haxx0r
				continue;
			}
		}

		if ( $attribute === 'id' )
			$value = Sanitizer::escapeId( $value );

		# Templates and links may be expanded in later parsing,
		# creating invalid or dangerous output. Suppress this.
		$value = strtr( $value, array(
			'<' => '&lt;', // This should never happen,
			'>' => '&gt;', // we've received invalid input
			'"' => '&quot;', // which should have been escaped.
			'{' => '&#123;',
			'[' => '&#91;',
			"''" => '&#39;&#39;',
			'ISBN' => '&#73;SBN',
			'RFC' => '&#82;FC',
			'PMID' => '&#80;MID',
		) );

		# Stupid hack
		$value = preg_replace_callback(
			'/(' . wfUrlProtocols() . ')/',
			array( 'Sanitizer', 'armorLinksCallback' ),
			$value );

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$attribs[$attribute] = "$attribute=\"$value\"";
	}

	return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
}

/**
 * Regex replace callback that turns one CSS hexadecimal escape into
 * the UTF-8 text of the code point it names. Replaces the former
 * use of the /e (eval) pattern modifier in fixTagAttributes.
 *
 * @param array $matches $matches[1] holds the hex digits of the escape
 * @return string
 * @private
 */
function cssDecodeCallback( $matches ) {
	return codepointToUtf8( hexdec( $matches[1] ) );
}
607
/**
 * Escape a value for use in an id attribute. The value is escaped
 * only, not validated (see the first link).
 *
 * Spaces become underscores, character references are decoded, the
 * result is URL-encoded, and finally '%3A' is turned back into ':'
 * and every remaining '%' into '.'.
 *
 * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 *  in the id and name attributes
 * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 *
 * @bug 4461
 *
 * @static
 *
 * @param string $id
 * @return string
 */
function escapeId( $id ) {
	$underscored = strtr( $id, ' ', '_' );
	$encoded = urlencode( Sanitizer::decodeCharReferences( $underscored ) );

	# Order matters: restore colons before the generic '%' rewrite.
	return str_replace(
		array( '%3A', '%' ),
		array( ':', '.' ),
		$encoded );
}
634
/**
 * Regex replace callback that protects a matched link prefix from
 * further processing by writing each colon as a numeric character
 * reference.
 *
 * @param array $matches regex match; only $matches[1] is used
 * @return string
 * @private
 */
function armorLinksCallback( $matches ) {
	$protocol = $matches[1];
	return str_replace( ':', '&#58;', $protocol );
}
644
/**
 * Parse a partial tag string into an associative array mapping
 * attribute names to values. Names are forced to lowercase and
 * character references in the values are decoded. If a name occurs
 * more than once, the last occurrence wins.
 *
 * @param string $text
 * @return array
 */
function decodeTagAttributes( $text ) {
	$decoded = array();
	if( trim( $text ) == '' ) {
		return $decoded;
	}

	$matches = array();
	$found = preg_match_all( MW_ATTRIBS_REGEX, $text, $matches, PREG_SET_ORDER );
	if( !$found ) {
		return $decoded;
	}

	foreach( $matches as $match ) {
		$name = strtolower( $match[1] );
		$rawValue = Sanitizer::getTagAttributeCallback( $match );
		$decoded[$name] = Sanitizer::decodeCharReferences( $rawValue );
	}
	return $decoded;
}
676
/**
 * Extract the attribute value from one MW_ATTRIBS_REGEX match set.
 *
 * The value groups are probed from the highest index down:
 * 6 = unquoted #XXXXXX colour, 5 = unquoted, 4 = single-quoted,
 * 3 = double-quoted. If there is no value part at all, the
 * attribute name is returned as the value.
 *
 * @param array $set
 * @return string
 * @private
 */
function getTagAttributeCallback( $set ) {
	for( $group = 6; $group >= 3; $group-- ) {
		if( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}
	if( !isset( $set[2] ) ) {
		# In XHTML, attributes must have a value.
		# For the 'reduced' form, use the attribute name itself.
		return $set[1];
	}
	wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
}
706
/**
 * Normalize character references and whitespace in XML
 * source-encoded text destined for an attribute value.
 *
 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background.
 * Note that the result is not the attribute's value but an XML
 * source fragment that will be placed into the output: character
 * references are normalized, each whitespace character (CRLF counts
 * as one) becomes a single space, and double quotes become &quot;.
 *
 * @param string $text
 * @return string
 * @private
 */
function normalizeAttributeValue( $text ) {
	$withCleanRefs = Sanitizer::normalizeCharReferences( $text );
	$withSpaces = preg_replace(
		'/\r\n|[\x20\x0d\x0a\x09]/',
		' ',
		$withCleanRefs );
	return str_replace( '"', '&quot;', $withSpaces );
}
726
/**
 * Make sure all entities and character references in the text are
 * legal for XML, and for XHTML specifically. Stray pieces are
 * &amp;-escaped so that the result is a valid text fragment.
 *
 * a. any named char refs must be known in XHTML
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use &#x, not &#X
 * d. fix or reject non-valid attributes
 *
 * @param string $text
 * @return string
 * @private
 */
function normalizeCharReferences( $text ) {
	$callback = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
}
/**
 * Replacement callback for normalizeCharReferences. Dispatches on
 * which MW_CHAR_REFS_REGEX group matched; anything that cannot be
 * normalized (including a bare ampersand) is returned HTML-escaped.
 *
 * @param array $matches
 * @return string
 */
function normalizeCharReferencesCallback( $matches ) {
	if( $matches[1] != '' ) {
		# Named entity
		$normalized = Sanitizer::normalizeEntity( $matches[1] );
	} elseif( $matches[2] != '' ) {
		# Decimal reference
		$normalized = Sanitizer::decCharReference( $matches[2] );
	} elseif( $matches[3] != '' ) {
		# Hex reference written with a lower-case x
		$normalized = Sanitizer::hexCharReference( $matches[3] );
	} elseif( $matches[4] != '' ) {
		# Hex reference written with an upper-case X
		$normalized = Sanitizer::hexCharReference( $matches[4] );
	} else {
		$normalized = null;
	}

	return is_null( $normalized )
		? htmlspecialchars( $matches[0] )
		: $normalized;
}
768
/**
 * Return the entity reference unchanged when the name is listed in
 * $wgHtmlEntities; otherwise return the HTML-escaped source of the
 * pseudo-entity (eg &amp;foo;).
 *
 * @param string $name
 * @return string
 */
function normalizeEntity( $name ) {
	global $wgHtmlEntities;
	return isset( $wgHtmlEntities[$name] )
		? "&$name;"
		: "&amp;$name;";
}
785
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint the digits of the reference
 * @return string|null the reference in '&#N;' form, or null when the
 *   code point fails Sanitizer::validateCodepoint
 */
function decCharReference( $codepoint ) {
	$point = intval( $codepoint );
	if( !Sanitizer::validateCodepoint( $point ) ) {
		return null;
	}
	return sprintf( '&#%d;', $point );
}
794
/**
 * Normalize a hexadecimal character reference.
 *
 * Input containing anything but hexadecimal digits is rejected:
 * hexdec() skips characters it does not understand, so passing such
 * input through would turn a malformed reference into a different,
 * valid one.
 *
 * @param string $codepoint the hex digits of the reference
 * @return string|null the reference in lower-case '&#xN;' form, or null
 *   when the digits are malformed or the code point fails
 *   Sanitizer::validateCodepoint
 */
function hexCharReference( $codepoint ) {
	if( !preg_match( '/^[0-9A-Fa-f]+$/D', $codepoint ) ) {
		return null;
	}
	$point = hexdec( $codepoint );
	if( Sanitizer::validateCodepoint( $point ) ) {
		return sprintf( '&#x%x;', $point );
	} else {
		return null;
	}
}
803
/**
 * Tell whether a Unicode code point is a valid character in XML:
 * tab, line feed, carriage return, or anything in the ranges
 * 0x20-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint ) {
	# The three permitted control characters
	if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}
	# Up to, but not including, the surrogate block
	if( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
		return true;
	}
	# Remainder of the basic plane, minus 0xFFFE and 0xFFFF
	if( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}
	# Supplementary planes
	return ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
}
817
/**
 * Decode every character reference in the text, numeric or named,
 * and return the result as a UTF-8 string.
 *
 * @param string $text
 * @return string
 * @public
 */
function decodeCharReferences( $text ) {
	$callback = array( 'Sanitizer', 'decodeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
}
832
/**
 * Replacement callback for decodeCharReferences. Dispatches on which
 * MW_CHAR_REFS_REGEX group matched and returns the decoded text; a
 * match with none of the four reference groups set is returned
 * untouched.
 *
 * @param array $matches
 * @return string
 */
function decodeCharReferencesCallback( $matches ) {
	if( $matches[1] != '' ) {
		# Named entity
		return Sanitizer::decodeEntity( $matches[1] );
	}
	if( $matches[2] != '' ) {
		# Decimal reference
		return Sanitizer::decodeChar( intval( $matches[2] ) );
	}
	if( $matches[3] != '' ) {
		# Hex reference written with a lower-case x
		return Sanitizer::decodeChar( hexdec( $matches[3] ) );
	}
	if( $matches[4] != '' ) {
		# Hex reference written with an upper-case X
		return Sanitizer::decodeChar( hexdec( $matches[4] ) );
	}
	# Last case should be an ampersand by itself
	return $matches[0];
}
850
/**
 * Return the UTF-8 string for a code point when it passes
 * Sanitizer::validateCodepoint, otherwise the UTF8_REPLACEMENT
 * constant (U+FFFD REPLACEMENT CHARACTER).
 *
 * @param int $codepoint
 * @return string
 * @private
 */
function decodeChar( $codepoint ) {
	return Sanitizer::validateCodepoint( $codepoint )
		? codepointToUtf8( $codepoint )
		: UTF8_REPLACEMENT;
}
865
/**
 * Return the UTF-8 encoding of a named entity listed in
 * $wgHtmlEntities. For any other name, return the pseudo-entity
 * source unchanged (eg &foo;).
 *
 * @param string $name
 * @return string
 */
function decodeEntity( $name ) {
	global $wgHtmlEntities;
	if( !isset( $wgHtmlEntities[$name] ) ) {
		return "&$name;";
	}
	return codepointToUtf8( $wgHtmlEntities[$name] );
}
882
/**
 * Return the list of acceptable attribute names for an element.
 * The full table is built once, on first use, and kept in a static
 * variable. Elements that are not in the table get an empty list.
 *
 * @param string $element
 * @return array
 */
function attributeWhitelist( $element ) {
	static $list;
	if( !isset( $list ) ) {
		$list = Sanitizer::setupAttributeWhitelist();
	}
	if( isset( $list[$element] ) ) {
		return $list[$element];
	}
	return array();
}
899
/**
 * Build the table of allowed attributes, keyed by element name.
 * Each value is a list of attribute names.
 *
 * @return array
 */
function setupAttributeWhitelist() {
	# Attribute groups shared by several elements
	$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
	$block = array_merge( $common, array( 'align' ) );
	$tablealign = array( 'align', 'char', 'charoff', 'valign' );
	$tablecell = array(
		'abbr', 'axis', 'headers', 'scope', 'rowspan', 'colspan',
		# the remaining four are deprecated
		'nowrap', 'width', 'height', 'bgcolor'
	);
	$edit = array_merge( $common, array( 'cite', 'datetime' ) );
	$rowgroup = array_merge( $common, $tablealign );
	$column = array_merge( $common, array( 'span', 'width' ), $tablealign );
	$cell = array_merge( $common, $tablecell, $tablealign );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	$whitelist = array();

	# 7.5.4
	$whitelist['div'] = $block;
	$whitelist['center'] = $common; # deprecated
	$whitelist['span'] = $block; # ??

	# 7.5.5
	foreach( array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ) as $heading ) {
		$whitelist[$heading] = $block;
	}

	# 7.5.6 (address) and 8.2.4 (bdo) are not allowed

	# 9.2.1 -- dfn, samp, kbd, abbr and acronym are not allowed
	foreach( array( 'em', 'strong', 'cite', 'code', 'var' ) as $phrase ) {
		$whitelist[$phrase] = $common;
	}

	# 9.2.2 -- q is not allowed
	$whitelist['blockquote'] = array_merge( $common, array( 'cite' ) );

	# 9.2.3
	$whitelist['sub'] = $common;
	$whitelist['sup'] = $common;

	# 9.3.1
	$whitelist['p'] = $block;

	# 9.3.2
	$whitelist['br'] = array( 'id', 'class', 'title', 'style', 'clear' );

	# 9.3.4
	$whitelist['pre'] = array_merge( $common, array( 'width' ) );

	# 9.4
	$whitelist['ins'] = $edit;
	$whitelist['del'] = $edit;

	# 10.2
	$whitelist['ul'] = array_merge( $common, array( 'type' ) );
	$whitelist['ol'] = array_merge( $common, array( 'type', 'start' ) );
	$whitelist['li'] = array_merge( $common, array( 'type', 'value' ) );

	# 10.3
	foreach( array( 'dl', 'dd', 'dt' ) as $definition ) {
		$whitelist[$definition] = $common;
	}

	# 11.2.1
	$whitelist['table'] = array_merge( $common, array(
		'summary', 'width', 'border', 'frame',
		'rules', 'cellspacing', 'cellpadding',
		'align', 'bgcolor', 'frame', 'rules',
		'border'
	) );

	# 11.2.2
	$whitelist['caption'] = array_merge( $common, array( 'align' ) );

	# 11.2.3
	foreach( array( 'thead', 'tfoot', 'tbody' ) as $group ) {
		$whitelist[$group] = $rowgroup;
	}

	# 11.2.4
	$whitelist['colgroup'] = $column;
	$whitelist['col'] = $column;

	# 11.2.5
	$whitelist['tr'] = array_merge( $common, array( 'bgcolor' ), $tablealign );

	# 11.2.6
	$whitelist['td'] = $cell;
	$whitelist['th'] = $cell;

	# 15.2.1
	foreach( array( 'tt', 'b', 'i', 'big', 'small', 'strike', 's', 'u' ) as $fontstyle ) {
		$whitelist[$fontstyle] = $common;
	}

	# 15.2.2 -- basefont is not allowed
	$whitelist['font'] = array_merge( $common, array( 'size', 'color', 'face' ) );

	# 15.3
	$whitelist['hr'] = array_merge( $common, array( 'noshade', 'size', 'width' ) );

	# XHTML Ruby annotation text module, simple ruby only.
	# http://www.w3c.org/TR/ruby/
	# rbc and rtc are not allowed; rt does not get 'rbspan'.
	foreach( array( 'ruby', 'rb', 'rt', 'rp' ) as $rubyElement ) {
		$whitelist[$rubyElement] = $common;
	}

	return $whitelist;
}
1038
/**
 * Take a fragment of (potentially invalid) HTML and return
 * a version with any tags removed, encoded suitably for literal
 * inclusion in an attribute value.
 *
 * @param string $text HTML fragment
 * @return string
 */
function stripAllTags( $text ) {
	# Actual <tags>. The 's' modifier lets '.' cross line breaks, so a
	# tag that is spread over several lines is removed as well.
	$text = preg_replace( '/ < .*? > /xs', '', $text );

	# Normalize &entities and whitespace
	$text = Sanitizer::normalizeAttributeValue( $text );

	# Will be placed into "double-quoted" attributes,
	# make sure remaining bits are safe.
	$text = str_replace(
		array('<', '>', '"'),
		array('&lt;', '&gt;', '&quot;'),
		$text );

	return $text;
}
1063
/**
 * Build a private DOCTYPE that declares every entity listed in
 * $wgHtmlEntities as its numeric character reference.
 * PHP 4 seemed to know these if you gave it an HTML doctype, but
 * PHP 5.1 doesn't.
 *
 * Use for passing XHTML fragments to PHP's XML parsing functions
 *
 * @return string
 * @static
 */
function hackDocType() {
	global $wgHtmlEntities;
	$declarations = '';
	foreach( $wgHtmlEntities as $entity => $codepoint ) {
		$declarations .= '<!ENTITY ' . $entity . ' "&#' . $codepoint . ';">';
	}
	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1083
1084 }
1085
1086 ?>