unused variables as per #3692
[lhc/web/wiklou.git] / includes / Sanitizer.php
1 <?php
2 /**
3 * XHTML sanitizer for MediaWiki
4 *
5 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
6 * http://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @package MediaWiki
24 * @subpackage Parser
25 */
26
/**
 * Regular expression to match various types of character references in
 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
 *
 * Capture groups (the callbacks depend on these positions):
 *   1 - name of a named entity, e.g. "amp" in "&amp;"
 *   2 - digits of a decimal reference, e.g. "160" in "&#160;"
 *   3 - digits of a hexadecimal reference introduced by a lowercase "x"
 *   4 - digits of a hexadecimal reference introduced by an uppercase "X"
 *   5 - a bare ampersand that starts none of the above
 *
 * The hexadecimal alternatives accept real hex digits only. hexdec()
 * silently skips characters it does not understand, so a looser class
 * would let something like "&#x4g1;" be read as U+0041. Such input now
 * falls through to the bare-ampersand alternative instead.
 */
define( 'MW_CHAR_REFS_REGEX',
	'/&([A-Za-z0-9]+);
	 |&\#([0-9]+);
	 |&\#x([0-9A-Fa-f]+);
	 |&\#X([0-9A-Fa-f]+);
	 |(&)/x' );
37
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * Allows some... latitude.
 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
 *
 * Capture groups, as consumed by Sanitizer::getTagAttributeCallback:
 *   1 - attribute name
 *   2 - the whole "= value" part; absent when the attribute has no value
 *   3 - value written in double quotes
 *   4 - value written in single quotes
 *   5 - unquoted value
 *   6 - unquoted "#hex" colour value
 *
 * The pattern uses the "x" modifier, so the line breaks and the "#"
 * comments inside the string belong to the regex and are ignored by PCRE.
 *
 * NOTE(review): the character class of group 5 already contains "#" and
 * all hex digits, so group 6 looks unreachable -- confirm before relying
 * on it.
 */
# Character class for a single attribute-name character.
$attrib = '[A-Za-z0-9]';
# Character class for the whitespace accepted between attributes: tab, LF, CR, space.
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
"/(?:^|$space)($attrib+)
($space*=$space*
(?:
# The attribute value: quoted or alone
\"([^<\"]*)\"
| '([^<']*)'
| ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
| (\#[0-9a-fA-F]+) # Technically wrong, but lots of
# colors are specified like this.
# We'll be normalizing it.
)
)?(?=$space|\$)/sx" );
58
/**
 * Map of every named character entity defined in HTML 4.01 to its
 * Unicode code point, keyed by the (case-sensitive) entity name.
 * http://www.w3.org/TR/html4/sgml/entities.html
 *
 * Entries are grouped by initial letter; the iteration order is the
 * order in which Sanitizer::hackDocType emits its declarations.
 * @private
 */
global $wgHtmlEntities;
$wgHtmlEntities = array(
	# A
	'Aacute' => 193, 'aacute' => 225, 'Acirc' => 194, 'acirc' => 226,
	'acute' => 180, 'AElig' => 198, 'aelig' => 230, 'Agrave' => 192,
	'agrave' => 224, 'alefsym' => 8501, 'Alpha' => 913, 'alpha' => 945,
	'amp' => 38, 'and' => 8743, 'ang' => 8736, 'Aring' => 197,
	'aring' => 229, 'asymp' => 8776, 'Atilde' => 195, 'atilde' => 227,
	'Auml' => 196, 'auml' => 228,
	# B
	'bdquo' => 8222, 'Beta' => 914, 'beta' => 946, 'brvbar' => 166,
	'bull' => 8226,
	# C
	'cap' => 8745, 'Ccedil' => 199, 'ccedil' => 231, 'cedil' => 184,
	'cent' => 162, 'Chi' => 935, 'chi' => 967, 'circ' => 710,
	'clubs' => 9827, 'cong' => 8773, 'copy' => 169, 'crarr' => 8629,
	'cup' => 8746, 'curren' => 164,
	# D
	'dagger' => 8224, 'Dagger' => 8225, 'darr' => 8595, 'dArr' => 8659,
	'deg' => 176, 'Delta' => 916, 'delta' => 948, 'diams' => 9830,
	'divide' => 247,
	# E
	'Eacute' => 201, 'eacute' => 233, 'Ecirc' => 202, 'ecirc' => 234,
	'Egrave' => 200, 'egrave' => 232, 'empty' => 8709, 'emsp' => 8195,
	'ensp' => 8194, 'Epsilon' => 917, 'epsilon' => 949, 'equiv' => 8801,
	'Eta' => 919, 'eta' => 951, 'ETH' => 208, 'eth' => 240,
	'Euml' => 203, 'euml' => 235, 'euro' => 8364, 'exist' => 8707,
	# F
	'fnof' => 402, 'forall' => 8704, 'frac12' => 189, 'frac14' => 188,
	'frac34' => 190, 'frasl' => 8260,
	# G
	'Gamma' => 915, 'gamma' => 947, 'ge' => 8805, 'gt' => 62,
	# H
	'harr' => 8596, 'hArr' => 8660, 'hearts' => 9829, 'hellip' => 8230,
	# I
	'Iacute' => 205, 'iacute' => 237, 'Icirc' => 206, 'icirc' => 238,
	'iexcl' => 161, 'Igrave' => 204, 'igrave' => 236, 'image' => 8465,
	'infin' => 8734, 'int' => 8747, 'Iota' => 921, 'iota' => 953,
	'iquest' => 191, 'isin' => 8712, 'Iuml' => 207, 'iuml' => 239,
	# K
	'Kappa' => 922, 'kappa' => 954,
	# L
	'Lambda' => 923, 'lambda' => 955, 'lang' => 9001, 'laquo' => 171,
	'larr' => 8592, 'lArr' => 8656, 'lceil' => 8968, 'ldquo' => 8220,
	'le' => 8804, 'lfloor' => 8970, 'lowast' => 8727, 'loz' => 9674,
	'lrm' => 8206, 'lsaquo' => 8249, 'lsquo' => 8216, 'lt' => 60,
	# M
	'macr' => 175, 'mdash' => 8212, 'micro' => 181, 'middot' => 183,
	'minus' => 8722, 'Mu' => 924, 'mu' => 956,
	# N
	'nabla' => 8711, 'nbsp' => 160, 'ndash' => 8211, 'ne' => 8800,
	'ni' => 8715, 'not' => 172, 'notin' => 8713, 'nsub' => 8836,
	'Ntilde' => 209, 'ntilde' => 241, 'Nu' => 925, 'nu' => 957,
	# O
	'Oacute' => 211, 'oacute' => 243, 'Ocirc' => 212, 'ocirc' => 244,
	'OElig' => 338, 'oelig' => 339, 'Ograve' => 210, 'ograve' => 242,
	'oline' => 8254, 'Omega' => 937, 'omega' => 969, 'Omicron' => 927,
	'omicron' => 959, 'oplus' => 8853, 'or' => 8744, 'ordf' => 170,
	'ordm' => 186, 'Oslash' => 216, 'oslash' => 248, 'Otilde' => 213,
	'otilde' => 245, 'otimes' => 8855, 'Ouml' => 214, 'ouml' => 246,
	# P
	'para' => 182, 'part' => 8706, 'permil' => 8240, 'perp' => 8869,
	'Phi' => 934, 'phi' => 966, 'Pi' => 928, 'pi' => 960,
	'piv' => 982, 'plusmn' => 177, 'pound' => 163, 'prime' => 8242,
	'Prime' => 8243, 'prod' => 8719, 'prop' => 8733, 'Psi' => 936,
	'psi' => 968,
	# Q
	'quot' => 34,
	# R
	'radic' => 8730, 'rang' => 9002, 'raquo' => 187, 'rarr' => 8594,
	'rArr' => 8658, 'rceil' => 8969, 'rdquo' => 8221, 'real' => 8476,
	'reg' => 174, 'rfloor' => 8971, 'Rho' => 929, 'rho' => 961,
	'rlm' => 8207, 'rsaquo' => 8250, 'rsquo' => 8217,
	# S
	'sbquo' => 8218, 'Scaron' => 352, 'scaron' => 353, 'sdot' => 8901,
	'sect' => 167, 'shy' => 173, 'Sigma' => 931, 'sigma' => 963,
	'sigmaf' => 962, 'sim' => 8764, 'spades' => 9824, 'sub' => 8834,
	'sube' => 8838, 'sum' => 8721, 'sup' => 8835, 'sup1' => 185,
	'sup2' => 178, 'sup3' => 179, 'supe' => 8839, 'szlig' => 223,
	# T
	'Tau' => 932, 'tau' => 964, 'there4' => 8756, 'Theta' => 920,
	'theta' => 952, 'thetasym' => 977, 'thinsp' => 8201, 'THORN' => 222,
	'thorn' => 254, 'tilde' => 732, 'times' => 215, 'trade' => 8482,
	# U
	'Uacute' => 218, 'uacute' => 250, 'uarr' => 8593, 'uArr' => 8657,
	'Ucirc' => 219, 'ucirc' => 251, 'Ugrave' => 217, 'ugrave' => 249,
	'uml' => 168, 'upsih' => 978, 'Upsilon' => 933, 'upsilon' => 965,
	'Uuml' => 220, 'uuml' => 252,
	# W
	'weierp' => 8472,
	# X
	'Xi' => 926, 'xi' => 958,
	# Y
	'Yacute' => 221, 'yacute' => 253, 'yen' => 165, 'Yuml' => 376,
	'yuml' => 255,
	# Z
	'Zeta' => 918, 'zeta' => 950, 'zwj' => 8205, 'zwnj' => 8204 );
318
319 /** @package MediaWiki */
320 class Sanitizer {
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on "<". Each piece that parses as a tag whose name
 * is on the allowed list is re-emitted with sanitized attributes; every
 * other piece is emitted as literal text with "<" and ">" escaped.
 * When $wgUseTidy is off, a stack of open elements is kept so that
 * mismatched close tags are escaped and still-open elements are closed
 * at the end. When $wgUseTidy is on, nesting is not tracked here.
 *
 * @private
 * @param string $text
 * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values;
 *                 called with the attribute text (by reference) and $args
 * @param array $args for the processing callback
 * @return string
 */
function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
	global $wgUseTidy, $wgUserHtml;
	$fname = 'Parser::removeHTMLtags';
	wfProfileIn( $fname );

	if( $wgUserHtml ) {
		$htmlpairs = array( # Tags that must be closed
			'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
		);
		$htmlsingle = array(
			'br', 'hr', 'li', 'dt', 'dd'
		);
		$htmlsingleonly = array( # Elements that cannot have close tags
			'br', 'hr'
		);
		$htmlnest = array( # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
		);
		$tabletags = array( # Can only appear inside table
			'td', 'th', 'tr'
		);
	} else {
		# Every list is defined here as well, so no branch below ever
		# reads an undefined variable.
		$htmlpairs = array();
		$htmlsingle = array();
		$htmlsingleonly = array();
		$htmlnest = array();
		$tabletags = array();
	}

	$htmlsingle = array_merge( $tabletags, $htmlsingle );
	$htmlelements = array_merge( $htmlsingle, $htmlpairs );

	# Pieces of a tag: optional "/", name, attribute text, ">" or "/>",
	# and the text that follows up to the next "<".
	$tagPattern = '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/';

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );
	$bits = explode( '<', $text );
	$text = array_shift( $bits );
	if( !$wgUseTidy ) {
		$tagstack = array(); $tablestack = array();
		foreach ( $bits as $x ) {
			$regs = array();
			if ( !preg_match( $tagPattern, $x, $regs ) ) {
				# Does not look like a tag at all: keep it as literal text
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
				continue;
			}
			list( , $slash, $t, $params, $brace, $rest ) = $regs;
			$t = strtolower( $t );

			$badtag = 0;
			if ( in_array( $t, $htmlelements ) ) {
				# Check our stack
				if ( $slash ) {
					# Closing a tag...
					if( in_array( $t, $htmlsingleonly ) ) {
						$badtag = 1;
					} else {
						$ot = array_pop( $tagstack );
						if ( $ot != $t ) {
							# Not the innermost open element. Put it back,
							# unless the stack was empty and nothing was popped.
							if ( !is_null( $ot ) ) {
								array_push( $tagstack, $ot );
							}
							$badtag = 1;
						} else {
							if ( $t == 'table' ) {
								# Back to the elements open outside the table
								$tagstack = array_pop( $tablestack );
							}
							$newparams = '';
						}
					}
				} else {
					# Keep track for later
					if ( in_array( $t, $tabletags ) &&
						!in_array( 'table', $tagstack ) ) {
						$badtag = 1;
					} elseif ( in_array( $t, $tagstack ) &&
						!in_array( $t, $htmlnest ) ) {
						$badtag = 1;
					# Is it a self closed htmlpair ? (bug 5487)
					} elseif ( $brace == '/>' &&
						in_array( $t, $htmlpairs ) ) {
						$badtag = 1;
					} elseif ( in_array( $t, $htmlsingleonly ) ) {
						# Hack to force empty tag for uncloseable elements
						$brace = '/>';
					} elseif ( in_array( $t, $htmlsingle ) ) {
						# Hack to not close $htmlsingle tags
						$brace = NULL;
					} else {
						if ( $t == 'table' ) {
							# A table starts with a fresh stack of its own
							array_push( $tablestack, $tagstack );
							$tagstack = array();
						}
						array_push( $tagstack, $t );
					}

					# Replace any variables or template parameters with
					# plaintext results.
					if( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					# Strip non-approved attributes from the tag
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
				}
				if ( !$badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$close = ( $brace == '/>' ) ? ' /' : '';
					$text .= "<$slash$t$newparams$close>$rest";
					continue;
				}
			}
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			$regs = array();
			$isAllowedTag = false;
			if ( preg_match( $tagPattern, $x, $regs ) ) {
				list( , $slash, $t, $params, $brace, $rest ) = $regs;
				$t = strtolower( $t );
				$isAllowedTag = in_array( $t, $htmlelements );
			}
			if ( $isAllowedTag ) {
				if( is_callable( $processCallback ) ) {
					call_user_func_array( $processCallback, array( &$params, $args ) );
				}
				$newparams = Sanitizer::fixTagAttributes( $params, $t );
				$rest = str_replace( '>', '&gt;', $rest );
				$text .= "<$slash$t$newparams$brace$rest";
			} else {
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
			}
		}
	}
	wfProfileOut( $fname );
	return $text;
}
465
/**
 * Strip every "<!-- ... -->" comment from the text.
 *
 * A comment that sits alone on its line -- only spaces between it and
 * the newlines on either side -- is removed together with those spaces
 * and one of the two newlines, so no blank line is left behind.
 * An unterminated comment, and anything after it, is left untouched.
 *
 * @private
 * @param string $text
 * @return string
 */
function removeHTMLcomments( $text ) {
	$fname='Parser::removeHTMLcomments';
	wfProfileIn( $fname );
	for ( ;; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		# Step past the "-->" itself
		$close += 3;

		# Widen the span over the character before the comment and any
		# run of spaces on either side of it.
		$left = max( $open - 1, 0 );
		$span = $close - $left;
		while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
			$left--;
			$span++;
		}
		while ( substr( $text, $left + $span, 1 ) === ' ' ) {
			$span++;
		}

		$newlineBefore = ( substr( $text, $left, 1 ) === "\n" );
		$newlineAfter = ( substr( $text, $left + $span, 1 ) === "\n" );
		if ( $newlineBefore && $newlineAfter ) {
			# Comment on a line of its own: drop it with the surrounding
			# spaces and collapse the two newlines into one.
			$text = substr_replace( $text, "\n", $left, $span + 1 );
		} else {
			# Remove just the comment.
			$text = substr_replace( $text, '', $open, $close - $open );
		}
	}
	wfProfileOut( $fname );
	return $text;
}
511
/**
 * Take a tag soup fragment listing an HTML element's attributes
 * and normalize it to well-formed XML, discarding unwanted attributes.
 *
 * - Normalizes attribute names to lowercase
 * - Discards attributes not on a whitelist for the given element
 * - Turns broken or invalid entities into plaintext
 * - Double-quotes all attribute values
 * - Attributes without values are given the name as attribute
 * - Double attributes are discarded
 * - Unsafe style attributes are discarded
 * - Prepends space if there are attributes.
 *
 * @param string $text
 * @param string $element
 * @return string
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function fixTagAttributes( $text, $element ) {
	if( trim( $text ) == '' ) {
		return '';
	}

	# Unquoted attribute
	# Since we quote this later, this can be anything distinguishable
	# from the end of the attribute
	$pairs = array();
	if( !preg_match_all(
		MW_ATTRIBS_REGEX,
		$text,
		$pairs,
		PREG_SET_ORDER ) ) {
		return '';
	}

	$whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
	$attribs = array();
	foreach( $pairs as $set ) {
		$attribute = strtolower( $set[1] );
		if( !isset( $whitelist[$attribute] ) ) {
			continue;
		}

		$raw = Sanitizer::getTagAttributeCallback( $set );
		$value = Sanitizer::normalizeAttributeValue( $raw );

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if( $attribute == 'style' ) {
			$stripped = Sanitizer::decodeCharReferences( $value );

			// Remove any comments; IE gets token splitting wrong
			$stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );
			$value = htmlspecialchars( $stripped );

			// ... and continue checks.
			// Expand CSS hexadecimal escapes through a callback rather than
			// the "e" modifier: that modifier evaluates the replacement as
			// PHP code and no longer exists in current PHP, where the call
			// would fail and leave nothing for the check below to inspect.
			$stripped = preg_replace_callback(
				'!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
				array( 'Sanitizer', 'cssDecodeCallback' ),
				$stripped );
			$stripped = str_replace( '\\', '', $stripped );
			if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
					$stripped ) ) {
				# haxx0r
				continue;
			}
		}

		if ( $attribute === 'id' )
			$value = Sanitizer::escapeId( $value );

		# Templates and links may be expanded in later parsing,
		# creating invalid or dangerous output. Suppress this.
		$value = strtr( $value, array(
			'<' => '&lt;', // This should never happen,
			'>' => '&gt;', // we've received invalid input
			'"' => '&quot;', // which should have been escaped.
			'{' => '&#123;',
			'[' => '&#91;',
			"''" => '&#39;&#39;',
			'ISBN' => '&#73;SBN',
			'RFC' => '&#82;FC',
			'PMID' => '&#80;MID',
		) );

		# Stupid hack
		$value = preg_replace_callback(
			'/(' . wfUrlProtocols() . ')/',
			array( 'Sanitizer', 'armorLinksCallback' ),
			$value );

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$attribs[$attribute] = "$attribute=\"$value\"";
	}

	return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
}

/**
 * Regex replace callback used by fixTagAttributes: turns one CSS
 * hexadecimal escape into the UTF-8 character it stands for.
 *
 * @param array $matches match set; $matches[1] holds the hex digits
 * @return string
 * @private
 */
function cssDecodeCallback( $matches ) {
	return codepointToUtf8( hexdec( $matches[1] ) );
}
610
/**
 * Escape a value for use in an id attribute. The result is not checked
 * for validity as an id (see the first link).
 *
 * Spaces become underscores, character references are decoded and the
 * result is URL-encoded; then "%3A" is turned back into ":" and every
 * remaining "%" into ".".
 *
 * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 *                                                          in the id and
 *                                                          name attributes
 * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 *
 * @bug 4461
 *
 * @static
 *
 * @param string $id
 * @return string
 */
function escapeId( $id ) {
	$underscored = str_replace( ' ', '_', $id );
	$encoded = urlencode( Sanitizer::decodeCharReferences( $underscored ) );

	# Order matters: restore colons before the generic "%" replacement.
	return str_replace(
		array( '%3A', '%' ),
		array( ':', '.' ),
		$encoded );
}
637
/**
 * Regex replace callback that armors a matched link prefix against
 * further processing by writing each ":" in the first capture group
 * as the character reference "&#58;".
 *
 * @param array $matches
 * @return string
 * @private
 */
function armorLinksCallback( $matches ) {
	$pieces = explode( ':', $matches[1] );
	return implode( '&#58;', $pieces );
}
647
/**
 * Parse a partial tag string into an associative array mapping
 * attribute names to values. Names are forced to lowercase and
 * character references in the values are decoded to UTF-8 text.
 * When a name occurs more than once, the last occurrence wins.
 *
 * @param string $text
 * @return array
 */
function decodeTagAttributes( $text ) {
	$decoded = array();

	if( trim( $text ) != '' ) {
		$matches = array();
		$found = preg_match_all(
			MW_ATTRIBS_REGEX,
			$text,
			$matches,
			PREG_SET_ORDER );
		if( $found ) {
			foreach( $matches as $match ) {
				$name = strtolower( $match[1] );
				$raw = Sanitizer::getTagAttributeCallback( $match );
				$decoded[$name] = Sanitizer::decodeCharReferences( $raw );
			}
		}
	}

	return $decoded;
}
679
/**
 * Pick the attribute value out of one match set produced by
 * MW_ATTRIBS_REGEX.
 *
 * @param array $set
 * @return string
 * @private
 */
function getTagAttributeCallback( $set ) {
	# The value is in the highest-numbered group that is set:
	# 6 = illegal #XXXXXX color with no quotes, 5 = no quotes,
	# 4 = single-quoted, 3 = double-quoted.
	for( $group = 6; $group >= 3; $group-- ) {
		if( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if( isset( $set[2] ) ) {
		# An "= value" part without any value group
		wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
	} else {
		# In XHTML, attributes must have a value.
		# For 'reduced' form, return explicitly the attribute name here.
		return $set[1];
	}
}
709
/**
 * Normalize character references and whitespace in XML source-encoded
 * text meant for an attribute value.
 *
 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background. Note
 * that the result is not the attribute's value but an XML source
 * fragment that will be slapped into output.
 *
 * @param string $text
 * @return string
 * @private
 */
function normalizeAttributeValue( $text ) {
	$references = Sanitizer::normalizeCharReferences( $text );

	# A CR+LF pair, or any single space, CR, LF or tab, becomes one space
	$spaced = preg_replace(
		'/\r\n|[\x20\x0d\x0a\x09]/',
		' ',
		$references );

	return str_replace( '"', '&quot;', $spaced );
}
729
/**
 * Make sure all entities and character references in the text are
 * legal for XML, and XHTML specifically. Any stray bits are
 * &amp;-escaped so that the result is a valid text fragment.
 *
 * a. any named char refs must be known in XHTML
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use &#x, not &#X
 * d. fix or reject non-valid attributes
 *
 * @param string $text
 * @return string
 * @private
 */
function normalizeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $handler, $text );
}
/**
 * Regex replace callback for normalizeCharReferences.
 *
 * Dispatches on the first non-empty capture group of MW_CHAR_REFS_REGEX
 * (named, decimal, hex with "x", hex with "X"). When no group matched,
 * or the reference was rejected, the matched text is HTML-escaped.
 *
 * @param array $matches
 * @return string
 */
function normalizeCharReferencesCallback( $matches ) {
	if( $matches[1] != '' ) {
		$normalized = Sanitizer::normalizeEntity( $matches[1] );
	} elseif( $matches[2] != '' ) {
		$normalized = Sanitizer::decCharReference( $matches[2] );
	} elseif( $matches[3] != '' ) {
		$normalized = Sanitizer::hexCharReference( $matches[3] );
	} elseif( $matches[4] != '' ) {
		$normalized = Sanitizer::hexCharReference( $matches[4] );
	} else {
		$normalized = null;
	}

	return is_null( $normalized )
		? htmlspecialchars( $matches[0] )
		: $normalized;
}
771
/**
 * Return the entity reference unchanged when the name is listed in
 * $wgHtmlEntities (the HTML 4.0/XHTML 1.0 entities); otherwise return
 * the HTML-escaped source of the pseudo-entity (eg &amp;foo;).
 *
 * @param string $name
 * @return string
 */
function normalizeEntity( $name ) {
	global $wgHtmlEntities;
	$lead = isset( $wgHtmlEntities[$name] ) ? '&' : '&amp;';
	return $lead . $name . ';';
}
788
/**
 * Build a decimal character reference from the digits of one.
 *
 * @param string $codepoint decimal digits
 * @return string "&#N;", or null when validateCodepoint rejects the
 *                code point
 */
function decCharReference( $codepoint ) {
	$point = intval( $codepoint );
	if( !Sanitizer::validateCodepoint( $point ) ) {
		return null;
	}
	return sprintf( '&#%d;', $point );
}
797
/**
 * Build a hexadecimal character reference, written with a lowercase
 * "x" and lowercase digits, from the hex digits of one.
 *
 * @param string $codepoint hexadecimal digits
 * @return string "&#xN;", or null when validateCodepoint rejects the
 *                code point
 */
function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );
	if( !Sanitizer::validateCodepoint( $point ) ) {
		return null;
	}
	return sprintf( '&#x%x;', $point );
}
806
/**
 * Tell whether a Unicode codepoint is a valid character in XML.
 *
 * Accepted: tab, line feed, carriage return, 0x20-0xD7FF,
 * 0xE000-0xFFFD and 0x10000-0x10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint ) {
	# The three permitted control characters
	if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}
	# Up to, but excluding, the surrogate range
	if( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
		return true;
	}
	# From after the surrogates up to 0xFFFD
	if( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}
	# Everything from 0x10000 to the top of the Unicode range
	return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}
820
/**
 * Decode all character references in the text -- numeric ones as well
 * as named entities -- and return the result as a UTF-8 string.
 *
 * @param string $text
 * @return string
 * @public
 */
function decodeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'decodeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $handler, $text );
}
835
/**
 * Regex replace callback for decodeCharReferences.
 *
 * Dispatches on the first non-empty capture group of MW_CHAR_REFS_REGEX
 * (named, decimal, hex with "x", hex with "X"). When none of them
 * matched, the matched text is returned unchanged.
 *
 * @param array $matches
 * @return string
 */
function decodeCharReferencesCallback( $matches ) {
	# Last case should be an ampersand by itself
	$decoded = $matches[0];

	if( $matches[1] != '' ) {
		$decoded = Sanitizer::decodeEntity( $matches[1] );
	} elseif( $matches[2] != '' ) {
		$decoded = Sanitizer::decodeChar( intval( $matches[2] ) );
	} elseif( $matches[3] != '' ) {
		$decoded = Sanitizer::decodeChar( hexdec( $matches[3] ) );
	} elseif( $matches[4] != '' ) {
		$decoded = Sanitizer::decodeChar( hexdec( $matches[4] ) );
	}

	return $decoded;
}
853
/**
 * Convert a codepoint to its UTF-8 string when validateCodepoint
 * accepts it; otherwise return UTF8_REPLACEMENT (U+FFFD REPLACEMENT
 * CHARACTER).
 *
 * @param int $codepoint
 * @return string
 * @private
 */
function decodeChar( $codepoint ) {
	return Sanitizer::validateCodepoint( $codepoint )
		? codepointToUtf8( $codepoint )
		: UTF8_REPLACEMENT;
}
868
/**
 * Return the UTF-8 encoding of a named entity listed in
 * $wgHtmlEntities (the HTML 4.0/XHTML 1.0 entities). For any other
 * name, return the pseudo-entity source (eg &foo;).
 *
 * @param string $name
 * @return string
 */
function decodeEntity( $name ) {
	global $wgHtmlEntities;
	if( !isset( $wgHtmlEntities[$name] ) ) {
		# Unknown name: hand the reference back as it was written
		return "&$name;";
	}
	return codepointToUtf8( $wgHtmlEntities[$name] );
}
885
/**
 * Fetch the whitelist of acceptable attributes for a given element
 * name. The full table is built once, on first use, and kept for the
 * rest of the request.
 *
 * @param string $element
 * @return array attribute names; empty for an element without a list
 */
function attributeWhitelist( $element ) {
	static $table = null;
	if( is_null( $table ) ) {
		$table = Sanitizer::setupAttributeWhitelist();
	}
	if( isset( $table[$element] ) ) {
		return $table[$element];
	}
	return array();
}
902
/**
 * Build the table of attributes accepted on each element.
 *
 * @return array map of lowercase element name to a list of the
 *               attribute names allowed on that element; no list
 *               contains the same name twice
 */
function setupAttributeWhitelist() {
	# Attributes accepted on (nearly) every element
	$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
	$block = array_merge( $common, array( 'align' ) );
	$tablealign = array( 'align', 'char', 'charoff', 'valign' );
	$tablecell = array( 'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor' # deprecated
	);

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	$whitelist = array (
		# 7.5.4
		'div' => $block,
		'center' => $common, # deprecated
		'span' => $block, # ??

		# 7.5.5
		'h1' => $block,
		'h2' => $block,
		'h3' => $block,
		'h4' => $block,
		'h5' => $block,
		'h6' => $block,

		# 7.5.6
		# address

		# 8.2.4
		# bdo

		# 9.2.1
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		# dfn
		'code' => $common,
		# samp
		# kbd
		'var' => $common,
		# abbr
		# acronym

		# 9.2.2
		'blockquote' => array_merge( $common, array( 'cite' ) ),
		# q

		# 9.2.3
		'sub' => $common,
		'sup' => $common,

		# 9.3.1
		'p' => $block,

		# 9.3.2
		'br' => array( 'id', 'class', 'title', 'style', 'clear' ),

		# 9.3.4
		'pre' => array_merge( $common, array( 'width' ) ),

		# 9.4
		'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),
		'del' => array_merge( $common, array( 'cite', 'datetime' ) ),

		# 10.2
		'ul' => array_merge( $common, array( 'type' ) ),
		'ol' => array_merge( $common, array( 'type', 'start' ) ),
		'li' => array_merge( $common, array( 'type', 'value' ) ),

		# 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		# 11.2.1
		# Each attribute is listed exactly once.
		'table' => array_merge( $common,
			array( 'summary', 'width', 'border', 'frame',
				'rules', 'cellspacing', 'cellpadding',
				'align', 'bgcolor' ) ),

		# 11.2.2
		'caption' => array_merge( $common, array( 'align' ) ),

		# 11.2.3
		'thead' => array_merge( $common, $tablealign ),
		'tfoot' => array_merge( $common, $tablealign ),
		'tbody' => array_merge( $common, $tablealign ),

		# 11.2.4
		'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
		'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ),

		# 11.2.5
		'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),

		# 11.2.6
		'td' => array_merge( $common, $tablecell, $tablealign ),
		'th' => array_merge( $common, $tablecell, $tablealign ),

		# 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		# 15.2.2
		'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
		# basefont

		# 15.3
		'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

		# XHTML Ruby annotation text module, simple ruby only.
		# http://www.w3c.org/TR/ruby/
		'ruby' => $common,
		# rbc
		# rtc
		'rb' => $common,
		'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
		'rp' => $common,
	);
	return $whitelist;
}
1041
/**
 * Take a fragment of (potentially invalid) HTML and return
 * a version with any tags removed, encoded suitably for literal
 * inclusion in an attribute value.
 *
 * @param string $text HTML fragment
 * @return string
 */
function stripAllTags( $text ) {
	# Actual <tags>. The "s" modifier lets "." match line breaks, so a
	# tag whose attributes wrap over several lines is removed as well
	# instead of surviving as escaped text.
	$text = preg_replace( '/ < .*? > /xs', '', $text );

	# Normalize &entities and whitespace
	$text = Sanitizer::normalizeAttributeValue( $text );

	# Will be placed into "double-quoted" attributes,
	# make sure remaining bits are safe.
	$text = str_replace(
		array('<', '>', '"'),
		array('&lt;', '&gt;', '&quot;'),
		$text );

	return $text;
}
1066
/**
 * Hack up a private DOCTYPE that declares every entity listed in
 * $wgHtmlEntities. PHP 4 seemed to know these if you gave it an HTML
 * doctype, but PHP 5.1 doesn't.
 *
 * Use for passing XHTML fragments to PHP's XML parsing functions
 *
 * @return string
 * @static
 */
function hackDocType() {
	global $wgHtmlEntities;

	# One <!ENTITY> declaration per entity, in table order
	$declarations = '';
	foreach( array_keys( $wgHtmlEntities ) as $entity ) {
		$codepoint = $wgHtmlEntities[$entity];
		$declarations .= '<!ENTITY ' . $entity . ' "&#' . $codepoint . ';">';
	}

	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1086
1087 }
1088
1089 ?>