typo
[lhc/web/wiklou.git] / includes / Sanitizer.php
1 <?php
2
3 /**
4 * (X)HTML sanitizer for MediaWiki
5 *
6 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
7 * http://www.mediawiki.org/
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22 * http://www.gnu.org/copyleft/gpl.html
23 *
24 * @package MediaWiki
25 */
26
27 class Sanitizer {
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. Every fragment that parses as a tag whose
 * name is on the allowed list is re-emitted with its attributes passed
 * through Sanitizer::fixTagAttributes(); every other fragment has its
 * '<' and '>' escaped so it shows up as plain text.
 *
 * Globals read:
 * - $wgUserHtml: when false the allowed-tag lists are empty, so every
 *   tag is escaped.
 * - $wgUseTidy: when false, tag nesting is tracked on a stack, badly
 *   nested tags are escaped and still-open tags are closed at the end.
 *   When true only the tag-name and attribute filtering is applied.
 *
 * @param string $text
 * @return string
 * @access private
 */
function removeHTMLtags( $text ) {
	global $wgUseTidy, $wgUserHtml;
	$fname = 'Sanitizer::removeHTMLtags';
	wfProfileIn( $fname );

	if( $wgUserHtml ) {
		$htmlpairs = array( # Tags that must be closed
			'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
		);
		$htmlsingle = array(
			'br', 'hr', 'li', 'dt', 'dd'
		);
		$htmlnest = array( # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
		);
		$tabletags = array( # Can only appear inside table
			'td', 'th', 'tr'
		);
	} else {
		$htmlpairs = array();
		$htmlsingle = array();
		$htmlnest = array();
		$tabletags = array();
	}

	# Table cell/row tags are treated as "single" tags: they are never
	# pushed on the tag stack and need no matching close tag.
	$htmlsingle = array_merge( $tabletags, $htmlsingle );
	$htmlelements = array_merge( $htmlsingle, $htmlpairs );

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );

	# Groups: 1 = optional leading slash, 2 = tag name, 3 = raw attributes,
	# 4 = closing brace (with optional slash), 5 = text up to the next '<'.
	$tagPattern = '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/';

	$bits = explode( '<', $text );
	# Everything before the first '<' is plain text.
	$text = array_shift( $bits );
	if( !$wgUseTidy ) {
		$tagstack = array();
		# Saved outer tag stacks, one per currently open <table>
		$tablestack = array();
		foreach ( $bits as $x ) {
			if ( !preg_match( $tagPattern, $x, $regs ) ) {
				# Not tag-shaped at all: show it as literal text.
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
				continue;
			}
			$slash = $regs[1];
			$t = strtolower( $regs[2] );
			$params = $regs[3];
			$brace = $regs[4];
			$rest = $regs[5];

			$badtag = 0;
			if ( in_array( $t, $htmlelements ) ) {
				# Check our stack
				if ( $slash ) {
					# Closing a tag...
					if ( !in_array( $t, $htmlsingle ) &&
					  ( $ot = array_pop( $tagstack ) ) != $t ) {
						# Not the innermost open tag: restore the stack
						# (nothing to restore if it was empty) and reject.
						if ( $ot !== null ) {
							array_push( $tagstack, $ot );
						}
						$badtag = 1;
					} else {
						if ( $t == 'table' ) {
							# Back to the stack that was active outside the table
							$tagstack = array_pop( $tablestack );
						}
						$newparams = '';
					}
				} else {
					# Keep track for later
					if ( in_array( $t, $tabletags ) &&
					  !in_array( 'table', $tagstack ) ) {
						$badtag = 1;
					} elseif ( in_array( $t, $tagstack ) &&
					  !in_array( $t, $htmlnest ) ) {
						$badtag = 1;
					} elseif ( !in_array( $t, $htmlsingle ) ) {
						if ( $t == 'table' ) {
							# Each table gets a fresh stack of its own
							array_push( $tablestack, $tagstack );
							$tagstack = array();
						}
						array_push( $tagstack, $t );
					}
					# Strip non-approved attributes from the tag
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
				}
				if ( !$badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$text .= "<$slash$t$newparams$brace$rest";
					continue;
				}
			}
			# Unknown or badly nested tag: escape it
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) {
				$tagstack = array_pop( $tablestack );
			}
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			if ( preg_match( $tagPattern, $x, $regs )
			  && in_array( $t = strtolower( $regs[2] ), $htmlelements ) ) {
				$newparams = Sanitizer::fixTagAttributes( $regs[3], $t );
				$rest = str_replace( '>', '&gt;', $regs[5] );
				$text .= "<{$regs[1]}$t$newparams{$regs[4]}$rest";
			} else {
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
			}
		}
	}
	wfProfileOut( $fname );
	return $text;
}
144
/**
 * Remove '<!--', '-->', and everything between.
 * To avoid leaving blank lines, when a comment is both preceded
 * and followed by a newline (ignoring spaces), trim leading and
 * trailing spaces and one of the newlines.
 *
 * An unterminated comment (no closing '-->') is left in the text.
 * The search restarts from the beginning of the text after every
 * removal, so a comment opener formed by joining the text on either
 * side of a removed comment is removed as well.
 *
 * @param string $text
 * @return string
 * @access private
 */
function removeHTMLcomments( $text ) {
	$fname = 'Sanitizer::removeHTMLcomments';
	wfProfileIn( $fname );
	while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
		$end = strpos( $text, '-->', $start + 4 );
		if ( $end === false ) {
			# Unterminated comment; bail out
			break;
		}

		# Point just past the closing '-->'
		$end += 3;

		# Trim space and newline if the comment is both
		# preceded and followed by a newline
		$spaceStart = max( $start - 1, 0 );
		$spaceLen = $end - $spaceStart;
		# Walk left over spaces in front of the comment...
		while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
			$spaceStart--;
			$spaceLen++;
		}
		# ...and right over spaces behind it.
		while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
			$spaceLen++;
		}
		if ( substr( $text, $spaceStart, 1 ) === "\n"
		  && substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			$text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
		} else {
			# Remove just the comment.
			$text = substr_replace( $text, '', $start, $end - $start );
		}
	}
	wfProfileOut( $fname );
	return $text;
}
188
/**
 * Take a tag soup fragment listing an HTML element's attributes
 * and normalize it to well-formed XML, discarding unwanted attributes.
 *
 * - Normalizes attribute names to lowercase
 * - Discards attributes not on a whitelist for the given element
 * - Turns broken or invalid entities into plaintext
 * - Double-quotes all attribute values
 * - Attributes without values are given the name as attribute
 * - Double attributes are discarded (the first occurrence wins)
 * - Unsafe style attributes are discarded
 * - Prepends space if there are attributes.
 *
 * @param string $text    Raw attribute text from inside the tag
 * @param string $element Lowercase element name, used to pick the whitelist
 * @return string Empty string, or the attributes with one leading space
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
function fixTagAttributes( $text, $element ) {
	if( trim( $text ) == '' ) {
		return '';
	}

	$attrib = '[A-Za-z0-9]'; #FIXME
	$space = '[\x09\x0a\x0d\x20]';
	if( !preg_match_all(
		"/(?:^|$space)($attrib+)
		  ($space*=$space*
			(?:
			 # The attribute value: quoted or alone
			  \"([^<\"]*)\"
			 | '([^<']*)'
			 |  ([a-zA-Z0-9._:-]+)
			 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
			                     # colors are specified like this.
			                     # We'll be normalizing it.
			)
		   )?(?=$space|\$)/sx",
		$text,
		$pairs,
		PREG_SET_ORDER ) ) {
		return '';
	}

	$whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
	$attribs = array();
	foreach( $pairs as $set ) {
		$attribute = strtolower( $set[1] );
		if( !isset( $whitelist[$attribute] ) ) {
			continue;
		}

		# With PREG_SET_ORDER each match holds entries only up to the last
		# group that took part in it, so the highest index present says
		# which value form was used. Checking presence rather than
		# non-emptiness lets empty quoted values such as class="" through
		# instead of falling into the "conditions not met" branch.
		if( isset( $set[6] ) ) {
			# Illegal #XXXXXX color with no quotes.
			$value = Sanitizer::normalizeAttributeValue( $set[6] );
		} elseif( isset( $set[5] ) ) {
			# No quotes.
			$value = Sanitizer::normalizeAttributeValue( $set[5] );
		} elseif( isset( $set[4] ) ) {
			# Single-quoted; the output is double-quoted, so any literal
			# double quote inside has to be escaped.
			$value = str_replace( '"', '&quot;',
				Sanitizer::normalizeAttributeValue( $set[4] ) );
		} elseif( isset( $set[3] ) ) {
			# Double-quoted
			$value = Sanitizer::normalizeAttributeValue( $set[3] );
		} elseif( !isset( $set[2] ) ) {
			# In XHTML, attributes must have a value.
			$value = $set[1];
		} else {
			wfDebugDieBacktrace( "Tag conditions not met. Something's very odd." );
		}

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if( $attribute == 'style' && preg_match(
			'/(expression|tps*:\/\/|url\\s*\().*/is',
				wfMungeToUtf8( $value ) ) ) {
			# haxx0r
			continue;
		}

		# Keep only the first occurrence of each attribute
		if( !isset( $attribs[$attribute] ) ) {
			$attribs[$attribute] = "$attribute=\"$value\"";
		}
	}
	if( empty( $attribs ) ) {
		return '';
	} else {
		return ' ' . implode( ' ', $attribs );
	}
}
281
/**
 * Prepare XML source-encoded text for use as an attribute value:
 * character references are normalized first, then every whitespace
 * character (a CR LF pair counts as one) becomes a single space.
 *
 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 * but note that the result is not the attribute's value; it is an
 * XML source fragment meant to be slapped into output.
 *
 * @param string $text
 * @return string
 * @access private
 */
function normalizeAttributeValue( $text ) {
	$withCleanReferences = Sanitizer::normalizeCharReferences( $text );
	$anyWhitespace = '/\r\n|[\x20\x0d\x0a\x09]/';
	return preg_replace( $anyWhitespace, ' ', $withCleanReferences );
}
300
/**
 * Make sure every entity and character reference in the text is
 * legal for XML, and XHTML specifically. Anything that is not gets
 * its ampersand &amp;-escaped, so the result is a valid text fragment.
 *
 * a. any named char refs must be known in XHTML
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use &#x, not &#X
 * d. fix or reject non-valid attributes
 *
 * The per-match work is done by normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 * @access private
 */
function normalizeCharReferences( $text ) {
	# Alternatives, in order: named entity, decimal reference,
	# hex reference with lowercase x, hex reference with uppercase X,
	# and finally any other ampersand.
	$referencePattern = '/&([A-Za-z0-9]+);
		 |&\#([0-9]+);
		 |&\#x([0-9A-Za-z]+);
		 |&\#X([0-9A-Za-z]+);
		 |(&)/x';
	$handler = array( 'Sanitizer', 'normalizeCharReferencesCallback' );

	return preg_replace_callback( $referencePattern, $handler, $text );
}
325
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * Dispatches on which capture group is non-empty: 1 = named entity,
 * 2 = decimal reference, 3 and 4 = hexadecimal reference. When no
 * group applies, or the chosen helper returns null, the matched
 * text is returned HTML-escaped.
 *
 * @param array $matches Match array supplied by preg_replace_callback()
 * @return string
 */
function normalizeCharReferencesCallback( $matches ) {
	if( $matches[1] != '' ) {
		$normalized = Sanitizer::normalizeEntity( $matches[1] );
	} elseif( $matches[2] != '' ) {
		$normalized = Sanitizer::decCharReference( $matches[2] );
	} elseif( $matches[3] != '' ) {
		$normalized = Sanitizer::hexCharReference( $matches[3] );
	} elseif( $matches[4] != '' ) {
		$normalized = Sanitizer::hexCharReference( $matches[4] );
	} else {
		$normalized = null;
	}

	return is_null( $normalized )
		? htmlspecialchars( $matches[0] )
		: $normalized;
}
343
/**
 * Pass a known named entity through, escape an unknown one.
 *
 * If $name is in the table below, the reference "&name;" is returned
 * unchanged. Otherwise the ampersand is escaped, giving the literal
 * text of the pseudo-entity (eg &amp;foo;). Names are case-sensitive.
 *
 * @param string $name Entity name without the leading '&' and trailing ';'
 * @return string
 */
function normalizeEntity( $name ) {
	# Lookup table keyed by entity name, built once on first use.
	# List of all named character entities defined in HTML 4.01
	# http://www.w3.org/TR/html4/sgml/entities.html
	static $knownEntities = null;
	if( is_null( $knownEntities ) ) {
		$knownEntities = array_flip( array(
			'aacute', 'Aacute', 'acirc', 'Acirc', 'acute', 'aelig', 'AElig',
			'agrave', 'Agrave', 'alefsym', 'alpha', 'Alpha', 'amp', 'and',
			'ang', 'apos', 'aring', 'Aring', 'asymp', 'atilde', 'Atilde',
			'auml', 'Auml',
			'bdquo', 'beta', 'Beta', 'brvbar', 'bull',
			'cap', 'ccedil', 'Ccedil', 'cedil', 'cent', 'chi', 'Chi', 'circ',
			'clubs', 'cong', 'copy', 'crarr', 'cup', 'curren',
			'dagger', 'Dagger', 'darr', 'dArr', 'deg', 'delta', 'Delta',
			'diams', 'divide',
			'eacute', 'Eacute', 'ecirc', 'Ecirc', 'egrave', 'Egrave', 'empty',
			'emsp', 'ensp', 'epsilon', 'Epsilon', 'equiv', 'eta', 'Eta',
			'eth', 'ETH', 'euml', 'Euml', 'euro', 'exist',
			'fnof', 'forall', 'frac12', 'frac14', 'frac34', 'frasl',
			'gamma', 'Gamma', 'ge', 'gt',
			'harr', 'hArr', 'hearts', 'hellip',
			'iacute', 'Iacute', 'icirc', 'Icirc', 'iexcl', 'igrave', 'Igrave',
			'image', 'infin', 'int', 'iota', 'Iota', 'iquest', 'isin',
			'iuml', 'Iuml',
			'kappa', 'Kappa',
			'lambda', 'Lambda', 'lang', 'laquo', 'larr', 'lArr', 'lceil',
			'ldquo', 'le', 'lfloor', 'lowast', 'loz', 'lrm', 'lsaquo',
			'lsquo', 'lt',
			'macr', 'mdash', 'micro', 'middot', 'minus', 'mu', 'Mu',
			'nabla', 'nbsp', 'ndash', 'ne', 'ni', 'not', 'notin', 'nsub',
			'ntilde', 'Ntilde', 'nu', 'Nu',
			'oacute', 'Oacute', 'ocirc', 'Ocirc', 'oelig', 'OElig', 'ograve',
			'Ograve', 'oline', 'omega', 'Omega', 'omicron', 'Omicron',
			'oplus', 'or', 'ordf', 'ordm', 'oslash', 'Oslash', 'otilde',
			'Otilde', 'otimes', 'ouml', 'Ouml',
			'para', 'part', 'permil', 'perp', 'phi', 'Phi', 'pi', 'Pi', 'piv',
			'plusmn', 'pound', 'prime', 'Prime', 'prod', 'prop', 'psi', 'Psi',
			'quot',
			'radic', 'rang', 'raquo', 'rarr', 'rArr', 'rceil', 'rdquo', 'real',
			'reg', 'rfloor', 'rho', 'Rho', 'rlm', 'rsaquo', 'rsquo',
			'sbquo', 'scaron', 'Scaron', 'sdot', 'sect', 'shy', 'sigma',
			'Sigma', 'sigmaf', 'sim', 'spades', 'sub', 'sube', 'sum', 'sup',
			'sup1', 'sup2', 'sup3', 'supe', 'szlig',
			'tau', 'Tau', 'there4', 'theta', 'Theta', 'thetasym', 'thinsp',
			'thorn', 'THORN', 'tilde', 'times', 'trade',
			'uacute', 'Uacute', 'uarr', 'uArr', 'ucirc', 'Ucirc', 'ugrave',
			'Ugrave', 'uml', 'upsih', 'upsilon', 'Upsilon', 'uuml', 'Uuml',
			'weierp',
			'xi', 'Xi',
			'yacute', 'Yacute', 'yen', 'yuml', 'Yuml',
			'zeta', 'Zeta', 'zwj', 'zwnj'
		) );
	}

	if( !isset( $knownEntities[$name] ) ) {
		return "&amp;$name;";
	}
	return "&$name;";
}
614
/**
 * Normalize a decimal numeric character reference.
 *
 * @param string $codepoint The digits found between '&#' and ';'
 * @return string|null The reference rewritten as "&#N;", or null when
 *   Sanitizer::validateCodepoint() rejects the code point
 */
function decCharReference( $codepoint ) {
	$point = intval( $codepoint );
	if( !Sanitizer::validateCodepoint( $point ) ) {
		return null;
	}
	return sprintf( '&#%d;', $point );
}
623
/**
 * Normalize a hexadecimal numeric character reference.
 *
 * The digit string must consist of hexadecimal digits only. hexdec()
 * skips any other character, so without this check an input such as
 * "4G1" would be read as 0x41 and turned into a valid reference.
 *
 * @param string $codepoint The characters found between '&#x' (or
 *   '&#X') and ';'
 * @return string|null The reference rewritten as "&#xN;" with lowercase
 *   hex digits, or null when the digits are malformed or
 *   Sanitizer::validateCodepoint() rejects the code point
 */
function hexCharReference( $codepoint ) {
	if( !preg_match( '/^[0-9A-Fa-f]+\z/', $codepoint ) ) {
		return null;
	}
	$point = hexdec( $codepoint );
	if( Sanitizer::validateCodepoint( $point ) ) {
		return sprintf( '&#x%x;', $point );
	} else {
		return null;
	}
}
632
/**
 * Tell whether a Unicode code point is accepted as an XML character.
 *
 * Accepted: tab (0x09), line feed (0x0a), carriage return (0x0d) and
 * the ranges 0x20-0xd7ff, 0xe000-0xfffd and 0x10000-0x10ffff.
 *
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint ) {
	# The only three code points below 0x20 that are let through
	if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}
	if( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
		return true;
	}
	if( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}
	return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}
646
/**
 * Look up the attributes that are acceptable on a given element.
 *
 * @param string $element Element name, as keyed in
 *   setupAttributeWhitelist()
 * @return array List of attribute names; empty when the element has
 *   no whitelist entry
 */
function attributeWhitelist( $element ) {
	$perElement = Sanitizer::setupAttributeWhitelist();
	if( isset( $perElement[$element] ) ) {
		return $perElement[$element];
	}
	return array();
}
660
/**
 * Build the table of permitted attributes per element.
 *
 * The table is rebuilt on every call. Each attribute name appears
 * at most once in an element's list.
 *
 * @return array Map of lowercase element name => list of permitted
 *   attribute names
 */
function setupAttributeWhitelist() {
	# Building blocks shared by several elements
	$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
	$block = array_merge( $common, array( 'align' ) );
	$tablealign = array( 'align', 'char', 'charoff', 'valign' );
	$tablecell = array( 'abbr',
	                    'axis',
	                    'headers',
	                    'scope',
	                    'rowspan',
	                    'colspan',
	                    'nowrap', # deprecated
	                    'width',  # deprecated
	                    'height'  # deprecated
	                    );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	$whitelist = array (
		# 7.5.4
		'div'        => $block,
		'center'     => $common, # deprecated
		'span'       => $block, # ??

		# 7.5.5
		'h1'         => $block,
		'h2'         => $block,
		'h3'         => $block,
		'h4'         => $block,
		'h5'         => $block,
		'h6'         => $block,

		# 7.5.6
		# address

		# 8.2.4
		# bdo

		# 9.2.1
		'em'         => $common,
		'strong'     => $common,
		'cite'       => $common,
		# dfn
		'code'       => $common,
		# samp
		# kbd
		'var'        => $common,
		# abbr
		# acronym

		# 9.2.2
		'blockquote' => array_merge( $common, array( 'cite' ) ),
		# q

		# 9.2.3
		'sub'        => $common,
		'sup'        => $common,

		# 9.3.1
		'p'          => $block,

		# 9.3.2
		'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

		# 9.3.4
		'pre'        => array_merge( $common, array( 'width' ) ),

		# 9.4
		'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
		'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),

		# 10.2
		'ul'         => array_merge( $common, array( 'type' ) ),
		'ol'         => array_merge( $common, array( 'type', 'start' ) ),
		'li'         => array_merge( $common, array( 'type', 'value' ) ),

		# 10.3
		'dl'         => $common,
		'dd'         => $common,
		'dt'         => $common,

		# 11.2.1
		'table'      => array_merge( $common,
							array( 'summary', 'width', 'border', 'frame',
									'rules', 'cellspacing', 'cellpadding',
									'align', 'bgcolor' ) ),

		# 11.2.2
		'caption'    => array_merge( $common, array( 'align' ) ),

		# 11.2.3
		'thead'      => array_merge( $common, $tablealign ),
		'tfoot'      => array_merge( $common, $tablealign ),
		'tbody'      => array_merge( $common, $tablealign ),

		# 11.2.4
		'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
		'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),

		# 11.2.5
		'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

		# 11.2.6
		'td'         => array_merge( $common, $tablecell, $tablealign ),
		'th'         => array_merge( $common, $tablecell, $tablealign ),

		# 15.2.1
		'tt'         => $common,
		'b'          => $common,
		'i'          => $common,
		'big'        => $common,
		'small'      => $common,
		'strike'     => $common,
		's'          => $common,
		'u'          => $common,

		# 15.2.2
		'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
		# basefont

		# 15.3
		'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

		# XHTML Ruby annotation text module, simple ruby only.
		# http://www.w3c.org/TR/ruby/
		'ruby'       => $common,
		# rbc
		# rtc
		'rb'         => $common,
		'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
		'rp'         => $common,
		);
	return $whitelist;
}
798
/**
 * Reduce a fragment of (potentially invalid) HTML to text that can be
 * included literally in a double-quoted attribute value: tags are
 * removed, entities and whitespace are normalized, and any remaining
 * '<', '>' and '"' are escaped.
 *
 * @param string $text HTML fragment
 * @return string
 */
function stripAllTags( $text ) {
	# Actual <tags>
	$withoutTags = preg_replace( '/<[^>]*>/', '', $text );

	# Normalize &entities and whitespace
	$normalized = Sanitizer::normalizeAttributeValue( $withoutTags );

	# Will be placed into "double-quoted" attributes,
	# make sure remaining bits are safe.
	$unsafe = array('<', '>', '"');
	$escaped = array('&lt;', '&gt;', '&quot;');

	return str_replace( $unsafe, $escaped, $normalized );
}
823
824 }
825
826 ?>