3 * XHTML sanitizer for MediaWiki
5 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
6 * http://www.mediawiki.org/
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
28 * Regular expression to match various types of character references in
29 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
31 define( 'MW_CHAR_REFS_REGEX',
39 * Regular expression to match HTML/XML attribute pairs within a tag.
40 * Allows some... latitude.
41 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
43 $attrib = '[A-Za-z0-9]';
44 $space = '[\x09\x0a\x0d\x20]';
45 define( 'MW_ATTRIBS_REGEX',
46 "/(?:^|$space)($attrib+)
49 # The attribute value: quoted or alone
52 | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
53 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
54 # colors are specified like this.
55 # We'll be normalizing it.
57 )?(?=$space|\$)/sx" );
60 * List of all named character entities defined in HTML 4.01
61 * http://www.w3.org/TR/html4/sgml/entities.html
64 global $wgHtmlEntities;
65 $wgHtmlEntities = array(
319 /** @package MediaWiki */
322 * Cleans up HTML, removes dangerous tags and attributes, and
323 * removes HTML comments
325 * @param string $text
326 * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
327 * @param array $args for the processing callback
330 function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
331 global $wgUseTidy, $wgUserHtml;
332 $fname = 'Parser::removeHTMLtags';
333 wfProfileIn( $fname );
336 $htmlpairs = array( # Tags that must be closed
337 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
338 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
339 'strike', 'strong', 'tt', 'var', 'div', 'center',
340 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
341 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
344 'br', 'hr', 'li', 'dt', 'dd'
346 $htmlsingleonly = array( # Elements that cannot have close tags
349 $htmlnest = array( # Tags that can be nested--??
350 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
351 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
353 $tabletags = array( # Can only appear inside table
357 $htmlpairs = array();
358 $htmlsingle = array();
360 $tabletags = array();
363 $htmlsingle = array_merge( $tabletags, $htmlsingle );
364 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
366 # Remove HTML comments
367 $text = Sanitizer
::removeHTMLcomments( $text );
369 $bits = explode( '<', $text );
370 $text = array_shift( $bits );
372 $tagstack = array(); $tablestack = array();
373 foreach ( $bits as $x ) {
374 $prev = error_reporting( E_ALL
& ~
( E_NOTICE | E_WARNING
) );
375 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
377 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
378 error_reporting( $prev );
381 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
385 if( in_array( $t, $htmlsingleonly ) ) {
387 } elseif ( ( $ot = @array_pop
( $tagstack ) ) != $t ) {
388 @array_push
( $tagstack, $ot );
391 if ( $t == 'table' ) {
392 $tagstack = array_pop( $tablestack );
397 # Keep track for later
398 if ( in_array( $t, $tabletags ) &&
399 ! in_array( 'table', $tagstack ) ) {
401 } else if ( in_array( $t, $tagstack ) &&
402 ! in_array ( $t , $htmlnest ) ) {
404 } elseif( in_array( $t, $htmlsingleonly ) ) {
405 # Hack to force empty tag for uncloseable elements
407 } else if( in_array( $t, $htmlsingle ) ) {
408 # Hack to not close $htmlsingle tags
411 if ( $t == 'table' ) {
412 array_push( $tablestack, $tagstack );
415 array_push( $tagstack, $t );
418 # Replace any variables or template parameters with
420 if( is_callable( $processCallback ) ) {
421 call_user_func_array( $processCallback, array( &$params, $args ) );
424 # Strip non-approved attributes from the tag
425 $newparams = Sanitizer
::fixTagAttributes( $params, $t );
428 $rest = str_replace( '>', '>', $rest );
429 $close = ( $brace == '/>' ) ?
' /' : '';
430 $text .= "<$slash$t$newparams$close>$rest";
434 $text .= '<' . str_replace( '>', '>', $x);
436 # Close off any remaining tags
437 while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
439 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
442 # this might be possible using tidy itself
443 foreach ( $bits as $x ) {
444 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
446 @list
( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
447 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
448 if( is_callable( $processCallback ) ) {
449 call_user_func_array( $processCallback, array( &$params, $args ) );
451 $newparams = Sanitizer
::fixTagAttributes( $params, $t );
452 $rest = str_replace( '>', '>', $rest );
453 $text .= "<$slash$t$newparams$brace$rest";
455 $text .= '<' . str_replace( '>', '>', $x);
459 wfProfileOut( $fname );
464 * Remove '<!--', '-->', and everything between.
465 * To avoid leaving blank lines, when a comment is both preceded
466 * and followed by a newline (ignoring spaces), trim leading and
467 * trailing spaces and one of the newlines.
470 * @param string $text
473 function removeHTMLcomments( $text ) {
474 $fname='Parser::removeHTMLcomments';
475 wfProfileIn( $fname );
476 while (($start = strpos($text, '<!--')) !== false) {
477 $end = strpos($text, '-->', $start +
4);
478 if ($end === false) {
479 # Unterminated comment; bail out
485 # Trim space and newline if the comment is both
486 # preceded and followed by a newline
487 $spaceStart = max($start - 1, 0);
488 $spaceLen = $end - $spaceStart;
489 while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
493 while (substr($text, $spaceStart +
$spaceLen, 1) === ' ')
495 if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart +
$spaceLen, 1) === "\n") {
496 # Remove the comment, leading and trailing
497 # spaces, and leave only one newline.
498 $text = substr_replace($text, "\n", $spaceStart, $spaceLen +
1);
501 # Remove just the comment.
502 $text = substr_replace($text, '', $start, $end - $start);
505 wfProfileOut( $fname );
510 * Take a tag soup fragment listing an HTML element's attributes
511 * and normalize it to well-formed XML, discarding unwanted attributes.
513 * - Normalizes attribute names to lowercase
514 * - Discards attributes not on a whitelist for the given element
515 * - Turns broken or invalid entities into plaintext
516 * - Double-quotes all attribute values
517 * - Attributes without values are given the name as attribute
518 * - Double attributes are discarded
519 * - Unsafe style attributes are discarded
520 * - Prepends space if there are attributes.
522 * @param string $text
523 * @param string $element
526 * @todo Check for legal values where the DTD limits things.
527 * @todo Check for unique id attribute :P
529 function fixTagAttributes( $text, $element ) {
530 if( trim( $text ) == '' ) {
535 # Since we quote this later, this can be anything distinguishable
536 # from the end of the attribute
546 $whitelist = array_flip( Sanitizer
::attributeWhitelist( $element ) );
548 foreach( $pairs as $set ) {
549 $attribute = strtolower( $set[1] );
550 if( !isset( $whitelist[$attribute] ) ) {
554 $raw = Sanitizer
::getTagAttributeCallback( $set );
555 $value = Sanitizer
::normalizeAttributeValue( $raw );
557 # Strip javascript "expression" from stylesheets.
558 # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
559 if( $attribute == 'style' ) {
560 $stripped = Sanitizer
::decodeCharReferences( $value );
562 // Remove any comments; IE gets token splitting wrong
563 $stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );
564 $value = htmlspecialchars( $stripped );
566 // ... and continue checks
567 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
568 'codepointToUtf8(hexdec("$1"))', $stripped );
569 $stripped = str_replace( '\\', '', $stripped );
570 if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
577 if ( $attribute === 'id' )
578 $value = Sanitizer
::escapeId( $value );
580 # Templates and links may be expanded in later parsing,
581 # creating invalid or dangerous output. Suppress this.
582 $value = strtr( $value, array(
583 '<' => '<', // This should never happen,
584 '>' => '>', // we've received invalid input
585 '"' => '"', // which should have been escaped.
588 "''" => '''',
589 'ISBN' => 'ISBN',
591 'PMID' => 'PMID',
595 $value = preg_replace_callback(
596 '/(' . wfUrlProtocols() . ')/',
597 array( 'Sanitizer', 'armorLinksCallback' ),
600 // If this attribute was previously set, override it.
601 // Output should only have one attribute of each name.
602 $attribs[$attribute] = "$attribute=\"$value\"";
605 return count( $attribs ) ?
' ' . implode( ' ', $attribs ) : '';
609 * Given a value, escape it so that it can be used in an id attribute and
610 * return it. This does not validate the value, however (see first link).
612 * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
615 * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
624 function escapeId( $id ) {
625 static $replace = array(
630 $id = urlencode( Sanitizer
::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
632 return str_replace( array_keys( $replace ), array_values( $replace ), $id );
636 * Regex replace callback for armoring links against further processing.
637 * @param array $matches
/**
 * Regex replace callback that armors a matched URL protocol so later
 * parsing passes do not recognise it as the start of a link.
 *
 * Every colon in the captured text is replaced with the numeric character
 * reference &#58;, which renders identically but no longer looks like a
 * protocol separator. (Replacing ':' with a literal ':' would be a no-op
 * and leave the link unarmored.)
 *
 * @param array $matches preg match set; element 1 is the captured protocol
 * @return string the captured text with colons armored
 */
function armorLinksCallback( $matches ) {
	return str_replace( ':', '&#58;', $matches[1] );
}
646 * Return an associative array of attribute names and values from
647 * a partial tag string. Attribute names are forced to lowercase,
648 * character references are decoded to UTF-8 text.
653 function decodeTagAttributes( $text ) {
656 if( trim( $text ) == '' ) {
669 foreach( $pairs as $set ) {
670 $attribute = strtolower( $set[1] );
671 $value = Sanitizer
::getTagAttributeCallback( $set );
672 $attribs[$attribute] = Sanitizer
::decodeCharReferences( $value );
678 * Pick the appropriate attribute value from a match set from the
679 * MW_ATTRIBS_REGEX matches.
685 function getTagAttributeCallback( $set ) {
686 if( isset( $set[6] ) ) {
687 # Illegal #XXXXXX color with no quotes.
689 } elseif( isset( $set[5] ) ) {
692 } elseif( isset( $set[4] ) ) {
695 } elseif( isset( $set[3] ) ) {
698 } elseif( !isset( $set[2] ) ) {
699 # In XHTML, attributes must have a value.
700 # For 'reduced' form, return explicitly the attribute name here.
703 wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
708 * Normalize whitespace and character references in an XML source-
709 * encoded text for an attribute value.
711 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
712 * but note that we're not returning the value, but are returning
713 * XML source fragments that will be slapped into output.
715 * @param string $text
719 function normalizeAttributeValue( $text ) {
720 return str_replace( '"', '"',
722 '/\r\n|[\x20\x0d\x0a\x09]/',
724 Sanitizer
::normalizeCharReferences( $text ) ) );
728 * Ensure that any entities and character references are legal
729 * for XML and XHTML specifically. Any stray bits will be
730 * &-escaped to result in a valid text fragment.
732 * a. any named char refs must be known in XHTML
733 * b. any numeric char refs must be legal chars, not invalid or forbidden
734 * c. use &#x, not &#X
735 * d. fix or reject non-valid attributes
737 * @param string $text
741 function normalizeCharReferences( $text ) {
742 return preg_replace_callback(
744 array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
748 * @param array $matches
751 function normalizeCharReferencesCallback( $matches ) {
753 if( $matches[1] != '' ) {
754 $ret = Sanitizer
::normalizeEntity( $matches[1] );
755 } elseif( $matches[2] != '' ) {
756 $ret = Sanitizer
::decCharReference( $matches[2] );
757 } elseif( $matches[3] != '' ) {
758 $ret = Sanitizer
::hexCharReference( $matches[3] );
759 } elseif( $matches[4] != '' ) {
760 $ret = Sanitizer
::hexCharReference( $matches[4] );
762 if( is_null( $ret ) ) {
763 return htmlspecialchars( $matches[0] );
770 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
771 * return the named entity reference as is. Otherwise, returns
772 * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
774 * @param string $name
777 function normalizeEntity( $name ) {
778 global $wgHtmlEntities;
779 if( isset( $wgHtmlEntities[$name] ) ) {
782 return "&$name;";
/**
 * Normalize a decimal numeric character reference.
 *
 * @param string $codepoint the digits captured from a &#NNN; reference
 * @return string|null the canonical &#NNN; form when the code point passes
 *   Sanitizer::validateCodepoint(), otherwise null so the caller can
 *   escape the original text instead
 */
function decCharReference( $codepoint ) {
	$point = intval( $codepoint );
	// Guard clause: anything that is not a legal XML character is rejected.
	if( !Sanitizer::validateCodepoint( $point ) ) {
		return null;
	}
	return sprintf( '&#%d;', $point );
}
/**
 * Normalize a hexadecimal numeric character reference.
 *
 * @param string $codepoint the hex digits captured from a &#xNNN; reference
 * @return string|null the canonical lowercase &#xnnn; form when the code
 *   point passes Sanitizer::validateCodepoint(), otherwise null so the
 *   caller can escape the original text instead
 */
function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );
	// Guard clause: anything that is not a legal XML character is rejected.
	if( !Sanitizer::validateCodepoint( $point ) ) {
		return null;
	}
	return sprintf( '&#x%x;', $point );
}
805 * Returns true if a given Unicode codepoint is a valid character in XML.
806 * @param int $codepoint
/**
 * Tell whether a Unicode code point is allowed by the XML 1.0 "Char"
 * production: TAB, LF, CR, U+0020..U+D7FF, U+E000..U+FFFD and
 * U+10000..U+10FFFF.
 *
 * @param int $codepoint
 * @return bool true when the code point may appear in an XML document
 */
function validateCodepoint( $codepoint ) {
	// The only control characters XML permits.
	if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}
	// Basic Multilingual Plane up to (but excluding) the surrogate block.
	if( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
		return true;
	}
	// Rest of the BMP after the surrogates; U+FFFE and U+FFFF are excluded.
	if( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}
	// Supplementary planes.
	return ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
}
819 * Decode any character references, numeric or named entities,
820 * in the text and return a UTF-8 string.
822 * @param string $text
826 function decodeCharReferences( $text ) {
827 return preg_replace_callback(
829 array( 'Sanitizer', 'decodeCharReferencesCallback' ),
834 * @param array $matches
837 function decodeCharReferencesCallback( $matches ) {
838 if( $matches[1] != '' ) {
839 return Sanitizer
::decodeEntity( $matches[1] );
840 } elseif( $matches[2] != '' ) {
841 return Sanitizer
::decodeChar( intval( $matches[2] ) );
842 } elseif( $matches[3] != '' ) {
843 return Sanitizer
::decodeChar( hexdec( $matches[3] ) );
844 } elseif( $matches[4] != '' ) {
845 return Sanitizer
::decodeChar( hexdec( $matches[4] ) );
847 # Last case should be an ampersand by itself
852 * Return UTF-8 string for a codepoint if that is a valid
853 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
854 * @param int $codepoint
/**
 * Convert a code point to its UTF-8 encoding.
 *
 * @param int $codepoint
 * @return string the UTF-8 string from codepointToUtf8() when the code
 *   point passes Sanitizer::validateCodepoint(); otherwise the
 *   UTF8_REPLACEMENT constant
 */
function decodeChar( $codepoint ) {
	return Sanitizer::validateCodepoint( $codepoint )
		? codepointToUtf8( $codepoint )
		: UTF8_REPLACEMENT;
}
867 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
868 * return the UTF-8 encoding of that character. Otherwise, returns
869 * pseudo-entity source (eg &foo;)
871 * @param string $name
874 function decodeEntity( $name ) {
875 global $wgHtmlEntities;
876 if( isset( $wgHtmlEntities[$name] ) ) {
877 return codepointToUtf8( $wgHtmlEntities[$name] );
884 * Fetch the whitelist of acceptable attributes for a given
887 * @param string $element
890 function attributeWhitelist( $element ) {
892 if( !isset( $list ) ) {
893 $list = Sanitizer
::setupAttributeWhitelist();
895 return isset( $list[$element] )
903 function setupAttributeWhitelist() {
904 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
905 $block = array_merge( $common, array( 'align' ) );
906 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
907 $tablecell = array( 'abbr',
913 'nowrap', # deprecated
914 'width', # deprecated
915 'height', # deprecated
916 'bgcolor' # deprecated
919 # Numbers refer to sections in HTML 4.01 standard describing the element.
920 # See: http://www.w3.org/TR/html4/
924 'center' => $common, # deprecated
925 'span' => $block, # ??
954 'blockquote' => array_merge( $common, array( 'cite' ) ),
965 'br' => array( 'id', 'class', 'title', 'style', 'clear' ),
968 'pre' => array_merge( $common, array( 'width' ) ),
971 'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),
972 'del' => array_merge( $common, array( 'cite', 'datetime' ) ),
975 'ul' => array_merge( $common, array( 'type' ) ),
976 'ol' => array_merge( $common, array( 'type', 'start' ) ),
977 'li' => array_merge( $common, array( 'type', 'value' ) ),
985 'table' => array_merge( $common,
986 array( 'summary', 'width', 'border', 'frame',
987 'rules', 'cellspacing', 'cellpadding',
988 'align', 'bgcolor', 'frame', 'rules',
992 'caption' => array_merge( $common, array( 'align' ) ),
995 'thead' => array_merge( $common, $tablealign ),
996 'tfoot' => array_merge( $common, $tablealign ),
997 'tbody' => array_merge( $common, $tablealign ),
1000 'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1001 'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1004 'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1007 'td' => array_merge( $common, $tablecell, $tablealign ),
1008 'th' => array_merge( $common, $tablecell, $tablealign ),
1016 'strike' => $common,
1021 'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
1025 'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1027 # XHTML Ruby annotation text module, simple ruby only.
1028 # http://www.w3c.org/TR/ruby/
1033 'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
1040 * Take a fragment of (potentially invalid) HTML and return
1041 * a version with any tags removed, encoded suitably for literal
1042 * inclusion in an attribute value.
1044 * @param string $text HTML fragment
1047 function stripAllTags( $text ) {
1049 $text = preg_replace( '/ < .*? > /x', '', $text );
1051 # Normalize &entities and whitespace
1052 $text = Sanitizer
::normalizeAttributeValue( $text );
1054 # Will be placed into "double-quoted" attributes,
1055 # make sure remaining bits are safe.
1056 $text = str_replace(
1057 array('<', '>', '"'),
1058 array('<', '>', '"'),
1065 * Hack up a private DOCTYPE with HTML's standard entity declarations.
1066 * PHP 4 seemed to know these if you gave it an HTML doctype, but
1069 * Use for passing XHTML fragments to PHP's XML parsing functions
1074 function hackDocType() {
1075 global $wgHtmlEntities;
1076 $out = "<!DOCTYPE html [\n";
1077 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1078 $out .= "<!ENTITY $entity \"&#$codepoint;\">";