3 * XHTML sanitizer for MediaWiki
5 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
6 * http://www.mediawiki.org/
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
27 * Regular expression to match various types of character references in
28 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
30 define( 'MW_CHAR_REFS_REGEX',
38 * Regular expression to match HTML/XML attribute pairs within a tag.
39 * Allows some... latitude.
40 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
42 $attrib = '[A-Za-z0-9]';
43 $space = '[\x09\x0a\x0d\x20]';
44 define( 'MW_ATTRIBS_REGEX',
45 "/(?:^|$space)($attrib+)
48 # The attribute value: quoted or alone
51 | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
52 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
53 # colors are specified like this.
54 # We'll be normalizing it.
56 )?(?=$space|\$)/sx" );
59 * List of all named character entities defined in HTML 4.01
60 * http://www.w3.org/TR/html4/sgml/entities.html
63 global $wgHtmlEntities;
64 $wgHtmlEntities = array(
320 * Cleans up HTML, removes dangerous tags and attributes, and
321 * removes HTML comments
323 * @param string $text
324 * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
325 * @param array $args for the processing callback
328 static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
331 static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
332 $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
334 wfProfileIn( __METHOD__
);
336 if ( !$staticInitialised ) {
338 $htmlpairs = array( # Tags that must be closed
339 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
340 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
341 'strike', 'strong', 'tt', 'var', 'div', 'center',
342 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
343 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
346 'br', 'hr', 'li', 'dt', 'dd'
348 $htmlsingleonly = array( # Elements that cannot have close tags
351 $htmlnest = array( # Tags that can be nested--??
352 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
353 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
355 $tabletags = array( # Can only appear inside table, we will close them
358 $htmllist = array( # Tags used by list
361 $listtags = array( # Tags that can appear in a list
365 $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
366 $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
368 # Convert them all to hashtables for faster lookup
369 $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
370 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
371 foreach ( $vars as $var ) {
372 $
$var = array_flip( $
$var );
374 $staticInitialised = true;
377 # Remove HTML comments
378 $text = Sanitizer
::removeHTMLcomments( $text );
379 $bits = explode( '<', $text );
380 $text = str_replace( '>', '>', array_shift( $bits ) );
382 $tagstack = $tablestack = array();
383 foreach ( $bits as $x ) {
385 if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
386 list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
388 $slash = $t = $params = $brace = $rest = null;
392 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
396 if( isset( $htmlsingleonly[$t] ) ) {
398 } elseif ( ( $ot = @array_pop
( $tagstack ) ) != $t ) {
399 if ( isset( $htmlsingleallowed[$ot] ) ) {
400 # Pop all elements with an optional close tag
401 # and see if we find a match below them
403 array_push ($optstack, $ot);
404 while ( ( ( $ot = @array_pop
( $tagstack ) ) != $t ) &&
405 isset( $htmlsingleallowed[$ot] ) )
407 array_push ($optstack, $ot);
410 # No match. Push the optional elements back again
412 while ( $ot = @array_pop
( $optstack ) ) {
413 array_push( $tagstack, $ot );
417 @array_push
( $tagstack, $ot );
418 # <li> can be nested in <ul> or <ol>, skip those cases:
419 if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
424 if ( $t == 'table' ) {
425 $tagstack = array_pop( $tablestack );
430 # Keep track for later
431 if ( isset( $tabletags[$t] ) &&
432 ! in_array( 'table', $tagstack ) ) {
434 } else if ( in_array( $t, $tagstack ) &&
435 ! isset( $htmlnest [$t ] ) ) {
437 # Is it a self closed htmlpair ? (bug 5487)
438 } else if( $brace == '/>' &&
439 isset( $htmlpairs[$t] ) ) {
441 } elseif( isset( $htmlsingleonly[$t] ) ) {
442 # Hack to force empty tag for uncloseable elements
444 } else if( isset( $htmlsingle[$t] ) ) {
445 # Hack to not close $htmlsingle tags
447 } else if( isset( $tabletags[$t] )
448 && in_array($t ,$tagstack) ) {
449 // New table tag but forgot to close the previous one
452 if ( $t == 'table' ) {
453 array_push( $tablestack, $tagstack );
456 array_push( $tagstack, $t );
459 # Replace any variables or template parameters with
461 if( is_callable( $processCallback ) ) {
462 call_user_func_array( $processCallback, array( &$params, $args ) );
465 # Strip non-approved attributes from the tag
466 $newparams = Sanitizer
::fixTagAttributes( $params, $t );
469 $rest = str_replace( '>', '>', $rest );
470 $close = ( $brace == '/>' && !$slash ) ?
' /' : '';
471 $text .= "<$slash$t$newparams$close>$rest";
475 $text .= '<' . str_replace( '>', '>', $x);
477 # Close off any remaining tags
478 while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
480 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
483 # this might be possible using tidy itself
484 foreach ( $bits as $x ) {
485 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
487 @list
( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
488 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
489 if( is_callable( $processCallback ) ) {
490 call_user_func_array( $processCallback, array( &$params, $args ) );
492 $newparams = Sanitizer
::fixTagAttributes( $params, $t );
493 $rest = str_replace( '>', '>', $rest );
494 $text .= "<$slash$t$newparams$brace$rest";
496 $text .= '<' . str_replace( '>', '>', $x);
500 wfProfileOut( __METHOD__
);
505 * Remove '<!--', '-->', and everything between.
506 * To avoid leaving blank lines, when a comment is both preceded
507 * and followed by a newline (ignoring spaces), trim leading and
508 * trailing spaces and one of the newlines.
511 * @param string $text
/**
 * Strip every HTML comment ('<!--' through '-->') from the text.
 *
 * When a comment is both preceded and followed by a newline (ignoring
 * spaces), the surrounding spaces and one of the newlines are removed
 * as well so that no blank line is left behind. An unterminated
 * comment stops processing; whatever follows it is left untouched.
 *
 * @param string $text
 * @return string Text with comments removed
 */
static function removeHTMLcomments( $text ) {
	wfProfileIn( __METHOD__ );
	while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
		$end = strpos( $text, '-->', $start + 4 );
		if ( $end === false ) {
			# Unterminated comment; bail out
			break;
		}

		# Move past the closing '-->' so $end - $start covers the whole comment
		$end += 3;

		# Trim space and newline if the comment is both
		# preceded and followed by a newline
		$spaceStart = max( $start - 1, 0 );
		$spaceLen = $end - $spaceStart;
		while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
			# Widen the span leftwards over leading spaces
			$spaceStart--;
			$spaceLen++;
		}
		while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
			# Widen the span rightwards over trailing spaces
			$spaceLen++;
		}

		if ( substr( $text, $spaceStart, 1 ) === "\n"
			and substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" )
		{
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			$text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
		} else {
			# Remove just the comment.
			$text = substr_replace( $text, '', $start, $end - $start );
		}
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
550 * Take an array of attribute names and values and normalize or discard
551 * illegal values for the given element type.
553 * - Discards attributes not on a whitelist for the given element
554 * - Unsafe style attributes are discarded
556 * @param array $attribs
557 * @param string $element
560 * @todo Check for legal values where the DTD limits things.
561 * @todo Check for unique id attribute :P
/**
 * Filter an attribute name => value map for the given element.
 *
 * - Attributes missing from the element's whitelist are dropped.
 * - 'style' values are passed through Sanitizer::checkCss(); the
 *   attribute is dropped when that returns false.
 * - 'id' values are passed through Sanitizer::escapeId().
 *
 * @param array $attribs Attribute name => value
 * @param string $element Element name used to look up the whitelist
 * @return array Surviving attributes, one entry per name
 */
static function validateTagAttributes( $attribs, $element ) {
	$whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );

	// Always hand back an array, even when nothing survives.
	$out = array();
	foreach( $attribs as $attribute => $value ) {
		if( !isset( $whitelist[$attribute] ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if( $attribute == 'style' ) {
			$value = Sanitizer::checkCss( $value );
			if( $value === false ) {
				# Unsafe style: discard the attribute entirely
				continue;
			}
		}

		if ( $attribute === 'id' ) {
			$value = Sanitizer::escapeId( $value );
		}

		// If this attribute was previously set, override it.
		// Output should only have one attribute of each name.
		$out[$attribute] = $value;
	}
	return $out;
}
591 * Pick apart some CSS and check it for forbidden or unsafe structures.
592 * Returns a sanitized string, or false if it was just too evil.
594 * Currently URL references, 'expression', 'tps' are forbidden.
596 * @param string $value
/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 *
 * Character references are decoded and CSS comments are replaced by a
 * space. For the safety check only, CSS hex escapes are additionally
 * decoded and backslashes removed, so that escaped spellings of the
 * forbidden tokens are caught too.
 *
 * @param string $value Raw style attribute value
 * @return mixed Sanitized string, or false if a forbidden construct
 *   ('expression', 'tps://'-style URLs, 'url(') was found
 */
static function checkCss( $value ) {
	$stripped = Sanitizer::decodeCharReferences( $value );

	// Remove any comments; IE gets token splitting wrong
	$stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );

	// NOTE(review): this assignment and the final return were not visible
	// in the reviewed extract; confirm against the upstream file that the
	// comment-stripped text is what should be returned.
	$value = $stripped;

	// ... and continue checks
	// Decode CSS hex escapes via a callback; the /e (eval) modifier used
	// previously is no longer supported by PHP.
	$stripped = preg_replace_callback(
		'!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
		array( 'Sanitizer', 'cssDecodeCallback' ),
		$stripped );
	$stripped = str_replace( '\\', '', $stripped );
	if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
			$stripped ) ) {
		# Forbidden construct found
		return false;
	}

	return $value;
}

/**
 * Regex replace callback for checkCss(): turn one CSS hex escape
 * into the UTF-8 encoding of its code point.
 *
 * @param array $matches Index 1 holds the hex digits
 * @return string
 */
private static function cssDecodeCallback( $matches ) {
	return codepointToUtf8( hexdec( $matches[1] ) );
}
621 * Take a tag soup fragment listing an HTML element's attributes
622 * and normalize it to well-formed XML, discarding unwanted attributes.
623 * Output is safe for further wikitext processing, with escaping of
624 * values that could trigger problems.
626 * - Normalizes attribute names to lowercase
627 * - Discards attributes not on a whitelist for the given element
628 * - Turns broken or invalid entities into plaintext
629 * - Double-quotes all attribute values
630 * - Attributes without values are given the name as attribute
631 * - Double attributes are discarded
632 * - Unsafe style attributes are discarded
633 * - Prepends space if there are attributes.
635 * @param string $text
636 * @param string $element
/**
 * Turn a tag-soup attribute fragment into a well-formed attribute
 * string for the given element.
 *
 * The fragment is decoded with decodeTagAttributes(), filtered with
 * validateTagAttributes(), and every surviving pair is re-emitted as
 * name="value" with the value armoured by safeEncodeAttribute().
 *
 * @param string $text Raw attribute text from inside the tag
 * @param string $element Element name
 * @return string Empty string, or the attributes prefixed with a space
 */
static function fixTagAttributes( $text, $element ) {
	if( trim( $text ) == '' ) {
		// Nothing but whitespace: no attributes to emit
		return '';
	}

	$stripped = Sanitizer::validateTagAttributes(
		Sanitizer::decodeTagAttributes( $text ), $element );

	// Start from an empty list so count() below is always well-defined.
	$attribs = array();
	foreach( $stripped as $attribute => $value ) {
		$encAttribute = htmlspecialchars( $attribute );
		$encValue = Sanitizer::safeEncodeAttribute( $value );

		$attribs[] = "$encAttribute=\"$encValue\"";
	}
	return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
}
658 * Encode an attribute value for HTML output.
660 * @return HTML-encoded text fragment
662 static function encodeAttribute( $text ) {
663 $encValue = htmlspecialchars( $text );
665 // Whitespace is normalized during attribute decoding,
666 // so if we've been passed non-spaces we must encode them
667 // ahead of time or they won't be preserved.
668 $encValue = strtr( $encValue, array(
678 * Encode an attribute value for HTML tags, with extra armoring
679 * against further wiki processing.
681 * @return HTML-encoded text fragment
683 static function safeEncodeAttribute( $text ) {
684 $encValue = Sanitizer
::encodeAttribute( $text );
686 # Templates and links may be expanded in later parsing,
687 # creating invalid or dangerous output. Suppress this.
688 $encValue = strtr( $encValue, array(
689 '<' => '<', // This should never happen,
690 '>' => '>', // we've received invalid input
691 '"' => '"', // which should have been escaped.
694 "''" => '''',
695 'ISBN' => 'ISBN',
697 'PMID' => 'PMID',
703 $encValue = preg_replace_callback(
704 '/(' . wfUrlProtocols() . ')/',
705 array( 'Sanitizer', 'armorLinksCallback' ),
711 * Given a value escape it so that it can be used in an id attribute and
712 * return it, this does not validate the value however (see first link)
714 * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
717 * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
724 static function escapeId( $id ) {
725 static $replace = array(
730 $id = urlencode( Sanitizer
::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
732 return str_replace( array_keys( $replace ), array_values( $replace ), $id );
736 * Given a value, escape it so that it can be used as a CSS class and
739 * @todo For extra validity, input should be validated UTF-8.
741 * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
743 * @param string $class
746 static function escapeClass( $class ) {
747 // Convert ugly stuff to underscores and kill underscores in ugly places
748 return rtrim(preg_replace(
749 array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
755 * Regex replace callback for armoring links against further processing.
756 * @param array $matches
/**
 * Regex replace callback for armoring links against further processing.
 *
 * Rewrites each colon in the captured text as the numeric character
 * reference '&#58;'.
 *
 * @param array $matches Index 1 holds the captured text
 * @return string
 */
private static function armorLinksCallback( $matches ) {
	// Replacing ':' with a literal ':' would be a no-op; the colon must
	// be emitted as a character reference for the armoring to work.
	return str_replace( ':', '&#58;', $matches[1] );
}
765 * Return an associative array of attribute names and values from
766 * a partial tag string. Attribute names are forced to lowercase,
767 * character references are decoded to UTF-8 text.
772 static function decodeTagAttributes( $text ) {
775 if( trim( $text ) == '' ) {
788 foreach( $pairs as $set ) {
789 $attribute = strtolower( $set[1] );
790 $value = Sanitizer
::getTagAttributeCallback( $set );
792 // Normalize whitespace
793 $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
794 $value = trim( $value );
796 // Decode character references
797 $attribs[$attribute] = Sanitizer
::decodeCharReferences( $value );
803 * Pick the appropriate attribute value from a match set from the
804 * MW_ATTRIBS_REGEX matches.
810 private static function getTagAttributeCallback( $set ) {
811 if( isset( $set[6] ) ) {
812 # Illegal #XXXXXX color with no quotes.
814 } elseif( isset( $set[5] ) ) {
817 } elseif( isset( $set[4] ) ) {
820 } elseif( isset( $set[3] ) ) {
823 } elseif( !isset( $set[2] ) ) {
824 # In XHTML, attributes must have a value.
825 # For 'reduced' form, return explicitly the attribute name here.
828 throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
833 * Normalize whitespace and character references in an XML source-
834 * encoded text for an attribute value.
836 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
837 * but note that we're not returning the value, but are returning
838 * XML source fragments that will be slapped into output.
840 * @param string $text
844 private static function normalizeAttributeValue( $text ) {
845 return str_replace( '"', '"',
847 '/\r\n|[\x20\x0d\x0a\x09]/',
849 Sanitizer
::normalizeCharReferences( $text ) ) );
853 * Ensure that any entities and character references are legal
854 * for XML and XHTML specifically. Any stray bits will be
855 * &-escaped to result in a valid text fragment.
857 * a. any named char refs must be known in XHTML
858 * b. any numeric char refs must be legal chars, not invalid or forbidden
859 * c. use &#x, not &#X
860 * d. fix or reject non-valid attributes
862 * @param string $text
/**
 * Run every character-reference-like sequence matched by
 * MW_CHAR_REFS_REGEX through normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
static function normalizeCharReferences( $text ) {
	return preg_replace_callback(
		MW_CHAR_REFS_REGEX,
		array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
		$text );
}
873 * @param array $matches
/**
 * Regex replace callback for normalizeCharReferences().
 *
 * Dispatches on whichever capture group is non-empty: group 1 goes to
 * normalizeEntity(), group 2 to decCharReference(), groups 3 and 4 to
 * hexCharReference(). When no handler produces a value the matched
 * text is returned HTML-escaped.
 *
 * @param array $matches
 * @return string
 */
static function normalizeCharReferencesCallback( $matches ) {
	// Stays null when no group matched or the handler rejected the value.
	$ret = null;
	if( $matches[1] != '' ) {
		$ret = Sanitizer::normalizeEntity( $matches[1] );
	} elseif( $matches[2] != '' ) {
		$ret = Sanitizer::decCharReference( $matches[2] );
	} elseif( $matches[3] != '' ) {
		$ret = Sanitizer::hexCharReference( $matches[3] );
	} elseif( $matches[4] != '' ) {
		$ret = Sanitizer::hexCharReference( $matches[4] );
	}

	if( is_null( $ret ) ) {
		return htmlspecialchars( $matches[0] );
	}
	return $ret;
}
895 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
896 * return the named entity reference as is. Otherwise, returns
897 * HTML-escaped text of pseudo-entity source (eg &foo;)
899 * @param string $name
/**
 * Normalize a named entity.
 *
 * Names present in $wgHtmlEntities are returned as the entity
 * reference unchanged; any other name is returned with its ampersand
 * escaped, so it appears as literal text (eg &amp;foo;).
 *
 * @param string $name Entity name without '&' and ';'
 * @return string
 */
static function normalizeEntity( $name ) {
	global $wgHtmlEntities;
	if( isset( $wgHtmlEntities[$name] ) ) {
		return "&$name;";
	}
	// Unknown pseudo-entity: escape the ampersand.
	return "&amp;$name;";
}
/**
 * Normalize a decimal numeric character reference.
 *
 * @param string $codepoint Decimal digits of the reference
 * @return mixed '&#N;' when validateCodepoint() accepts the code
 *   point, null otherwise
 */
static function decCharReference( $codepoint ) {
	$point = intval( $codepoint );
	if( !Sanitizer::validateCodepoint( $point ) ) {
		return null;
	}
	return sprintf( '&#%d;', $point );
}
/**
 * Normalize a hexadecimal numeric character reference.
 *
 * @param string $codepoint Hex digits of the reference
 * @return mixed '&#xN;' (lowercase hex) when validateCodepoint()
 *   accepts the code point, null otherwise
 */
static function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );
	if( !Sanitizer::validateCodepoint( $point ) ) {
		return null;
	}
	return sprintf( '&#x%x;', $point );
}
931 * Returns true if a given Unicode codepoint is a valid character in XML.
932 * @param int $codepoint
/**
 * Tell whether a Unicode code point is a valid character in XML.
 *
 * Accepted: tab, line feed, carriage return, 0x20-0xd7ff,
 * 0xe000-0xfffd and 0x10000-0x10ffff.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	// The three permitted control characters
	if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}
	// 0x20 up to just below 0xd800
	if( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
		return true;
	}
	// 0xe000 up to 0xfffd (0xfffe and 0xffff are rejected)
	if( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}
	// Everything from 0x10000 through 0x10ffff
	return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}
945 * Decode any character references, numeric or named entities,
946 * in the text and return a UTF-8 string.
948 * @param string $text
/**
 * Decode any character references, numeric or named entities,
 * in the text and return a UTF-8 string.
 *
 * Every sequence matched by MW_CHAR_REFS_REGEX is handed to
 * decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	return preg_replace_callback(
		MW_CHAR_REFS_REGEX,
		array( 'Sanitizer', 'decodeCharReferencesCallback' ),
		$text );
}
961 * @param array $matches
/**
 * Regex replace callback for decodeCharReferences().
 *
 * Group 1 is decoded with decodeEntity(), group 2 as a decimal code
 * point, groups 3 and 4 as hexadecimal code points. If none of the
 * groups matched, the matched text is returned unchanged.
 *
 * @param array $matches
 * @return string
 */
static function decodeCharReferencesCallback( $matches ) {
	if( $matches[1] != '' ) {
		return Sanitizer::decodeEntity( $matches[1] );
	}
	if( $matches[2] != '' ) {
		return Sanitizer::decodeChar( intval( $matches[2] ) );
	}
	if( $matches[3] != '' ) {
		return Sanitizer::decodeChar( hexdec( $matches[3] ) );
	}
	if( $matches[4] != '' ) {
		return Sanitizer::decodeChar( hexdec( $matches[4] ) );
	}
	# Last case should be an ampersand by itself; keep it as is
	# (returning nothing would make preg_replace_callback delete it).
	return $matches[0];
}
979 * Return UTF-8 string for a codepoint if that is a valid
980 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
981 * @param int $codepoint
/**
 * Convert a code point to its UTF-8 string.
 *
 * @param int $codepoint
 * @return string Result of codepointToUtf8() when validateCodepoint()
 *   accepts the code point, UTF8_REPLACEMENT otherwise
 */
static function decodeChar( $codepoint ) {
	return Sanitizer::validateCodepoint( $codepoint )
		? codepointToUtf8( $codepoint )
		: UTF8_REPLACEMENT;
}
994 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
995 * return the UTF-8 encoding of that character. Otherwise, returns
996 * pseudo-entity source (eg &foo;)
998 * @param string $name
/**
 * Decode a named entity.
 *
 * Names present in $wgHtmlEntities are converted with
 * codepointToUtf8(); any other name is returned as its pseudo-entity
 * source (eg &foo;).
 *
 * @param string $name Entity name without '&' and ';'
 * @return string
 */
static function decodeEntity( $name ) {
	global $wgHtmlEntities;
	if( isset( $wgHtmlEntities[$name] ) ) {
		return codepointToUtf8( $wgHtmlEntities[$name] );
	}
	// Unknown name: hand back the source text untouched.
	return "&$name;";
}
1011 * Fetch the whitelist of acceptable attributes for a given
1014 * @param string $element
/**
 * Fetch the whitelist of acceptable attributes for a given element.
 *
 * The full table is built once by setupAttributeWhitelist() and kept
 * in a function-static variable for later calls.
 *
 * @param string $element
 * @return array Attribute names; empty when the element has no entry
 */
static function attributeWhitelist( $element ) {
	static $list;
	if( !isset( $list ) ) {
		$list = Sanitizer::setupAttributeWhitelist();
	}
	return isset( $list[$element] )
		? $list[$element]
		: array();
}
1028 * @todo Document it a bit
1031 static function setupAttributeWhitelist() {
1032 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
1033 $block = array_merge( $common, array( 'align' ) );
1034 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
1035 $tablecell = array( 'abbr',
1041 'nowrap', # deprecated
1042 'width', # deprecated
1043 'height', # deprecated
1044 'bgcolor' # deprecated
1047 # Numbers refer to sections in HTML 4.01 standard describing the element.
1048 # See: http://www.w3.org/TR/html4/
1049 $whitelist = array (
1052 'center' => $common, # deprecated
1053 'span' => $block, # ??
1071 'strong' => $common,
1082 'blockquote' => array_merge( $common, array( 'cite' ) ),
1093 'br' => array( 'id', 'class', 'title', 'style', 'clear' ),
1096 'pre' => array_merge( $common, array( 'width' ) ),
1099 'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),
1100 'del' => array_merge( $common, array( 'cite', 'datetime' ) ),
1103 'ul' => array_merge( $common, array( 'type' ) ),
1104 'ol' => array_merge( $common, array( 'type', 'start' ) ),
1105 'li' => array_merge( $common, array( 'type', 'value' ) ),
1113 'table' => array_merge( $common,
1114 array( 'summary', 'width', 'border', 'frame',
1115 'rules', 'cellspacing', 'cellpadding',
1120 'caption' => array_merge( $common, array( 'align' ) ),
1123 'thead' => array_merge( $common, $tablealign ),
1124 'tfoot' => array_merge( $common, $tablealign ),
1125 'tbody' => array_merge( $common, $tablealign ),
1128 'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1129 'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1132 'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1135 'td' => array_merge( $common, $tablecell, $tablealign ),
1136 'th' => array_merge( $common, $tablecell, $tablealign ),
1144 'strike' => $common,
1149 'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
1153 'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1155 # XHTML Ruby annotation text module, simple ruby only.
1156 # http://www.w3c.org/TR/ruby/
1161 'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
1168 * Take a fragment of (potentially invalid) HTML and return
1169 * a version with any tags removed, encoded suitably for literal
1170 * inclusion in an attribute value.
1172 * @param string $text HTML fragment
1175 static function stripAllTags( $text ) {
1177 $text = StringUtils
::delimiterReplace( '<', '>', '', $text );
1179 # Normalize &entities and whitespace
1180 $text = Sanitizer
::normalizeAttributeValue( $text );
1182 # Will be placed into "double-quoted" attributes,
1183 # make sure remaining bits are safe.
1184 $text = str_replace(
1185 array('<', '>', '"'),
1186 array('<', '>', '"'),
1193 * Hack up a private DOCTYPE with HTML's standard entity declarations.
1194 * PHP 4 seemed to know these if you gave it an HTML doctype, but
1197 * Use for passing XHTML fragments to PHP's XML parsing functions
/**
 * Build a DOCTYPE declaration with an internal subset that declares
 * every entry of $wgHtmlEntities as a numeric character reference.
 *
 * @return string
 */
static function hackDocType() {
	global $wgHtmlEntities;
	$out = "<!DOCTYPE html [\n";
	foreach( $wgHtmlEntities as $entity => $codepoint ) {
		$out .= "<!ENTITY $entity \"&#$codepoint;\">";
	}
	// Close the internal subset so the declaration is complete.
	$out .= "]>\n";
	return $out;
}
1212 static function cleanUrl( $url, $hostname=true ) {
1213 # Normalize any HTML entities in input. They will be
1214 # re-escaped by makeExternalLink().
1215 $url = Sanitizer
::decodeCharReferences( $url );
1217 # Escape any control characters introduced by the above step
1218 $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
1220 # Validate hostname portion
1222 if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
1223 list( /* $whole */, $protocol, $host, $rest ) = $matches;
1225 // Characters that will be ignored in IDNs.
1226 // http://tools.ietf.org/html/3454#section-3.1
1227 // Strip them before further processing so blacklists and such work.
1229 \\s| # general whitespace
1230 \xc2\xad| # 00ad SOFT HYPHEN
1231 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
1232 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
1233 \xe2\x81\xa0| # 2060 WORD JOINER
1234 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
1235 \xcd\x8f| # 034f COMBINING GRAPHEME JOINER
1236 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
1237 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
1238 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
1239 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
1240 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
1241 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
1244 $host = preg_replace( $strip, '', $host );
1246 // @fixme: validate hostnames here
1248 return $protocol . $host . $rest;