<?php
-
/**
- * (X)HTML sanitizer for MediaWiki
+ * XHTML sanitizer for MediaWiki
*
* Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
* http://www.mediawiki.org/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
+ * the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @package MediaWiki
* Allows some... latitude.
* Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
*/
-$attrib = '[A-Za-z0-9]';
+$attrib = '[A-Za-z0-9]';
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
"/(?:^|$space)($attrib+)
/**
* List of all named character entities defined in HTML 4.01
* http://www.w3.org/TR/html4/sgml/entities.html
- * @access private
+ * @private
*/
global $wgHtmlEntities;
$wgHtmlEntities = array(
'zwj' => 8205,
'zwnj' => 8204 );
+/** @package MediaWiki */
class Sanitizer {
/**
* Cleans up HTML, removes dangerous tags and attributes, and
* removes HTML comments
- * @access private
+ * @private
* @param string $text
+ * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
+ * @param array $args for the processing callback
* @return string
*/
- function removeHTMLtags( $text ) {
+ static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
global $wgUseTidy, $wgUserHtml;
- $fname = 'Parser::removeHTMLtags';
- wfProfileIn( $fname );
-
- if( $wgUserHtml ) {
- $htmlpairs = array( # Tags that must be closed
- 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
- 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
- 'strike', 'strong', 'tt', 'var', 'div', 'center',
- 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
- 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
- );
- $htmlsingle = array(
- 'br', 'hr', 'li', 'dt', 'dd'
- );
- $htmlnest = array( # Tags that can be nested--??
- 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
- 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
- );
- $tabletags = array( # Can only appear inside table
- 'td', 'th', 'tr'
- );
- } else {
- $htmlpairs = array();
- $htmlsingle = array();
- $htmlnest = array();
- $tabletags = array();
- }
- $htmlsingle = array_merge( $tabletags, $htmlsingle );
- $htmlelements = array_merge( $htmlsingle, $htmlpairs );
+ static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
+ $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
+
+ wfProfileIn( __METHOD__ );
+
+ if ( !$staticInitialised ) {
+ if( $wgUserHtml ) {
+ $htmlpairs = array( # Tags that must be closed
+ 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
+ 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
+ 'strike', 'strong', 'tt', 'var', 'div', 'center',
+ 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
+ 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
+ );
+ $htmlsingle = array(
+ 'br', 'hr', 'li', 'dt', 'dd'
+ );
+ $htmlsingleonly = array( # Elements that cannot have close tags
+ 'br', 'hr'
+ );
+ $htmlnest = array( # Tags that can be nested--??
+ 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
+ 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
+ );
+ $tabletags = array( # Can only appear inside table
+ 'td', 'th', 'tr',
+ );
+ $htmllist = array( # Tags used by list
+ 'ul','ol',
+ );
+ $listtags = array( # Tags that can appear in a list
+ 'li',
+ );
+
+ } else {
+ $htmlpairs = array();
+ $htmlsingle = array();
+ $htmlnest = array();
+ $tabletags = array();
+ }
+
+ $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
+ $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
+
+ # Convert them all to hashtables for faster lookup
+ $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
+ 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
+ foreach ( $vars as $var ) {
+ $$var = array_flip( $$var );
+ }
+ $staticInitialised = true;
+ }
# Remove HTML comments
$text = Sanitizer::removeHTMLcomments( $text );
-
$bits = explode( '<', $text );
$text = array_shift( $bits );
if(!$wgUseTidy) {
- $tagstack = array(); $tablestack = array();
+ $tagstack = $tablestack = array();
foreach ( $bits as $x ) {
$prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
- preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
- $x, $regs );
+ preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs );
list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
error_reporting( $prev );
$badtag = 0 ;
- if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
+ if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
# Check our stack
if ( $slash ) {
# Closing a tag...
- if ( ! in_array( $t, $htmlsingle ) &&
- ( $ot = @array_pop( $tagstack ) ) != $t ) {
- @array_push( $tagstack, $ot );
+ if( isset( $htmlsingleonly[$t] ) ) {
$badtag = 1;
+ } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
+ if ( isset( $htmlsingleallowed[$ot] ) ) {
+ # Pop all elements with an optional close tag
+ # and see if we find a match below them
+ $optstack = array();
+ array_push ($optstack, $ot);
+ while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
+ isset( $htmlsingleallowed[$ot] ) )
+ {
+ array_push ($optstack, $ot);
+ }
+ if ( $t != $ot ) {
+ # No match. Push the optional elements back again
+ $badtag = 1;
+ while ( $ot = @array_pop( $optstack ) ) {
+ array_push( $tagstack, $ot );
+ }
+ }
+ } else {
+ @array_push( $tagstack, $ot );
+ # <li> can be nested in <ul> or <ol>, skip those cases:
+ if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
+ $badtag = 1;
+ }
+ }
} else {
if ( $t == 'table' ) {
$tagstack = array_pop( $tablestack );
}
- $newparams = '';
}
+ $newparams = '';
} else {
# Keep track for later
- if ( in_array( $t, $tabletags ) &&
+ if ( isset( $tabletags[$t] ) &&
! in_array( 'table', $tagstack ) ) {
$badtag = 1;
} else if ( in_array( $t, $tagstack ) &&
- ! in_array ( $t , $htmlnest ) ) {
+ ! isset( $htmlnest [$t ] ) ) {
$badtag = 1 ;
- } else if ( ! in_array( $t, $htmlsingle ) ) {
+ # Is it a self closed htmlpair ? (bug 5487)
+ } else if( $brace == '/>' &&
+ isset( $htmlpairs[$t] ) ) {
+ $badtag = 1;
+ } elseif( isset( $htmlsingleonly[$t] ) ) {
+ # Hack to force empty tag for uncloseable elements
+ $brace = '/>';
+ } else if( isset( $htmlsingle[$t] ) ) {
+ # Hack to not close $htmlsingle tags
+ $brace = NULL;
+ } else {
if ( $t == 'table' ) {
array_push( $tablestack, $tagstack );
$tagstack = array();
}
array_push( $tagstack, $t );
}
+
+ # Replace any variables or template parameters with
+ # plaintext results.
+ if( is_callable( $processCallback ) ) {
+ call_user_func_array( $processCallback, array( &$params, $args ) );
+ }
+
# Strip non-approved attributes from the tag
$newparams = Sanitizer::fixTagAttributes( $params, $t );
}
if ( ! $badtag ) {
$rest = str_replace( '>', '>', $rest );
- $text .= "<$slash$t$newparams$brace$rest";
+ $close = ( $brace == '/>' ) ? ' /' : '';
+ $text .= "<$slash$t$newparams$close>$rest";
continue;
}
}
} else {
# this might be possible using tidy itself
foreach ( $bits as $x ) {
- preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
+ preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
$x, $regs );
@list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
- if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
+ if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
+ if( is_callable( $processCallback ) ) {
+ call_user_func_array( $processCallback, array( &$params, $args ) );
+ }
$newparams = Sanitizer::fixTagAttributes( $params, $t );
$rest = str_replace( '>', '>', $rest );
$text .= "<$slash$t$newparams$brace$rest";
}
}
}
- wfProfileOut( $fname );
+ wfProfileOut( __METHOD__ );
return $text;
}
* To avoid leaving blank lines, when a comment is both preceded
* and followed by a newline (ignoring spaces), trim leading and
* trailing spaces and one of the newlines.
- *
- * @access private
+ *
+ * @private
* @param string $text
* @return string
*/
- function removeHTMLcomments( $text ) {
- $fname='Parser::removeHTMLcomments';
- wfProfileIn( $fname );
+ static function removeHTMLcomments( $text ) {
+ wfProfileIn( __METHOD__ );
while (($start = strpos($text, '<!--')) !== false) {
$end = strpos($text, '-->', $start + 4);
if ($end === false) {
$text = substr_replace($text, '', $start, $end - $start);
}
}
- wfProfileOut( $fname );
+ wfProfileOut( __METHOD__ );
return $text;
}
+	/**
+	 * Filter a decoded attribute map down to what is permitted on an element.
+	 *
+	 * - Attributes missing from the element's whitelist are dropped
+	 * - A 'style' value is run through Sanitizer::checkCss(); the attribute
+	 *   is dropped when that returns false
+	 * - An 'id' value is run through Sanitizer::escapeId()
+	 *
+	 * @param array $attribs attribute name => value
+	 * @param string $element element name used to look up the whitelist
+	 * @return array the surviving attribute name => value pairs
+	 *
+	 * @todo Check for legal values where the DTD limits things.
+	 * @todo Check for unique id attribute :P
+	 */
+	static function validateTagAttributes( $attribs, $element ) {
+		$allowed = array_flip( Sanitizer::attributeWhitelist( $element ) );
+		$clean = array();
+		foreach( $attribs as $name => $val ) {
+			if( isset( $allowed[$name] ) ) {
+				$keep = true;
+
+				# Strip javascript "expression" from stylesheets.
+				# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
+				if( $name == 'style' ) {
+					$val = Sanitizer::checkCss( $val );
+					# checkCss() answers false for CSS it refuses outright
+					$keep = ( $val !== false );
+				}
+
+				if( $keep ) {
+					if( $name === 'id' ) {
+						$val = Sanitizer::escapeId( $val );
+					}
+					// Assigning by name means a repeated attribute overrides
+					// the earlier one, so each name appears once in the result.
+					$clean[$name] = $val;
+				}
+			}
+		}
+		return $clean;
+	}
+
+	/**
+	 * Pick apart some CSS and check it for forbidden or unsafe structures.
+	 * Returns a sanitized string, or false if it was just too evil.
+	 *
+	 * The value is rejected when, after decoding character references,
+	 * removing comments, resolving CSS hex escapes and dropping backslashes,
+	 * it contains 'expression', a 'url(' reference, or 'tp://' / 'tps://'
+	 * (which covers http://, https:// and ftp:// style URLs).
+	 *
+	 * @param string $value raw style attribute value
+	 * @return mixed the value with character references decoded and comments
+	 *               replaced by a space, or false when it is rejected
+	 */
+	static function checkCss( $value ) {
+		$stripped = Sanitizer::decodeCharReferences( $value );
+
+		// Remove any comments; IE gets token splitting wrong
+		$stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );
+		$value = $stripped;
+
+		// ... and continue checks on a copy with CSS hex escapes resolved.
+		// A callback is used rather than the /e (eval) modifier, which is
+		// deprecated since PHP 5.5 and removed in PHP 7.
+		$stripped = preg_replace_callback(
+			'!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
+			array( 'Sanitizer', 'cssDecodeCallback' ),
+			$stripped );
+		$stripped = str_replace( '\\', '', $stripped );
+		if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
+				$stripped ) ) {
+			# haxx0r
+			return false;
+		}
+
+		return $value;
+	}
+
+	/**
+	 * Regex replace callback turning one CSS hex escape into UTF-8.
+	 * @param array $matches $matches[1] holds the hex digits of the escape
+	 * @return string
+	 * @private
+	 */
+	private static function cssDecodeCallback( $matches ) {
+		return codepointToUtf8( hexdec( $matches[1] ) );
+	}
+
/**
* Take a tag soup fragment listing an HTML element's attributes
* and normalize it to well-formed XML, discarding unwanted attributes.
+ * Output is safe for further wikitext processing, with escaping of
+ * values that could trigger problems.
*
* - Normalizes attribute names to lowercase
* - Discards attributes not on a whitelist for the given element
* @param string $text
* @param string $element
* @return string
- *
- * @todo Check for legal values where the DTD limits things.
- * @todo Check for unique id attribute :P
*/
- function fixTagAttributes( $text, $element ) {
+ static function fixTagAttributes( $text, $element ) {
if( trim( $text ) == '' ) {
return '';
}
- # Unquoted attribute
- # Since we quote this later, this can be anything distinguishable
- # from the end of the attribute
- if( !preg_match_all(
- MW_ATTRIBS_REGEX,
- $text,
- $pairs,
- PREG_SET_ORDER ) ) {
- return '';
- }
-
- $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
+ $stripped = Sanitizer::validateTagAttributes(
+ Sanitizer::decodeTagAttributes( $text ), $element );
+
$attribs = array();
- foreach( $pairs as $set ) {
- $attribute = strtolower( $set[1] );
- if( !isset( $whitelist[$attribute] ) ) {
- continue;
- }
-
- $raw = Sanitizer::getTagAttributeCallback( $set );
- $value = Sanitizer::normalizeAttributeValue( $raw );
-
- # Strip javascript "expression" from stylesheets.
- # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
- if( $attribute == 'style' && preg_match(
- '/(expression|tps*:\/\/|url\\s*\().*/is',
- Sanitizer::decodeCharReferences( $value ) ) ) {
- # haxx0r
- continue;
- }
+ foreach( $stripped as $attribute => $value ) {
+ $encAttribute = htmlspecialchars( $attribute );
+ $encValue = Sanitizer::safeEncodeAttribute( $value );
- if( !isset( $attribs[$attribute] ) ) {
- $attribs[$attribute] = "$attribute=\"$value\"";
- }
- }
- if( empty( $attribs ) ) {
- return '';
- } else {
- return ' ' . implode( ' ', $attribs );
+ $attribs[] = "$encAttribute=\"$encValue\"";
}
+ return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
+ }
+
+	/**
+	 * Encode an attribute value for HTML output.
+	 *
+	 * The text is passed through htmlspecialchars(), then newline, carriage
+	 * return and tab are written as numeric character references.
+	 *
+	 * @param string $text
+	 * @return string HTML-encoded text fragment
+	 */
+	static function encodeAttribute( $text ) {
+		$encValue = htmlspecialchars( $text );
+
+		// Whitespace is normalized during attribute decoding,
+		// so if we've been passed non-spaces we must encode them
+		// ahead of time or they won't be preserved.
+		// Mapping them to a literal space (or to themselves) would lose
+		// exactly the characters this step exists to protect.
+		$encValue = strtr( $encValue, array(
+			"\n" => '&#10;',
+			"\r" => '&#13;',
+			"\t" => '&#9;',
+		) );
+
+		return $encValue;
 	}
+	/**
+	 * Encode an attribute value for HTML tags, with extra armoring
+	 * against further wiki processing.
+	 *
+	 * Starts from Sanitizer::encodeAttribute(), then rewrites characters and
+	 * keywords that later wikitext passes react to as character references,
+	 * and finally armors the colon of anything matching wfUrlProtocols().
+	 *
+	 * @param string $text
+	 * @return string HTML-encoded text fragment
+	 */
+	static function safeEncodeAttribute( $text ) {
+		$encValue = Sanitizer::encodeAttribute( $text );
+
+		# Templates and links may be expanded in later parsing,
+		# creating invalid or dangerous output. Suppress this.
+		# Each entry must map to an entity form: an identity mapping
+		# would leave the trigger text in place and armor nothing.
+		$encValue = strtr( $encValue, array(
+			'<'    => '&lt;',   // This should never happen,
+			'>'    => '&gt;',   // we've received invalid input
+			'"'    => '&quot;', // which should have been escaped.
+			'{'    => '&#123;',
+			'['    => '&#91;',
+			"''"   => '&#39;&#39;',
+			'ISBN' => '&#73;SBN',
+			'RFC'  => '&#82;FC',
+			'PMID' => '&#80;MID',
+			'|'    => '&#124;',
+			'__'   => '&#95;_',
+		) );
+
+		# Stupid hack
+		$encValue = preg_replace_callback(
+			'/(' . wfUrlProtocols() . ')/',
+			array( 'Sanitizer', 'armorLinksCallback' ),
+			$encValue );
+		return $encValue;
+	}
+
+	/**
+	 * Escape a value so it can be used in an id attribute. The value is
+	 * only escaped, not validated (see the first link).
+	 *
+	 * Spaces become underscores, character references are decoded, the
+	 * result is URL-encoded, and then '%3A' is turned back into ':' and
+	 * every remaining '%' into '.'.
+	 *
+	 * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
+	 *                                                          in the id and
+	 *                                                          name attributes
+	 * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
+	 *
+	 * @bug 4461
+	 *
+	 * @static
+	 *
+	 * @param string $id
+	 * @return string
+	 */
+	static function escapeId( $id ) {
+		$underscored = strtr( $id, ' ', '_' );
+		$encoded = urlencode( Sanitizer::decodeCharReferences( $underscored ) );
+
+		// Order matters: restore the colon first, then rewrite what is left
+		// of the percent signs.
+		$encoded = str_replace( '%3A', ':', $encoded );
+		return str_replace( '%', '.', $encoded );
+	}
+
+	/**
+	 * Regex replace callback for armoring links against further processing.
+	 *
+	 * Rewrites every colon in the first capture group as the character
+	 * reference &#58;. Replacing ':' with ':' would be a no-op and leave
+	 * the match exactly as it was.
+	 *
+	 * @param array $matches match set; $matches[1] is the text to armor
+	 * @return string
+	 * @private
+	 */
+	private static function armorLinksCallback( $matches ) {
+		return str_replace( ':', '&#58;', $matches[1] );
+	}
+
/**
* Return an associative array of attribute names and values from
* a partial tag string. Attribute names are forced to lowercase,
* @param string
* @return array
*/
- function decodeTagAttributes( $text ) {
+ static function decodeTagAttributes( $text ) {
$attribs = array();
-
+
if( trim( $text ) == '' ) {
return $attribs;
}
-
+
+ $pairs = array();
if( !preg_match_all(
MW_ATTRIBS_REGEX,
$text,
foreach( $pairs as $set ) {
$attribute = strtolower( $set[1] );
$value = Sanitizer::getTagAttributeCallback( $set );
+
+ // Normalize whitespace
+ $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
+ $value = trim( $value );
+
+ // Decode character references
$attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
}
return $attribs;
}
-
+
/**
* Pick the appropriate attribute value from a match set from the
* MW_ATTRIBS_REGEX matches.
*
* @param array $set
* @return string
- * @access private
+ * @private
*/
- function getTagAttributeCallback( $set ) {
+ private static function getTagAttributeCallback( $set ) {
if( isset( $set[6] ) ) {
# Illegal #XXXXXX color with no quotes.
return $set[6];
# For 'reduced' form, return explicitly the attribute name here.
return $set[1];
} else {
- wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
+ throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
}
-
+
/**
* Normalize whitespace and character references in an XML source-
* encoded text for an attribute value.
*
* @param string $text
* @return string
- * @access private
+ * @private
*/
- function normalizeAttributeValue( $text ) {
+ private static function normalizeAttributeValue( $text ) {
return str_replace( '"', '"',
preg_replace(
'/\r\n|[\x20\x0d\x0a\x09]/',
' ',
Sanitizer::normalizeCharReferences( $text ) ) );
}
-
+
/**
* Ensure that any entities and character references are legal
* for XML and XHTML specifically. Any stray bits will be
*
* @param string $text
* @return string
- * @access private
+ * @private
*/
- function normalizeCharReferences( $text ) {
+ static function normalizeCharReferences( $text ) {
return preg_replace_callback(
MW_CHAR_REFS_REGEX,
array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
* @param string $matches
* @return string
*/
- function normalizeCharReferencesCallback( $matches ) {
+ static function normalizeCharReferencesCallback( $matches ) {
$ret = null;
if( $matches[1] != '' ) {
$ret = Sanitizer::normalizeEntity( $matches[1] );
return $ret;
}
}
-
+
/**
* If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
* return the named entity reference as is. Otherwise, returns
*
* @param string $name
* @return string
+ * @static
*/
- function normalizeEntity( $name ) {
+ static function normalizeEntity( $name ) {
global $wgHtmlEntities;
if( isset( $wgHtmlEntities[$name] ) ) {
return "&$name;";
return "&$name;";
}
}
-
- function decCharReference( $codepoint ) {
- $point = IntVal( $codepoint );
+
+ static function decCharReference( $codepoint ) {
+ $point = intval( $codepoint );
if( Sanitizer::validateCodepoint( $point ) ) {
return sprintf( '&#%d;', $point );
} else {
return null;
}
}
-
- function hexCharReference( $codepoint ) {
+
+ static function hexCharReference( $codepoint ) {
$point = hexdec( $codepoint );
if( Sanitizer::validateCodepoint( $point ) ) {
return sprintf( '&#x%x;', $point );
return null;
}
}
-
+
/**
* Returns true if a given Unicode codepoint is a valid character in XML.
* @param int $codepoint
* @return bool
*/
- function validateCodepoint( $codepoint ) {
+ private static function validateCodepoint( $codepoint ) {
return ($codepoint == 0x09)
|| ($codepoint == 0x0a)
|| ($codepoint == 0x0d)
*
* @param string $text
* @return string
- * @access public
+ * @public
+ * @static
*/
- function decodeCharReferences( $text ) {
+ public static function decodeCharReferences( $text ) {
return preg_replace_callback(
MW_CHAR_REFS_REGEX,
array( 'Sanitizer', 'decodeCharReferencesCallback' ),
$text );
}
-
+
/**
* @param string $matches
* @return string
*/
- function decodeCharReferencesCallback( $matches ) {
+ static function decodeCharReferencesCallback( $matches ) {
if( $matches[1] != '' ) {
return Sanitizer::decodeEntity( $matches[1] );
} elseif( $matches[2] != '' ) {
# Last case should be an ampersand by itself
return $matches[0];
}
-
+
/**
* Return UTF-8 string for a codepoint if that is a valid
* character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
* @param int $codepoint
* @return string
- * @access private
+ * @private
*/
- function decodeChar( $codepoint ) {
+ static function decodeChar( $codepoint ) {
if( Sanitizer::validateCodepoint( $codepoint ) ) {
return codepointToUtf8( $codepoint );
} else {
return UTF8_REPLACEMENT;
}
}
-
+
/**
* If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
* return the UTF-8 encoding of that character. Otherwise, returns
* @param string $name
* @return string
*/
- function decodeEntity( $name ) {
+ static function decodeEntity( $name ) {
global $wgHtmlEntities;
if( isset( $wgHtmlEntities[$name] ) ) {
return codepointToUtf8( $wgHtmlEntities[$name] );
return "&$name;";
}
}
-
+
/**
* Fetch the whitelist of acceptable attributes for a given
* element name.
* @param string $element
* @return array
*/
- function attributeWhitelist( $element ) {
+ static function attributeWhitelist( $element ) {
static $list;
if( !isset( $list ) ) {
$list = Sanitizer::setupAttributeWhitelist();
? $list[$element]
: array();
}
-
+
/**
+ * @todo Document it a bit
* @return array
*/
- function setupAttributeWhitelist() {
+ static function setupAttributeWhitelist() {
$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
$block = array_merge( $common, array( 'align' ) );
$tablealign = array( 'align', 'char', 'charoff', 'valign' );
'height', # deprecated
'bgcolor' # deprecated
);
-
+
# Numbers refer to sections in HTML 4.01 standard describing the element.
# See: http://www.w3.org/TR/html4/
$whitelist = array (
'div' => $block,
'center' => $common, # deprecated
'span' => $block, # ??
-
+
# 7.5.5
'h1' => $block,
'h2' => $block,
'h4' => $block,
'h5' => $block,
'h6' => $block,
-
+
# 7.5.6
# address
-
+
# 8.2.4
# bdo
-
+
# 9.2.1
'em' => $common,
'strong' => $common,
'var' => $common,
# abbr
# acronym
-
+
# 9.2.2
'blockquote' => array_merge( $common, array( 'cite' ) ),
# q
-
+
# 9.2.3
'sub' => $common,
'sup' => $common,
-
+
# 9.3.1
'p' => $block,
-
+
# 9.3.2
'br' => array( 'id', 'class', 'title', 'style', 'clear' ),
-
+
# 9.3.4
'pre' => array_merge( $common, array( 'width' ) ),
-
+
# 9.4
'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),
'del' => array_merge( $common, array( 'cite', 'datetime' ) ),
-
+
# 10.2
'ul' => array_merge( $common, array( 'type' ) ),
'ol' => array_merge( $common, array( 'type', 'start' ) ),
'li' => array_merge( $common, array( 'type', 'value' ) ),
-
+
# 10.3
'dl' => $common,
'dd' => $common,
'dt' => $common,
-
+
# 11.2.1
'table' => array_merge( $common,
array( 'summary', 'width', 'border', 'frame',
- 'rules', 'cellspacing', 'cellpadding',
- 'align', 'bgcolor', 'frame', 'rules',
- 'border' ) ),
-
+ 'rules', 'cellspacing', 'cellpadding',
+ 'align', 'bgcolor',
+ ) ),
+
# 11.2.2
'caption' => array_merge( $common, array( 'align' ) ),
-
+
# 11.2.3
'thead' => array_merge( $common, $tablealign ),
'tfoot' => array_merge( $common, $tablealign ),
'tbody' => array_merge( $common, $tablealign ),
-
+
# 11.2.4
'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
-
+
# 11.2.5
'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),
-
+
# 11.2.6
'td' => array_merge( $common, $tablecell, $tablealign ),
'th' => array_merge( $common, $tablecell, $tablealign ),
-
+
# 15.2.1
'tt' => $common,
'b' => $common,
'strike' => $common,
's' => $common,
'u' => $common,
-
+
# 15.2.2
'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
# basefont
-
+
# 15.3
'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
-
+
# XHTML Ruby annotation text module, simple ruby only.
# http://www.w3c.org/TR/ruby/
'ruby' => $common,
);
return $whitelist;
}
-
+
/**
* Take a fragment of (potentially invalid) HTML and return
* a version with any tags removed, encoded suitably for literal
* @param string $text HTML fragment
* @return string
*/
- function stripAllTags( $text ) {
+ static function stripAllTags( $text ) {
# Actual <tags>
- $text = preg_replace( '/<[^>]*>/', '', $text );
-
+ $text = preg_replace( '/ < .*? > /x', '', $text );
+
# Normalize &entities and whitespace
$text = Sanitizer::normalizeAttributeValue( $text );
-
+
# Will be placed into "double-quoted" attributes,
# make sure remaining bits are safe.
$text = str_replace(
array('<', '>', '"'),
array('<', '>', '"'),
$text );
-
+
return $text;
}
+	/**
+	 * Build a private DOCTYPE carrying one ENTITY declaration per entry of
+	 * $wgHtmlEntities.
+	 * PHP 4 seemed to know these if you gave it an HTML doctype, but
+	 * PHP 5.1 doesn't.
+	 *
+	 * Use for passing XHTML fragments to PHP's XML parsing functions
+	 *
+	 * @return string
+	 * @static
+	 */
+	static function hackDocType() {
+		global $wgHtmlEntities;
+
+		// One declaration per named entity, mapped to its numeric reference
+		$declarations = array();
+		foreach( $wgHtmlEntities as $name => $number ) {
+			$declarations[] = "<!ENTITY $name \"&#$number;\">";
+		}
+
+		return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
+	}
+
+	/**
+	 * Normalize a URL: decode character references, percent-encode
+	 * brackets, angle brackets, double quotes, control characters and
+	 * spaces, and strip characters that are ignored in IDNs from the
+	 * host portion.
+	 *
+	 * @param string $url
+	 * @param bool $hostname not read by this method; kept so existing
+	 *                       callers keep working
+	 * @return string
+	 */
+	static function cleanUrl( $url, $hostname=true ) {
+		# Normalize any HTML entities in input. They will be
+		# re-escaped by makeExternalLink().
+		$url = Sanitizer::decodeCharReferences( $url );
+
+		# Escape any control characters introduced by the above step.
+		# Done with a callback rather than the /e (eval) modifier: /e is
+		# removed in PHP 7, and its implicit addslashes() turned '"' into
+		# %5C%22 and NUL into %5C0 instead of %22 and %00.
+		$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F]/',
+			array( 'Sanitizer', 'cleanUrlCallback' ), $url );
+
+		# Validate hostname portion
+		$matches = array();
+		if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
+			list( $whole, $protocol, $host, $rest ) = $matches;
+
+			// Characters that will be ignored in IDNs.
+			// http://tools.ietf.org/html/rfc3454#section-3.1
+			// Strip them before further processing so blacklists and such work.
+			$strip = "/
+				\\s| # general whitespace
+				\xc2\xad| # 00ad SOFT HYPHEN
+				\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
+				\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
+				\xe2\x81\xa0| # 2060 WORD JOINER
+				\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
+				\xcd\x8f| # 034f COMBINING GRAPHEME JOINER
+				\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
+				\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
+				\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
+				\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
+				\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
+				[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
+				/xuD";
+
+			$host = preg_replace( $strip, '', $host );
+
+			// @fixme: validate hostnames here
+
+			return $protocol . $host . $rest;
+		} else {
+			return $url;
+		}
+	}
+
+	/**
+	 * Regex replace callback percent-encoding one matched character.
+	 * @param array $matches $matches[0] is the character to encode
+	 * @return string
+	 * @private
+	 */
+	private static function cleanUrlCallback( $matches ) {
+		return urlencode( $matches[0] );
+	}
+
}
?>