$attrib = '[A-Za-z0-9]';
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
- "/(?:^|$space)($attrib+)
+ "/(?:^|$space)((?:xml:|xmlns:)?$attrib+)
($space*=$space*
(?:
# The attribute value: quoted or alone
)
)?(?=$space|\$)/sx" );
+/**
+ * Regular expression to match URIs that could trigger script execution
+ */
+define( 'MW_EVIL_URI_PATTERN', '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i' );
+
+/**
+ * Regular expression to match namespace attributes
+ */
+define( 'MW_XMLNS_ATTRIBUTE_PATTRN', "/^xmlns:$attrib+$/" );
+
/**
* List of all named character entities defined in HTML 4.01
* http://www.w3.org/TR/html4/sgml/entities.html
if ( !$staticInitialised ) {
$htmlpairsStatic = array( # Tags that must be closed
- 'a', 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
+ 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
'strike', 'strong', 'tt', 'var', 'div', 'center',
'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
$text = Sanitizer::removeHTMLcomments( $text );
$bits = explode( '<', $text );
$text = str_replace( '>', '>', array_shift( $bits ) );
- if(!$wgUseTidy) {
+ if ( !$wgUseTidy ) {
$tagstack = $tablestack = array();
foreach ( $bits as $x ) {
$regs = array();
+ # $slash: Does the current element start with a '/'?
+ # $t: Current element name
+ # $params: String between element name and >
+ # $brace: Ending '>' or '/>'
+ # $rest: Everything until the next element of $bits
if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
} else {
$slash = $t = $params = $brace = $rest = null;
}
- $badtag = 0 ;
+ $badtag = false;
if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
# Check our stack
- if ( $slash ) {
- # Closing a tag...
- if( isset( $htmlsingleonly[$t] ) ) {
- $badtag = 1;
- } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
+ if ( $slash && isset( $htmlsingleonly[$t] ) ) {
+ $badtag = true;
+ } elseif ( $slash ) {
+ # Closing a tag... is it the one we just opened?
+ $ot = @array_pop( $tagstack );
+ if ( $ot != $t ) {
if ( isset( $htmlsingleallowed[$ot] ) ) {
# Pop all elements with an optional close tag
# and see if we find a match below them
$optstack = array();
- array_push ($optstack, $ot);
- while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
- isset( $htmlsingleallowed[$ot] ) )
- {
- array_push ($optstack, $ot);
+ array_push( $optstack, $ot );
+ $ot = @array_pop( $tagstack );
+ while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
+ array_push( $optstack, $ot );
+ $ot = @array_pop( $tagstack );
}
if ( $t != $ot ) {
- # No match. Push the optinal elements back again
- $badtag = 1;
+ # No match. Push the optional elements back again
+ $badtag = true;
while ( $ot = @array_pop( $optstack ) ) {
array_push( $tagstack, $ot );
}
} else {
@array_push( $tagstack, $ot );
# <li> can be nested in <ul> or <ol>, skip those cases:
- if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
- $badtag = 1;
+ if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
+ $badtag = true;
}
}
} else {
} else {
# Keep track for later
if ( isset( $tabletags[$t] ) &&
- ! in_array( 'table', $tagstack ) ) {
- $badtag = 1;
- } else if ( in_array( $t, $tagstack ) &&
- ! isset( $htmlnest [$t ] ) ) {
- $badtag = 1 ;
+ !in_array( 'table', $tagstack ) ) {
+ $badtag = true;
+ } elseif ( in_array( $t, $tagstack ) &&
+ !isset( $htmlnest [$t ] ) ) {
+ $badtag = true;
# Is it a self closed htmlpair ? (bug 5487)
- } else if( $brace == '/>' &&
+ } elseif ( $brace == '/>' &&
isset( $htmlpairs[$t] ) ) {
- $badtag = 1;
- } elseif( isset( $htmlsingleonly[$t] ) ) {
+ $badtag = true;
+ } elseif ( isset( $htmlsingleonly[$t] ) ) {
# Hack to force empty tag for uncloseable elements
$brace = '/>';
- } else if( isset( $htmlsingle[$t] ) ) {
+ } elseif ( isset( $htmlsingle[$t] ) ) {
# Hack to not close $htmlsingle tags
- $brace = NULL;
- } else if( isset( $tabletags[$t] )
- && in_array($t ,$tagstack) ) {
+ $brace = null;
+ } elseif ( isset( $tabletags[$t] )
+ && in_array( $t, $tagstack ) ) {
// New table tag but forgot to close the previous one
$text .= "</$t>";
} else {
# Strip non-approved attributes from the tag
$newparams = Sanitizer::fixTagAttributes( $params, $t );
}
- if ( ! $badtag ) {
+ if ( !$badtag ) {
$rest = str_replace( '>', '>', $rest );
$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
$text .= "<$slash$t$newparams$close>$rest";
* @todo Check for unique id attribute :P
*/
static function validateAttributes( $attribs, $whitelist ) {
+ global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
+
$whitelist = array_flip( $whitelist );
$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
$out = array();
foreach( $attribs as $attribute => $value ) {
+ #allow XML namespace declaration if RDFa is enabled
+ if ( $wgAllowRdfaAttributes && preg_match( MW_XMLNS_ATTRIBUTE_PATTRN, $attribute ) ) {
+ if ( !preg_match( MW_EVIL_URI_PATTERN, $value ) ) {
+ $out[$attribute] = $value;
+ }
+
+ continue;
+ }
+
if( !isset( $whitelist[$attribute] ) ) {
continue;
}
+
# Strip javascript "expression" from stylesheets.
# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
if( $attribute == 'style' ) {
$wgEnforceHtmlIds ? 'noninitial' : 'xml' );
}
+ //RDFa and microdata properties allow URLs, URIs and/or CURIEs. Check them for sanity
+ if ( $attribute === 'rel' || $attribute === 'rev' ||
+ $attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa
+ $attribute === 'datatype' || $attribute === 'typeof' || #RDFa
+ $attribute === 'itemid' || $attribute === 'itemprop' || $attribute === 'itemref' || #HTML5 microdata
+ $attribute === 'itemscope' || $attribute === 'itemtype' ) { #HTML5 microdata
+
+ //Paranoia. Allow "simple" values but suppress javascript
+ if ( preg_match( MW_EVIL_URI_PATTERN, $value ) ) {
+ continue;
+ }
+ }
+
+ # NOTE: even though elements using href/src are not allowed directly, supply
+ # validation code that can be used by tag hook handlers, etc
if ( $attribute === 'href' || $attribute === 'src' ) {
if ( !preg_match( $hrefExp, $value ) ) {
continue; //drop any href or src attributes not using an allowed protocol.
}
}
- //RDFa properties allow URIs. check them
- if ( $attribute === 'rel' || $attribute === 'rev' ||
- $attribute === 'about' || $attribute === 'property' || $attribute === 'resource' ||
- $attribute === 'datatype' || $attribute === 'typeof' ) {
- //Paranoia. Allow "simple" values but suppress javascript
- if ( preg_match( '/(^|\s)javascript\s*:/i', $value ) ) {
- continue;
- }
- }
-
// If this attribute was previously set, override it.
// Output should only have one attribute of each name.
$out[$attribute] = $value;
}
+
+ if ( $wgAllowMicrodataAttributes ) {
+ # There are some complicated validity constraints we need to
+ # enforce here. First of all, we don't want to allow non-standard
+ # itemtypes.
+ $allowedTypes = array(
+ 'http://microformats.org/profile/hcard',
+ 'http://microformats.org/profile/hcalendar#vevent',
+ 'http://n.whatwg.org/work',
+ );
+ if ( isset( $out['itemtype'] ) && !in_array( $out['itemtype'],
+ $allowedTypes ) ) {
+ # Kill everything
+ unset( $out['itemscope'] );
+ }
+ # itemtype, itemid, itemref don't make sense without itemscope
+ if ( !array_key_exists( 'itemscope', $out ) ) {
+ unset( $out['itemtype'] );
+ unset( $out['itemid'] );
+ unset( $out['itemref'] );
+ }
+ # TODO: Strip itemprop if we aren't descendants of an itemscope.
+ }
return $out;
}
* @return Array
*/
public static function decodeTagAttributes( $text ) {
- $attribs = array();
-
if( trim( $text ) == '' ) {
- return $attribs;
+ return array();
}
+ $attribs = array();
$pairs = array();
if( !preg_match_all(
MW_ATTRIBS_REGEX,
* @return Array
*/
static function setupAttributeWhitelist() {
- $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style',
- #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
- 'about', 'property', 'resource', 'datatype', 'typeof',
- );
+ global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
+
+ $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style', 'xml:lang' );
+
+ if ( $wgAllowRdfaAttributes ) {
+ #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
+ $common = array_merge( $common, array(
+ 'about', 'property', 'resource', 'datatype', 'typeof',
+ ) );
+ }
+
+ if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
+ # add HTML5 microdata tags as specified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
+ $common = array_merge( $common, array(
+ 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
+ ) );
+ }
$block = array_merge( $common, array( 'align' ) );
$tablealign = array( 'align', 'char', 'charoff', 'valign' );
'td' => array_merge( $common, $tablecell, $tablealign ),
'th' => array_merge( $common, $tablecell, $tablealign ),
- # 12.2
+ # 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object
'a' => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa
# 13.2
$host = preg_replace( $strip, '', $host );
- // @fixme: validate hostnames here
+ // @todo Fixme: validate hostnames here
return $protocol . $host . $rest;
} else {