/**
* XHTML sanitizer for MediaWiki
*
- * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
+ * Copyright © 2002-2005 Brion Vibber <brion@pobox.com> et al
* http://www.mediawiki.org/
*
* This program is free software; you can redistribute it and/or modify
define( 'MW_CHAR_REFS_REGEX',
'/&([A-Za-z0-9\x80-\xff]+);
|&\#([0-9]+);
- |&\#x([0-9A-Za-z]+);
- |&\#X([0-9A-Za-z]+);
+ |&\#[xX]([0-9A-Fa-f]+);
|(&)/x' );
/**
* Allows some... latitude.
* Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
*/
-$attrib = '[A-Za-z0-9]';
+$attribFirst = '[:A-Z_a-z0-9]';
+$attrib = '[:A-Z_a-z-.0-9]';
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
- "/(?:^|$space)((?:xml:|xmlns:)?$attrib+)
+ "/(?:^|$space)({$attribFirst}{$attrib}*)
($space*=$space*
(?:
# The attribute value: quoted or alone
'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
'strike', 'strong', 'tt', 'var', 'div', 'center',
'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
- 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u', 'abbr'
+ 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'abbr', 'dfn',
+ 'kbd', 'samp'
);
$htmlsingle = array(
'br', 'hr', 'li', 'dt', 'dd'
* @todo Check for unique id attribute :P
*/
static function validateAttributes( $attribs, $whitelist ) {
- global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
+ global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5;
$whitelist = array_flip( $whitelist );
$hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
continue;
}
- if( !isset( $whitelist[$attribute] ) ) {
+ # Allow any attribute beginning with "data-", if in HTML5 mode
+ if ( !($wgHtml5 && preg_match( '/^data-/i', $attribute )) && !isset( $whitelist[$attribute] ) ) {
continue;
}
# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
if( $attribute == 'style' ) {
$value = Sanitizer::checkCss( $value );
- if( $value === false ) {
- # haxx0r
- continue;
- }
}
if ( $attribute === 'id' ) {
}
if ( $wgAllowMicrodataAttributes ) {
- # There are some complicated validity constraints we need to
- # enforce here. First of all, we don't want to allow non-standard
- # itemtypes.
- $allowedTypes = array(
- 'http://microformats.org/profile/hcard',
- 'http://microformats.org/profile/hcalendar#vevent',
- 'http://n.whatwg.org/work',
- );
- if ( isset( $out['itemtype'] ) && !in_array( $out['itemtype'],
- $allowedTypes ) ) {
- # Kill everything
- unset( $out['itemscope'] );
- }
# itemtype, itemid, itemref don't make sense without itemscope
if ( !array_key_exists( 'itemscope', $out ) ) {
unset( $out['itemtype'] );
// Remove any comments; IE gets token splitting wrong
$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
+ // Remove anything after a comment-start token, to guard against
+ // incorrect client implementations.
+ $commentPos = strpos( $value, '/*' );
+ if ( $commentPos !== false ) {
+ $value = substr( $value, 0, $commentPos );
+ }
+
// Decode escape sequences and line continuation
- // See the grammar in the CSS 2 spec, appendix D, Mozilla implements it accurately.
- // IE 8 doesn't implement it at all, but there's no way to introduce url() into
- // IE that doesn't hit Mozilla also.
+ // See the grammar in the CSS 2 spec, appendix D.
static $decodeRegex;
if ( !$decodeRegex ) {
$space = '[\\x20\\t\\r\\n\\f]';
(?:
($nl) | # 1. Line continuation
([0-9A-Fa-f]{1,6})$space? | # 2. character number
- (.) # 3. backslash cancelling special meaning
+ (.) | # 3. backslash cancelling special meaning
+ () | # 4. backslash at end of string
)/xu";
}
- $decoded = preg_replace_callback( $decodeRegex,
+ $value = preg_replace_callback( $decodeRegex,
array( __CLASS__, 'cssDecodeCallback' ), $value );
- if ( preg_match( '!expression|https?://|url\s*\(!i', $decoded ) ) {
- // Not allowed
- return false;
- } else {
- // Allowed, return CSS with comments stripped
- return $value;
+
+ // Reject problematic keywords and control characters
+ if ( preg_match( '/[\000-\010\016-\037\177]/', $value ) ) {
+ return '/* invalid control char */';
+ } elseif ( preg_match( '! expression | filter\s*: | accelerator\s*: | url\s*\( !ix', $value ) ) {
+ return '/* insecure input */';
}
+ return $value;
}
static function cssDecodeCallback( $matches ) {
if ( $matches[1] !== '' ) {
+ // Line continuation
return '';
} elseif ( $matches[2] !== '' ) {
- return codepointToUtf8( hexdec( $matches[2] ) );
+ $char = codepointToUtf8( hexdec( $matches[2] ) );
} elseif ( $matches[3] !== '' ) {
- return $matches[3];
+ $char = $matches[3];
} else {
- throw new MWException( __METHOD__.': invalid match' );
+ $char = '\\';
+ }
+ if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) {
+ // These characters need to be escaped in strings
+ // Clean up the escape sequence to avoid parsing errors by clients
+ return '\\' . dechex( ord( $char ) ) . ' ';
+ } else {
+ // Decode unnecessary escape
+ return $char;
}
}
*
* To ensure we don't have to bother escaping anything, we also strip ', ",
* & even if $wgExperimentalIds is true. TODO: Is this the best tactic?
- * We also strip # because it upsets IE6.
+ * We also strip # because it upsets IE, and % because it could be
+ * ambiguous if it's part of something that looks like a percent escape
+ * (which don't work reliably in fragments cross-browser).
*
* @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
* in the id and
if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
$id = Sanitizer::decodeCharReferences( $id );
- $id = preg_replace( '/[ \t\n\r\f_\'"&#]+/', '_', $id );
+ $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
$id = trim( $id, '_' );
if ( $id === '' ) {
# Must have been all whitespace to start with.
/**
* Given HTML input, escape with htmlspecialchars but un-escape entities.
- * This allows (generally harmless) entities like &nbsp; to survive.
+ * This allows (generally harmless) entities like &#160; to survive.
*
* @param $html String to escape
* @return String: escaped input
*/
static function escapeHtmlAllowEntities( $html ) {
+ $html = Sanitizer::decodeCharReferences( $html );
# It seems wise to escape ' as well as ", as a matter of course. Can't
# hurt.
$html = htmlspecialchars( $html, ENT_QUOTES );
- $html = str_replace( '&', '&', $html );
- $html = Sanitizer::normalizeCharReferences( $html );
return $html;
}
$text );
}
+ /**
+ * Normalizes whitespace in a section name, such as might be returned
+ * by Parser::stripSectionName(), for use in the ids that are used for
+ * section links.
+ *
+ * @param $section String
+ * @return String
+ */
+ static function normalizeSectionNameWhitespace( $section ) {
+ return trim( preg_replace( '/[ _]+/', ' ', $section ) );
+ }
+
/**
* Ensure that any entities and character references are legal
* for XML and XHTML specifically. Any stray bits will be
* &-escaped to result in a valid text fragment.
*
- * a. any named char refs must be known in XHTML
+ * a. named char refs can only be &lt; &gt; &amp; &quot;, others are
+ * numericized (this way we're well-formed even without a DTD)
* b. any numeric char refs must be legal chars, not invalid or forbidden
* c. use &#x, not &#X
* d. fix or reject non-valid attributes
$ret = Sanitizer::decCharReference( $matches[2] );
} elseif( $matches[3] != '' ) {
$ret = Sanitizer::hexCharReference( $matches[3] );
- } elseif( $matches[4] != '' ) {
- $ret = Sanitizer::hexCharReference( $matches[4] );
}
if( is_null( $ret ) ) {
return htmlspecialchars( $matches[0] );
/**
* If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
- * return the named entity reference as is. If the entity is a
- * MediaWiki-specific alias, returns the HTML equivalent. Otherwise,
- * returns HTML-escaped text of pseudo-entity source (eg &amp;foo;)
+ * return the equivalent numeric entity reference (except for the core &lt;
+ * &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
+ * the HTML equivalent. Otherwise, returns HTML-escaped text of
+ * pseudo-entity source (eg &amp;foo;)
*
* @param $name String
* @return String
global $wgHtmlEntities, $wgHtmlEntityAliases;
if ( isset( $wgHtmlEntityAliases[$name] ) ) {
return "&{$wgHtmlEntityAliases[$name]};";
- } elseif( isset( $wgHtmlEntities[$name] ) ) {
+ } elseif ( in_array( $name,
+ array( 'lt', 'gt', 'amp', 'quot' ) ) ) {
return "&$name;";
+ } elseif ( isset( $wgHtmlEntities[$name] ) ) {
+ return "&#{$wgHtmlEntities[$name]};";
} else {
return "&$name;";
}
return Sanitizer::decodeChar( intval( $matches[2] ) );
} elseif( $matches[3] != '' ) {
return Sanitizer::decodeChar( hexdec( $matches[3] ) );
- } elseif( $matches[4] != '' ) {
- return Sanitizer::decodeChar( hexdec( $matches[4] ) );
}
# Last case should be an ampersand by itself
return $matches[0];
'em' => $common,
'strong' => $common,
'cite' => $common,
- # dfn
+ 'dfn' => $common,
'code' => $common,
- # samp
- # kbd
+ 'samp' => $common,
+ 'kbd' => $common,
'var' => $common,
'abbr' => $common,
# acronym