X-Git-Url: https://git.cyclocoop.org/?a=blobdiff_plain;f=includes%2FSanitizer.php;h=fb386102e6fa4a570db2e807e790f457886064f8;hb=fea110db5bd2339222f8d2ce1514b3de3c2bdde6;hp=27b17ce9f958c37ab1766579dec1a3a56c492e35;hpb=7bb50c630a6b760c0cdc7662c44f8c3607954a19;p=lhc%2Fweb%2Fwiklou.git diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index 27b17ce9f9..fb386102e6 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -33,18 +33,27 @@ class Sanitizer { * Regular expression to match various types of character references in * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences */ - const CHAR_REFS_REGEX = + const CHAR_REFS_REGEX = '/&([A-Za-z0-9\x80-\xff]+); |&\#([0-9]+); |&\#[xX]([0-9A-Fa-f]+); |(&)/x'; + /** + * Blacklist for evil uris like javascript: + * WARNING: DO NOT use this in any place that actually requires blacklisting + * for security reasons. There are NUMEROUS[1] ways to bypass blacklisting, the + * only way to be secure from javascript: uri based xss vectors is to whitelist + * things that you know are safe and deny everything else. + * [1]: http://ha.ckers.org/xss.html + */ const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i'; const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/"; /** * List of all named character entities defined in HTML 4.01 * http://www.w3.org/TR/html4/sgml/entities.html + * As well as &apos; which is only defined starting in XHTML1. * @private */ static $htmlEntities = array( @@ -63,6 +72,7 @@ class Sanitizer { 'amp' => 38, 'and' => 8743, 'ang' => 8736, + 'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE. 
'Aring' => 197, 'aring' => 229, 'asymp' => 8776, @@ -325,7 +335,7 @@ class Sanitizer { $attribFirst = '[:A-Z_a-z0-9]'; $attrib = '[:A-Z_a-z-.0-9]'; $space = '[\x09\x0a\x0d\x20]'; - self::$attribsRegex = + self::$attribsRegex = "/(?:^|$space)({$attribFirst}{$attrib}*) ($space*=$space* (?: @@ -447,16 +457,26 @@ class Sanitizer { # and see if we find a match below them $optstack = array(); array_push( $optstack, $ot ); - $ot = @array_pop( $tagstack ); + wfSuppressWarnings(); + $ot = array_pop( $tagstack ); + wfRestoreWarnings(); while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) { array_push( $optstack, $ot ); - $ot = @array_pop( $tagstack ); + wfSuppressWarnings(); + $ot = array_pop( $tagstack ); + wfRestoreWarnings(); } if ( $t != $ot ) { # No match. Push the optional elements back again $badtag = true; - while ( $ot = @array_pop( $optstack ) ) { + wfSuppressWarnings(); + $ot = array_pop( $optstack ); + wfRestoreWarnings(); + while ( $ot ) { array_push( $tagstack, $ot ); + wfSuppressWarnings(); + $ot = array_pop( $optstack ); + wfRestoreWarnings(); } } } else { @@ -592,6 +612,102 @@ class Sanitizer { return $text; } + /** + * Take an array of attribute names and values and fix some deprecated values + * for the given element type. + * This does not validate properties, so you should ensure that you call + * validateTagAttributes AFTER this to ensure that the resulting style rule + * this may add is safe. 
+ * + * - Converts most presentational attributes like align into inline css + * + * @param $attribs Array + * @param $element String + * @return Array + */ + static function fixDeprecatedAttributes( $attribs, $element ) { + global $wgHtml5, $wgCleanupPresentationalAttributes; + + // presentational attributes were removed from html5, we can leave them + // in when html5 is turned off + if ( !$wgHtml5 || !$wgCleanupPresentationalAttributes ) { + return $attribs; + } + + $table = array( 'table' ); + $cells = array( 'td', 'th' ); + $colls = array( 'col', 'colgroup' ); + $tblocks = array( 'tbody', 'tfoot', 'thead' ); + $h = array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ); + + $presentationalAttribs = array( + 'align' => array( 'text-align', array_merge( array( 'caption', 'hr', 'div', 'p', 'tr' ), $table, $cells, $colls, $tblocks, $h ) ), + 'clear' => array( 'clear', array( 'br' ) ), + 'height' => array( 'height', $cells ), + 'nowrap' => array( 'white-space', $cells ), + 'size' => array( 'height', array( 'hr' ) ), + 'type' => array( 'list-style-type', array( 'li', 'ol', 'ul' ) ), + 'valign' => array( 'vertical-align', array_merge( $cells, $colls, $tblocks ) ), + 'width' => array( 'width', array_merge( array( 'hr', 'pre' ), $table, $cells, $colls ) ), + ); + + // Ensure that any upper case or mixed case attributes are converted to lowercase + foreach ( $attribs as $attribute => $value ) { + if ( $attribute !== strtolower( $attribute ) && array_key_exists( strtolower( $attribute ), $presentationalAttribs ) ) { + $attribs[strtolower( $attribute )] = $value; + unset( $attribs[$attribute] ); + } + } + + $style = ""; + foreach ( $presentationalAttribs as $attribute => $info ) { + list( $property, $elements ) = $info; + + // Skip if this attribute is not relevant to this element + if ( !in_array( $element, $elements ) ) { + continue; + } + + // Skip if the attribute is not used + if ( !array_key_exists( $attribute, $attribs ) ) { + continue; + } + + $value = $attribs[$attribute]; 
+ + // For nowrap the value should be nowrap instead of whatever text is in the value + if ( $attribute === 'nowrap' ) { + $value = 'nowrap'; + } + + // clear="all" is clear: both; in css + if ( $attribute === 'clear' && strtolower( $value ) === 'all' ) { + $value = 'both'; + } + + // Size based properties should have px applied to them if they have no unit + if ( in_array( $attribute, array( 'height', 'width', 'size' ) ) ) { + if ( preg_match( '/^[\d.]+$/', $value ) ) { + $value = "{$value}px"; + } + } + + $style .= " $property: $value;"; + + unset( $attribs[$attribute] ); + } + + if ( $style ) { + // Prepend our style rules so that they can be overridden by user css + if ( isset($attribs['style']) ) { + $style .= " " . $attribs['style']; + } + $attribs['style'] = trim($style); + } + + return $attribs; + } + /** * Take an array of attribute names and values and normalize or discard * illegal values for the given element type. @@ -660,7 +776,7 @@ class Sanitizer { } //RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity - if ( $attribute === 'rel' || $attribute === 'rev' || + if ( $attribute === 'rel' || $attribute === 'rev' || $attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa $attribute === 'datatype' || $attribute === 'typeof' || #RDFa $attribute === 'itemid' || $attribute === 'itemprop' || $attribute === 'itemref' || #HTML5 microdata @@ -668,7 +784,7 @@ class Sanitizer { //Paranoia. Allow "simple" values but suppress javascript if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) { - continue; + continue; } } @@ -722,34 +838,40 @@ class Sanitizer { /** * Pick apart some CSS and check it for forbidden or unsafe structures. - * Returns a sanitized string, or false if it was just too evil. + * Returns a sanitized string. This sanitized string will have + * character references and escape sequences decoded, and comments + * stripped. 
If the input is just too evil, only a comment complaining + * about evilness will be returned. * * Currently URL references, 'expression', 'tps' are forbidden. * + * NOTE: Despite the fact that character references are decoded, the + * returned string may contain character references given certain + * clever input strings. These character references must + * be escaped before the return value is embedded in HTML. + * * @param $value String - * @return Mixed + * @return String */ static function checkCss( $value ) { + // Decode character references like &#123; $value = Sanitizer::decodeCharReferences( $value ); - // Remove any comments; IE gets token splitting wrong - $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value ); - - // Remove anything after a comment-start token, to guard against - // incorrect client implementations. - $commentPos = strpos( $value, '/*' ); - if ( $commentPos !== false ) { - $value = substr( $value, 0, $commentPos ); - } - // Decode escape sequences and line continuation // See the grammar in the CSS 2 spec, appendix D. + // This has to be done AFTER decoding character references. + // This means it isn't possible for this function to return + // unsanitized escape sequences. It is possible to manufacture + // input that contains character references that decode to + // escape sequences that decode to character references, but + // it's OK for the return value to contain character references + // because the caller is supposed to escape those anyway. static $decodeRegex; if ( !$decodeRegex ) { $space = '[\\x20\\t\\r\\n\\f]'; $nl = '(?:\\n|\\r\\n|\\r|\\f)'; $backslash = '\\\\'; - $decodeRegex = "/ $backslash + $decodeRegex = "/ $backslash (?: ($nl) | # 1. Line continuation ([0-9A-Fa-f]{1,6})$space? | # 2. 
character number @@ -760,6 +882,21 @@ class Sanitizer { $value = preg_replace_callback( $decodeRegex, array( __CLASS__, 'cssDecodeCallback' ), $value ); + // Remove any comments; IE gets token splitting wrong + // This must be done AFTER decoding character references and + // escape sequences, because those steps can introduce comments + // This step cannot introduce character references or escape + // sequences, because it replaces comments with spaces rather + // than removing them completely. + $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value ); + + // Remove anything after a comment-start token, to guard against + // incorrect client implementations. + $commentPos = strpos( $value, '/*' ); + if ( $commentPos !== false ) { + $value = substr( $value, 0, $commentPos ); + } + // Reject problematic keywords and control characters if ( preg_match( '/[\000-\010\016-\037\177]/', $value ) ) { return '/* invalid control char */'; @@ -769,6 +906,10 @@ class Sanitizer { return $value; } + /** + * @param $matches array + * @return String + */ static function cssDecodeCallback( $matches ) { if ( $matches[1] !== '' ) { // Line continuation @@ -814,8 +955,9 @@ class Sanitizer { return ''; } - $stripped = Sanitizer::validateTagAttributes( - Sanitizer::decodeTagAttributes( $text ), $element ); + $decoded = Sanitizer::decodeTagAttributes( $text ); + $decoded = Sanitizer::fixDeprecatedAttributes( $decoded, $element ); + $stripped = Sanitizer::validateTagAttributes( $decoded, $element ); $attribs = array(); foreach( $stripped as $attribute => $value ) { @@ -1070,6 +1212,10 @@ class Sanitizer { Sanitizer::normalizeCharReferences( $text ) ) ); } + /** + * @param $text string + * @return mixed + */ private static function normalizeWhitespace( $text ) { return preg_replace( '/\r\n|[\x20\x0d\x0a\x09]/', @@ -1153,6 +1299,10 @@ class Sanitizer { } } + /** + * @param $codepoint + * @return null|string + */ static function decCharReference( $codepoint ) { $point = intval( 
$codepoint ); if( Sanitizer::validateCodepoint( $point ) ) { @@ -1162,6 +1312,10 @@ class Sanitizer { } } + /** + * @param $codepoint + * @return null|string + */ static function hexCharReference( $codepoint ) { $point = hexdec( $codepoint ); if( Sanitizer::validateCodepoint( $point ) ) { @@ -1259,7 +1413,7 @@ class Sanitizer { * return the UTF-8 encoding of that character. Otherwise, returns * pseudo-entity source (eg &foo;) * - * @param $name Strings + * @param $name String * @return String */ static function decodeEntity( $name ) { @@ -1302,7 +1456,7 @@ class Sanitizer { if ( $wgAllowRdfaAttributes ) { #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014 $common = array_merge( $common, array( - 'about', 'property', 'resource', 'datatype', 'typeof', + 'about', 'property', 'resource', 'datatype', 'typeof', ) ); } @@ -1419,7 +1573,7 @@ class Sanitizer { 'th' => array_merge( $common, $tablecell, $tablealign ), # 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object - 'a' => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa + 'a' => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa # 13.2 # Not usually allowed, but may be used for extension-style hooks @@ -1500,13 +1654,18 @@ class Sanitizer { return $out; } + /** + * @param $url string + * @return mixed|string + */ static function cleanUrl( $url ) { # Normalize any HTML entities in input. They will be # re-escaped by makeExternalLink(). 
$url = Sanitizer::decodeCharReferences( $url ); # Escape any control characters introduced by the above step - $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F\|]/e', "urlencode('\\0')", $url ); + $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/', + array( __CLASS__, 'cleanUrlCallback' ), $url ); # Validate hostname portion $matches = array(); @@ -1534,7 +1693,7 @@ class Sanitizer { $host = preg_replace( $strip, '', $host ); - // @todo Fixme: validate hostnames here + // @todo FIXME: Validate hostnames here return $protocol . $host . $rest; } else { @@ -1542,4 +1701,63 @@ class Sanitizer { } } + /** + * @param $matches array + * @return string + */ + static function cleanUrlCallback( $matches ) { + return urlencode( $matches[0] ); + } + + /** + * Does a string look like an e-mail address? + * + * This validates an email address using an HTML5 specification found at: + * http://www.whatwg.org/specs/web-apps/current-work/multipage/states-of-the-type-attribute.html#valid-e-mail-address + * Which as of 2011-01-24 says: + * + * A valid e-mail address is a string that matches the ABNF production + * 1*( atext / "." ) "@" ldh-str *( "." ldh-str ) where atext is defined + * in RFC 5322 section 3.2.3, and ldh-str is defined in RFC 1034 section + * 3.5. + * + * This function is an implementation of the specification as requested in + * bug 22449. + * + * Client-side forms will use the same standard validation rules via JS or + * HTML 5 validation; additional restrictions can be enforced server-side + * by extensions via the 'isValidEmailAddr' hook. + * + * Note that this validation doesn't 100% match RFC 2822, but is believed + * to be liberal enough for wide use. Some invalid addresses will still + * pass validation here. 
+ * + * @since 1.18 + * + * @param $addr String E-mail address + * @return Bool + */ + public static function validateEmail( $addr ) { + $result = null; + if( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) { + return $result; + } + + // Please note strings below are enclosed in brackets [], this make the + // hyphen "-" a range indicator. Hence it is double backslashed below. + // See bug 26948 + $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~" ; + $rfc1034_ldh_str = "a-z0-9\\-" ; + + $HTML5_email_regexp = "/ + ^ # start of string + [$rfc5322_atext\\.]+ # user part which is liberal :p + @ # 'apostrophe' + [$rfc1034_ldh_str]+ # First domain part + (\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot + $ # End of string + /ix" ; // case Insensitive, eXtended + + return (bool) preg_match( $HTML5_email_regexp, $addr ); + } }