X-Git-Url: https://git.cyclocoop.org/?a=blobdiff_plain;f=includes%2FSanitizer.php;h=fb386102e6fa4a570db2e807e790f457886064f8;hb=fea110db5bd2339222f8d2ce1514b3de3c2bdde6;hp=27b17ce9f958c37ab1766579dec1a3a56c492e35;hpb=7bb50c630a6b760c0cdc7662c44f8c3607954a19;p=lhc%2Fweb%2Fwiklou.git

diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index 27b17ce9f9..fb386102e6 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -33,18 +33,27 @@ class Sanitizer {
 	 * Regular expression to match various types of character references in
 	 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
 	 */
-	const CHAR_REFS_REGEX = 
+	const CHAR_REFS_REGEX =
 		'/&([A-Za-z0-9\x80-\xff]+);
 		 |&\#([0-9]+);
 		 |&\#[xX]([0-9A-Fa-f]+);
 		 |(&)/x';
 
+	/**
+	 * Blacklist for evil URIs like javascript:
+	 * WARNING: DO NOT use this in any place that actually requires blacklisting
+	 * for security reasons. There are NUMEROUS[1] ways to bypass blacklisting; the
+	 * only way to be secure from javascript: URI-based XSS vectors is to whitelist
+	 * things that you know are safe and deny everything else.
+	 * [1]: http://ha.ckers.org/xss.html
+	 */
 	const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
 	const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
 
 	/**
 	 * List of all named character entities defined in HTML 4.01
 	 * http://www.w3.org/TR/html4/sgml/entities.html
+	 * As well as &apos; which is only defined starting in XHTML1.
 	 * @private
 	 */
 	static $htmlEntities = array(
@@ -63,6 +72,7 @@ class Sanitizer {
 		'amp'      => 38,
 		'and'      => 8743,
 		'ang'      => 8736,
+		'apos'     => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
 		'Aring'    => 197,
 		'aring'    => 229,
 		'asymp'    => 8776,
@@ -325,7 +335,7 @@ class Sanitizer {
 			$attribFirst = '[:A-Z_a-z0-9]';
 			$attrib = '[:A-Z_a-z-.0-9]';
 			$space = '[\x09\x0a\x0d\x20]';
-			self::$attribsRegex = 
+			self::$attribsRegex =
 				"/(?:^|$space)({$attribFirst}{$attrib}*)
 				  ($space*=$space*
 					(?:
@@ -447,16 +457,26 @@ class Sanitizer {
 								# and see if we find a match below them
 								$optstack = array();
 								array_push( $optstack, $ot );
-								$ot = @array_pop( $tagstack );
+								wfSuppressWarnings();
+								$ot = array_pop( $tagstack );
+								wfRestoreWarnings();
 								while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
 									array_push( $optstack, $ot );
-									$ot = @array_pop( $tagstack );
+									wfSuppressWarnings();
+									$ot = array_pop( $tagstack );
+									wfRestoreWarnings();
 								}
 								if ( $t != $ot ) {
 									# No match. Push the optional elements back again
 									$badtag = true;
-									while ( $ot = @array_pop( $optstack ) ) {
+									wfSuppressWarnings();
+									$ot = array_pop( $optstack );
+									wfRestoreWarnings();
+									while ( $ot ) {
 										array_push( $tagstack, $ot );
+										wfSuppressWarnings();
+										$ot = array_pop( $optstack );
+										wfRestoreWarnings();
 									}
 								}
 							} else {
@@ -592,6 +612,102 @@ class Sanitizer {
 		return $text;
 	}
 
+	/**
+	 * Take an array of attribute names and values and fix some deprecated values
+	 * for the given element type.
+	 * This does not validate properties, so you should ensure that you call
+	 * validateTagAttributes AFTER this to ensure that the resulting style rule
+	 * this may add is safe.
+	 *
+	 * - Converts most presentational attributes like align into inline css
+	 * - Maps HTML list numbering types (1, a, A, i, I) to CSS list-style-type keywords
+	 * @param $attribs Array Attribute name => value pairs
+	 * @param $element String Element name the attributes belong to
+	 * @return Array Attributes with the converted ones folded into 'style'
+	 */
+	static function fixDeprecatedAttributes( $attribs, $element ) {
+		global $wgHtml5, $wgCleanupPresentationalAttributes;
+
+		// presentational attributes were removed from html5, we can leave them
+		// in when html5 is turned off
+		if ( !$wgHtml5 || !$wgCleanupPresentationalAttributes ) {
+			return $attribs;
+		}
+
+		$table = array( 'table' );
+		$cells = array( 'td', 'th' );
+		$colls = array( 'col', 'colgroup' );
+		$tblocks = array( 'tbody', 'tfoot', 'thead' );
+		$h = array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' );
+		$listTypes = array( '1' => 'decimal', 'a' => 'lower-alpha', 'A' => 'upper-alpha', 'i' => 'lower-roman', 'I' => 'upper-roman' );
+
+		$presentationalAttribs = array(
+			'align' => array( 'text-align', array_merge( array( 'caption', 'hr', 'div', 'p', 'tr' ), $table, $cells, $colls, $tblocks, $h ) ),
+			'clear' => array( 'clear', array( 'br' ) ),
+			'height' => array( 'height', $cells ),
+			'nowrap' => array( 'white-space', $cells ),
+			'size' => array( 'height', array( 'hr' ) ),
+			'type' => array( 'list-style-type', array( 'li', 'ol', 'ul' ) ),
+			'valign' => array( 'vertical-align', array_merge( $cells, $colls, $tblocks ) ),
+			'width' => array( 'width', array_merge( array( 'hr', 'pre' ), $table, $cells, $colls ) ),
+		);
+
+		// Ensure that any upper case or mixed case attributes are converted to lowercase
+		foreach ( $attribs as $attribute => $value ) {
+			if ( $attribute !== strtolower( $attribute ) && array_key_exists( strtolower( $attribute ), $presentationalAttribs ) ) {
+				$attribs[strtolower( $attribute )] = $value;
+				unset( $attribs[$attribute] );
+			}
+		}
+
+		$style = "";
+		foreach ( $presentationalAttribs as $attribute => $info ) {
+			list( $property, $elements ) = $info;
+
+			// Skip if this attribute is not relevant to this element or is not used
+			if ( !in_array( $element, $elements ) || !array_key_exists( $attribute, $attribs ) ) {
+				continue;
+			}
+
+			$value = $attribs[$attribute];
+
+			// For nowrap the value should be nowrap instead of whatever text is in the value
+			if ( $attribute === 'nowrap' ) {
+				$value = 'nowrap';
+			}
+
+			// clear="all" is clear: both; in css
+			if ( $attribute === 'clear' && strtolower( $value ) === 'all' ) {
+				$value = 'both';
+			}
+
+			// type="a" etc. are not valid list-style-type values; use the CSS keyword
+			if ( $attribute === 'type' && isset( $listTypes[$value] ) ) {
+				$value = $listTypes[$value];
+			}
+
+			// Size based properties should have px applied to them if they have no unit
+			if ( in_array( $attribute, array( 'height', 'width', 'size' ) ) ) {
+				if ( preg_match( '/^[\d.]+$/', $value ) ) {
+					$value = "{$value}px";
+				}
+			}
+
+			$style .= " $property: $value;";
+			unset( $attribs[$attribute] );
+		}
+
+		if ( $style ) {
+			// Prepend our style rules so that they can be overridden by user css
+			if ( isset( $attribs['style'] ) ) {
+				$style .= " " . $attribs['style'];
+			}
+			$attribs['style'] = trim( $style );
+		}
+
+		return $attribs;
+	}
+
 	/**
 	 * Take an array of attribute names and values and normalize or discard
 	 * illegal values for the given element type.
@@ -660,7 +776,7 @@ class Sanitizer {
 			}
 
 			//RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity
-			if ( $attribute === 'rel' || $attribute === 'rev' || 
+			if ( $attribute === 'rel' || $attribute === 'rev' ||
 				$attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa
 				$attribute === 'datatype' || $attribute === 'typeof' ||                             #RDFa
 				$attribute === 'itemid' || $attribute === 'itemprop' || $attribute === 'itemref' || #HTML5 microdata
@@ -668,7 +784,7 @@ class Sanitizer {
 
 				//Paranoia. Allow "simple" values but suppress javascript
 				if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
-					continue; 
+					continue;
 				}
 			}
 
@@ -722,34 +838,40 @@ class Sanitizer {
 
 	/**
 	 * Pick apart some CSS and check it for forbidden or unsafe structures.
-	 * Returns a sanitized string, or false if it was just too evil.
+	 * Returns a sanitized string. This sanitized string will have
+	 * character references and escape sequences decoded, and comments
+	 * stripped. If the input is just too evil, only a comment complaining
+	 * about evilness will be returned.
 	 *
 	 * Currently URL references, 'expression', 'tps' are forbidden.
 	 *
+	 * NOTE: Despite the fact that character references are decoded, the
+	 * returned string may contain character references given certain
+	 * clever input strings. These character references must
+	 * be escaped before the return value is embedded in HTML.
+	 *
 	 * @param $value String
-	 * @return Mixed
+	 * @return String
 	 */
 	static function checkCss( $value ) {
+		// Decode character references like &#123;
 		$value = Sanitizer::decodeCharReferences( $value );
 
-		// Remove any comments; IE gets token splitting wrong
-		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
-
-		// Remove anything after a comment-start token, to guard against
-		// incorrect client implementations.
-		$commentPos = strpos( $value, '/*' );
-		if ( $commentPos !== false ) {
-			$value = substr( $value, 0, $commentPos );
-		}
-
 		// Decode escape sequences and line continuation
 		// See the grammar in the CSS 2 spec, appendix D.
+		// This has to be done AFTER decoding character references.
+		// This means it isn't possible for this function to return
+		// unsanitized escape sequences. It is possible to manufacture
+		// input that contains character references that decode to
+		// escape sequences that decode to character references, but
+		// it's OK for the return value to contain character references
+		// because the caller is supposed to escape those anyway.
 		static $decodeRegex;
 		if ( !$decodeRegex ) {
 			$space = '[\\x20\\t\\r\\n\\f]';
 			$nl = '(?:\\n|\\r\\n|\\r|\\f)';
 			$backslash = '\\\\';
-			$decodeRegex = "/ $backslash 
+			$decodeRegex = "/ $backslash
 				(?:
 					($nl) |  # 1. Line continuation
 					([0-9A-Fa-f]{1,6})$space? |  # 2. character number
@@ -760,6 +882,21 @@ class Sanitizer {
 		$value = preg_replace_callback( $decodeRegex,
 			array( __CLASS__, 'cssDecodeCallback' ), $value );
 
+		// Remove any comments; IE gets token splitting wrong
+		// This must be done AFTER decoding character references and
+		// escape sequences, because those steps can introduce comments
+		// This step cannot introduce character references or escape
+		// sequences, because it replaces comments with spaces rather
+		// than removing them completely.
+		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
+
+		// Remove anything after a comment-start token, to guard against
+		// incorrect client implementations.
+		$commentPos = strpos( $value, '/*' );
+		if ( $commentPos !== false ) {
+			$value = substr( $value, 0, $commentPos );
+		}
+
 		// Reject problematic keywords and control characters
 		if ( preg_match( '/[\000-\010\016-\037\177]/', $value ) ) {
 			return '/* invalid control char */';
@@ -769,6 +906,10 @@ class Sanitizer {
 		return $value;
 	}
 
+	/**
+	 * @param $matches array
+	 * @return String
+	 */
 	static function cssDecodeCallback( $matches ) {
 		if ( $matches[1] !== '' ) {
 			// Line continuation
@@ -814,8 +955,9 @@ class Sanitizer {
 			return '';
 		}
 
-		$stripped = Sanitizer::validateTagAttributes(
-			Sanitizer::decodeTagAttributes( $text ), $element );
+		$decoded = Sanitizer::decodeTagAttributes( $text );
+		$decoded = Sanitizer::fixDeprecatedAttributes( $decoded, $element );
+		$stripped = Sanitizer::validateTagAttributes( $decoded, $element );
 
 		$attribs = array();
 		foreach( $stripped as $attribute => $value ) {
@@ -1070,6 +1212,10 @@ class Sanitizer {
 				Sanitizer::normalizeCharReferences( $text ) ) );
 	}
 
+	/**
+	 * @param $text string
+	 * @return mixed
+	 */
 	private static function normalizeWhitespace( $text ) {
 		return preg_replace(
 			'/\r\n|[\x20\x0d\x0a\x09]/',
@@ -1153,6 +1299,10 @@ class Sanitizer {
 		}
 	}
 
+	/**
+	 * @param $codepoint
+	 * @return null|string
+	 */
 	static function decCharReference( $codepoint ) {
 		$point = intval( $codepoint );
 		if( Sanitizer::validateCodepoint( $point ) ) {
@@ -1162,6 +1312,10 @@ class Sanitizer {
 		}
 	}
 
+	/**
+	 * @param $codepoint
+	 * @return null|string
+	 */
 	static function hexCharReference( $codepoint ) {
 		$point = hexdec( $codepoint );
 		if( Sanitizer::validateCodepoint( $point ) ) {
@@ -1259,7 +1413,7 @@ class Sanitizer {
 	 * return the UTF-8 encoding of that character. Otherwise, returns
 	 * pseudo-entity source (eg &foo;)
 	 *
-	 * @param $name Strings
+	 * @param $name String
 	 * @return String
 	 */
 	static function decodeEntity( $name ) {
@@ -1302,7 +1456,7 @@ class Sanitizer {
 		if ( $wgAllowRdfaAttributes ) {
 			#RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
 			$common = array_merge( $common, array(
-			    'about', 'property', 'resource', 'datatype', 'typeof', 
+			    'about', 'property', 'resource', 'datatype', 'typeof',
 			) );
 		}
 
@@ -1419,7 +1573,7 @@ class Sanitizer {
 			'th'         => array_merge( $common, $tablecell, $tablealign ),
 
 			# 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object
-			'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa 
+			'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa
 
 			# 13.2
 			# Not usually allowed, but may be used for extension-style hooks
@@ -1500,13 +1654,18 @@ class Sanitizer {
 		return $out;
 	}
 
+	/**
+	 * @param $url string
+	 * @return mixed|string
+	 */
 	static function cleanUrl( $url ) {
 		# Normalize any HTML entities in input. They will be
 		# re-escaped by makeExternalLink().
 		$url = Sanitizer::decodeCharReferences( $url );
 
 		# Escape any control characters introduced by the above step
-		$url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F\|]/e', "urlencode('\\0')", $url );
+		$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
+			array( __CLASS__, 'cleanUrlCallback' ), $url );
 
 		# Validate hostname portion
 		$matches = array();
@@ -1534,7 +1693,7 @@ class Sanitizer {
 
 			$host = preg_replace( $strip, '', $host );
 
-			// @todo Fixme: validate hostnames here
+			// @todo FIXME: Validate hostnames here
 
 			return $protocol . $host . $rest;
 		} else {
@@ -1542,4 +1701,63 @@ class Sanitizer {
 		}
 	}
 
+	/**
+	 * @param $matches array Match array from preg_replace_callback(); only element 0 is used
+	 * @return string The matched text passed through urlencode()
+	 */
+	static function cleanUrlCallback( $matches ) {
+		return urlencode( $matches[0] );
+	}
+
+	/**
+	 * Does a string look like an e-mail address?
+	 *
+	 * This validates an email address using an HTML5 specification found at:
+	 * http://www.whatwg.org/specs/web-apps/current-work/multipage/states-of-the-type-attribute.html#valid-e-mail-address
+	 * Which as of 2011-01-24 says:
+	 *
+	 *   A valid e-mail address is a string that matches the ABNF production
+	 *   1*( atext / "." ) "@" ldh-str *( "." ldh-str ) where atext is defined
+	 *   in RFC 5322 section 3.2.3, and ldh-str is defined in RFC 1034 section
+	 *   3.5.
+	 *
+	 * This function is an implementation of the specification as requested in
+	 * bug 22449.
+	 *
+	 * Client-side forms will use the same standard validation rules via JS or
+	 * HTML 5 validation; additional restrictions can be enforced server-side
+	 * by extensions via the 'isValidEmailAddr' hook.
+	 *
+	 * Note that this validation doesn't 100% match RFC 2822, but is believed
+	 * to be liberal enough for wide use. Some invalid addresses will still
+	 * pass validation here.
+	 *
+	 * @since 1.18
+	 *
+	 * @param $addr String E-mail address
+	 * @return Bool Whether $addr is valid (or the hook-supplied $result if a hook aborts)
+	 */
+	public static function validateEmail( $addr ) {
+		$result = null;
+		if( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
+			return $result;
+		}
+
+		// Please note the strings below are used inside brackets [], which makes
+		// the hyphen "-" a range indicator. Hence it is double backslashed below.
+		// See bug 26948
+		$rfc5322_atext   = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~" ;
+		$rfc1034_ldh_str = "a-z0-9\\-" ;
+
+		$HTML5_email_regexp = "/
+		^                      # start of string
+		[$rfc5322_atext\\.]+    # user part which is liberal :p
+		@                      # 'at' sign
+		[$rfc1034_ldh_str]+       # First domain part
+		(\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
+		$                      # End of string (D: not before a trailing newline)
+		/ixD" ; // case Insensitive, eXtended, Dollar matches only at the very end
+
+		return (bool) preg_match( $HTML5_email_regexp, $addr );
+	}
 }