From 45b6f3ca352fcea546e73b70b299590dccc51ae2 Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@users.mediawiki.org>
Date: Sun, 6 Feb 2005 06:44:48 +0000
Subject: [PATCH] Split the HTML sanitizer functions from the Parser monolith

---
 includes/Parser.php    | 228 ++------------------------------------
 includes/Sanitizer.php | 245 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 252 insertions(+), 221 deletions(-)
 create mode 100644 includes/Sanitizer.php
diff --git a/includes/Parser.php b/includes/Parser.php
index dbc770f703..855429c05a 100644
--- a/includes/Parser.php
+++ b/includes/Parser.php
@@ -6,6 +6,8 @@
  * @package MediaWiki
  */
 
+require_once( 'Sanitizer.php' );
+
 /**
  * Update this version number when the ParserOutput format
  * changes in an incompatible way, so the parser cache
@@ -467,57 +469,6 @@ class Parser
 		return $rnd;
 	}
 
-	/**
-	 * Return allowed HTML attributes
-	 *
-	 * @access private
-	 */
-	function getHTMLattrs () {
-		$htmlattrs = array( # Allowed attributes--no scripting, etc.
-				'title', 'align', 'lang', 'dir', 'width', 'height',
-				'bgcolor', 'clear', /* BR */ 'noshade', /* HR */
-				'cite', /* BLOCKQUOTE, Q */ 'size', 'face', 'color',
-				/* FONT */ 'type', 'start', 'value', 'compact',
-				/* For various lists, mostly deprecated but safe */
-				'summary', 'width', 'border', 'frame', 'rules',
-				'cellspacing', 'cellpadding', 'valign', 'char',
-				'charoff', 'colgroup', 'col', 'span', 'abbr', 'axis',
-				'headers', 'scope', 'rowspan', 'colspan', /* Tables */
-				'id', 'class', 'name', 'style' /* For CSS */
-				);
-		return $htmlattrs ;
-	}
-
-	/**
-	 * Remove non approved attributes and javascript in css
-	 *
-	 * @access private
-	 */
-	function fixTagAttributes ( $t ) {
-		if ( trim ( $t ) == '' ) return '' ; # Saves runtime ;-)
-		$htmlattrs = $this->getHTMLattrs() ;
-
-		# Strip non-approved attributes from the tag
-		$t = preg_replace(
-			'/(\\w+)(\\s*=\\s*([^\\s\">]+|\"[^\">]*\"))?/e',
-			"(in_array(strtolower(\"\$1\"),\$htmlattrs)?(\"\$1\".((\"x\$3\" != \"x\")?\"=\$3\":'')):'')",
-			$t);
-
-		$t = str_replace ( '<></>' , '' , $t ) ; # This should fix bug 980557
-
-		# Strip javascript "expression" from stylesheets. Brute force approach:
-		# If anythin offensive is found, all attributes of the HTML tag are dropped
-
-		if( preg_match(
-			'/style\\s*=.*(expression|tps*:\/\/|url\\s*\().*/is',
-			wfMungeToUtf8( $t ) ) )
-		{
-			$t='';
-		}
-
-		return trim ( $t ) ;
-	}
-
 	/**
 	 * interface with html tidy, used if $wgUseTidy = true
 	 *
@@ -595,7 +546,7 @@ class Parser
 				$indent_level = strlen( $matches[1] );
 				$t[$k] = "\n" .
 					str_repeat( '<dl><dd>', $indent_level ) .
-					'<table ' . $this->fixTagAttributes ( $matches[2] ) . '>' ;
+					'<table ' . Sanitizer::fixTagAttributes ( $matches[2] ) . '>' ;
 				array_push ( $td , false ) ;
 				array_push ( $ltd , '' ) ;
 				array_push ( $tr , false ) ;
@@ -622,7 +573,7 @@ class Parser
 				array_push ( $tr , false ) ;
 				array_push ( $td , false ) ;
 				array_push ( $ltd , '' ) ;
-				array_push ( $ltr , $this->fixTagAttributes ( $x ) ) ;
+				array_push ( $ltr , Sanitizer::fixTagAttributes ( $x ) ) ;
 			}
 			else if ( '|' == $fc || '!' == $fc || '|+' == substr ( $x , 0 , 2 ) ) { # Caption
 				# $x is a table row
@@ -664,7 +615,7 @@ class Parser
 					}
 					if ( count ( $y ) == 1 )
 						$y = "{$z}<{$l}>{$y[0]}" ;
-					else $y = $y = "{$z}<{$l} ".$this->fixTagAttributes($y[0]).">{$y[1]}" ;
+					else $y = $y = "{$z}<{$l} ".Sanitizer::fixTagAttributes($y[0]).">{$y[1]}" ;
 					$t[$k] .= $y ;
 					array_push ( $td , true ) ;
 				}
@@ -697,7 +648,7 @@ class Parser
 		$fname = 'Parser::internalParse';
 		wfProfileIn( $fname );
 
-		$text = $this->removeHTMLtags( $text );
+		$text = Sanitizer::removeHTMLtags( $text );
 		$text = $this->replaceVariables( $text, $args );
 
 		$text = preg_replace( '/(^|\n)-----*/', '\\1<hr />', $text );
@@ -2118,7 +2069,7 @@ class Parser
 			$this->mTemplatePath[$part1] = 1;
 
 			$text = $this->strip( $text, $this->mStripState );
-			$text = $this->removeHTMLtags( $text );
+			$text = Sanitizer::removeHTMLtags( $text );
 			$text = $this->replaceVariables( $text, $assocArgs );
 
 			# Resume the link cache and register the inclusion as a link
@@ -2210,171 +2161,6 @@ class Parser
 		}
 	}
 
-
-	/**
-	 * Cleans up HTML, removes dangerous tags and attributes, and
-	 * removes HTML comments
-	 * @access private
-	 */
-	function removeHTMLtags( $text ) {
-		global $wgUseTidy, $wgUserHtml;
-		$fname = 'Parser::removeHTMLtags';
-		wfProfileIn( $fname );
-
-		if( $wgUserHtml ) {
-			$htmlpairs = array( # Tags that must be closed
-				'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
-				'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
-				'strike', 'strong', 'tt', 'var', 'div', 'center',
-				'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
-				'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
-			);
-			$htmlsingle = array(
-				'br', 'hr', 'li', 'dt', 'dd'
-			);
-			$htmlnest = array( # Tags that can be nested--??
-				'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
-				'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
-			);
-			$tabletags = array( # Can only appear inside table
-				'td', 'th', 'tr'
-			);
-		} else {
-			$htmlpairs = array();
-			$htmlsingle = array();
-			$htmlnest = array();
-			$tabletags = array();
-		}
-
-		$htmlsingle = array_merge( $tabletags, $htmlsingle );
-		$htmlelements = array_merge( $htmlsingle, $htmlpairs );
-
-		$htmlattrs = $this->getHTMLattrs () ;
-
-		# Remove HTML comments
-		$text = $this->removeHTMLcomments( $text );
-
-		$bits = explode( '<', $text );
-		$text = array_shift( $bits );
-		if(!$wgUseTidy) {
-			$tagstack = array(); $tablestack = array();
-			foreach ( $bits as $x ) {
-				$prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
-				preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
-				$x, $regs );
-				list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
-				error_reporting( $prev );
-
-				$badtag = 0 ;
-				if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
-					# Check our stack
-					if ( $slash ) {
-						# Closing a tag...
-						if ( ! in_array( $t, $htmlsingle ) &&
-						( $ot = @array_pop( $tagstack ) ) != $t ) {
-							@array_push( $tagstack, $ot );
-							$badtag = 1;
-						} else {
-							if ( $t == 'table' ) {
-								$tagstack = array_pop( $tablestack );
-							}
-							$newparams = '';
-						}
-					} else {
-						# Keep track for later
-						if ( in_array( $t, $tabletags ) &&
-						! in_array( 'table', $tagstack ) ) {
-							$badtag = 1;
-						} else if ( in_array( $t, $tagstack ) &&
-						! in_array ( $t , $htmlnest ) ) {
-							$badtag = 1 ;
-						} else if ( ! in_array( $t, $htmlsingle ) ) {
-							if ( $t == 'table' ) {
-								array_push( $tablestack, $tagstack );
-								$tagstack = array();
-							}
-							array_push( $tagstack, $t );
-						}
-						# Strip non-approved attributes from the tag
-						$newparams = $this->fixTagAttributes($params);
-
-					}
-					if ( ! $badtag ) {
-						$rest = str_replace( '>', '&gt;', $rest );
-						$text .= "<$slash$t $newparams$brace$rest";
-						continue;
-					}
-				}
-				$text .= '&lt;' . str_replace( '>', '&gt;', $x);
-			}
-			# Close off any remaining tags
-			while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
-				$text .= "</$t>\n";
-				if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
-			}
-		} else {
-			# this might be possible using tidy itself
-			foreach ( $bits as $x ) {
-				preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
-				$x, $regs );
-				@list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
-				if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
-					$newparams = $this->fixTagAttributes($params);
-					$rest = str_replace( '>', '&gt;', $rest );
-					$text .= "<$slash$t $newparams$brace$rest";
-				} else {
-					$text .= '&lt;' . str_replace( '>', '&gt;', $x);
-				}
-			}
-		}
-		wfProfileOut( $fname );
-		return $text;
-	}
-
-	/**
-	 * Remove '<!--', '-->', and everything between.
-	 * To avoid leaving blank lines, when a comment is both preceded
-	 * and followed by a newline (ignoring spaces), trim leading and
-	 * trailing spaces and one of the newlines.
-	 * 
-	 * @access private
-	 */
-	function removeHTMLcomments( $text ) {
-		$fname='Parser::removeHTMLcomments';
-		wfProfileIn( $fname );
-		while (($start = strpos($text, '<!--')) !== false) {
-			$end = strpos($text, '-->', $start + 4);
-			if ($end === false) {
-				# Unterminated comment; bail out
-				break;
-			}
-
-			$end += 3;
-
-			# Trim space and newline if the comment is both
-			# preceded and followed by a newline
-			$spaceStart = max($start - 1, 0);
-			$spaceLen = $end - $spaceStart;
-			while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
-				$spaceStart--;
-				$spaceLen++;
-			}
-			while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
-				$spaceLen++;
-			if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
-				# Remove the comment, leading and trailing
-				# spaces, and leave only one newline.
-				$text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
-			}
-			else {
-				# Remove just the comment.
-				$text = substr_replace($text, '', $start, $end - $start);
-			}
-		}
-		wfProfileOut( $fname );
-		return $text;
-	}
-
 	/**
 	 * This function accomplishes several tasks:
 	 * 1) Auto-number headings if that option is enabled
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
new file mode 100644
index 0000000000..dab3c8ce4f
--- /dev/null
+++ b/includes/Sanitizer.php
@@ -0,0 +1,245 @@
+<?php
+
+/**
+ * (X)HTML sanitizer for MediaWiki
+ *
+ * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
+ * http://www.mediawiki.org/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or 
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @package MediaWiki
+ */
+
+class Sanitizer {
+	/**
+	 * Cleans up HTML: removes HTML comments, escapes tags that are not
+	 * allowed (or, without tidy, badly nested) and filters tag attributes.
+	 * @param string $text Text to clean up
+	 * @return string Cleaned text
+	 * @access private
+	 */
+	function removeHTMLtags( $text ) {
+		global $wgUseTidy, $wgUserHtml;
+		$fname = 'Sanitizer::removeHTMLtags';
+		wfProfileIn( $fname );
+
+		if( $wgUserHtml ) {
+			$htmlpairs = array( # Tags that must be closed
+				'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
+				'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
+				'strike', 'strong', 'tt', 'var', 'div', 'center',
+				'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
+				'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
+			);
+			$htmlsingle = array(
+				'br', 'hr', 'li', 'dt', 'dd'
+			);
+			$htmlnest = array( # Tags that can be nested--??
+				'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
+				'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
+			);
+			$tabletags = array( # Can only appear inside table
+				'td', 'th', 'tr'
+			);
+		} else { # User HTML disabled: nothing is whitelisted, so every tag is escaped below
+			$htmlpairs = array();
+			$htmlsingle = array();
+			$htmlnest = array();
+			$tabletags = array();
+		}
+
+		$htmlsingle = array_merge( $tabletags, $htmlsingle ); # table rows/cells are not tracked on the tag stack either
+		$htmlelements = array_merge( $htmlsingle, $htmlpairs ); # every tag name that may pass through
+
+		# Remove HTML comments
+		$text = Sanitizer::removeHTMLcomments( $text );
+
+		$bits = explode( '<', $text ); # each later element starts right after a '<'
+		$text = array_shift( $bits ); # text before the first '<' is kept as is
+		if(!$wgUseTidy) {
+			$tagstack = array(); $tablestack = array();
+			foreach ( $bits as $x ) {
+				$prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
+				preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
+				$x, $regs );
+				list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs; # $qbar (whole match) is unused
+				error_reporting( $prev );
+
+				$badtag = 0 ;
+				if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
+					# Check our stack
+					if ( $slash ) {
+						# Closing a tag: unless it is a "single" tag it must match the innermost open tag
+						if ( ! in_array( $t, $htmlsingle ) &&
+						( $ot = @array_pop( $tagstack ) ) != $t ) {
+							@array_push( $tagstack, $ot ); # mismatch: put the popped entry back
+							$badtag = 1;
+						} else {
+							if ( $t == 'table' ) {
+								$tagstack = array_pop( $tablestack ); # restore the stack saved when this table opened
+							}
+							$newparams = '';
+						}
+					} else {
+						# Keep track for later
+						if ( in_array( $t, $tabletags ) &&
+						! in_array( 'table', $tagstack ) ) {
+							$badtag = 1;
+						} else if ( in_array( $t, $tagstack ) &&
+						! in_array ( $t , $htmlnest ) ) {
+							$badtag = 1 ;
+						} else if ( ! in_array( $t, $htmlsingle ) ) {
+							if ( $t == 'table' ) {
+								array_push( $tablestack, $tagstack ); # a table starts with a fresh tag stack
+								$tagstack = array();
+							}
+							array_push( $tagstack, $t );
+						}
+						# Strip non-approved attributes from the tag
+						$newparams = Sanitizer::fixTagAttributes($params);
+
+					}
+					if ( ! $badtag ) {
+						$rest = str_replace( '>', '&gt;', $rest );
+						$text .= "<$slash$t $newparams$brace$rest";
+						continue;
+					}
+				}
+				$text .= '&lt;' . str_replace( '>', '&gt;', $x); # not allowed or badly nested: show it as literal text
+			}
+			# Close off any remaining tags
+			while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
+				$text .= "</$t>\n";
+				if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
+			}
+		} else {
+			# No nesting checks in this branch; this might be possible using tidy itself
+			foreach ( $bits as $x ) {
+				preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
+				$x, $regs );
+				@list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
+				if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
+					$newparams = Sanitizer::fixTagAttributes($params);
+					$rest = str_replace( '>', '&gt;', $rest );
+					$text .= "<$slash$t $newparams$brace$rest";
+				} else {
+					$text .= '&lt;' . str_replace( '>', '&gt;', $x);
+				}
+			}
+		}
+		wfProfileOut( $fname );
+		return $text;
+	}
+
+	/**
+	 * Remove '<!--', '-->', and everything between; an unterminated comment is
+	 * left alone. When a comment is both preceded and followed by a newline
+	 * (ignoring spaces), also trim those spaces and one of the newlines.
+	 * @param string $text
+	 * @return string $text with all terminated comments removed
+	 * @access private
+	 */
+	function removeHTMLcomments( $text ) {
+		$fname='Sanitizer::removeHTMLcomments';
+		wfProfileIn( $fname );
+		while (($start = strpos($text, '<!--')) !== false) {
+			$end = strpos($text, '-->', $start + 4);
+			if ($end === false) {
+				# Unterminated comment; bail out
+				break;
+			}
+
+			$end += 3; # now points just past '-->'
+
+			# Trim space and newline if the comment is both
+			# preceded and followed by a newline
+			$spaceStart = max($start - 1, 0);
+			$spaceLen = $end - $spaceStart;
+			while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
+				$spaceStart--; # walk left over spaces before the comment
+				$spaceLen++;
+			}
+			while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
+				$spaceLen++; # walk right over spaces after the comment
+			if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
+				# Remove the comment, leading and trailing
+				# spaces, and leave only one newline.
+				$text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
+			}
+			else {
+				# Remove just the comment.
+				$text = substr_replace($text, '', $start, $end - $start);
+			}
+		}
+		wfProfileOut( $fname );
+		return $text;
+	}
+
+	/**
+	 * List the attribute names that are allowed to stay on a tag.
+	 * @return array Attribute names, all lower case
+	 * @access private
+	 */
+	function getHTMLattrs () {
+		# Allowed attributes--no scripting, etc.
+		return array(
+			'title', 'align', 'lang', 'dir', 'width', 'height', 'bgcolor',
+			'clear', # BR
+			'noshade', # HR
+			'cite', # BLOCKQUOTE, Q
+			'size', 'face', 'color', # FONT
+			'type', 'start', 'value', 'compact', # For various lists, mostly deprecated but safe
+			'summary', 'width', 'border', 'frame', 'rules', 'cellspacing', # 'width' is listed twice
+			'cellpadding', 'valign', 'char', 'charoff', 'colgroup', 'col', 'span',
+			'abbr', 'axis', 'headers', 'scope', 'rowspan', 'colspan', # Tables
+			'id', 'class', 'name', 'style' # For CSS
+		);
+	}
+
+	/**
+	 * Remove non approved attributes and javascript in css
+	 * @param string $t Raw attribute text of one tag
+	 * @return string Filtered attribute text, or '' if the style looks unsafe
+	 * @access private
+	 */
+	function fixTagAttributes ( $t ) {
+		if ( trim ( $t ) == '' ) return '' ; # Saves runtime ;-)
+
+		# Strip non-approved attributes from the tag. A callback is used instead
+		# of the /e modifier so attribute text is never evaluated as PHP code.
+		$t = preg_replace_callback(
+			'/(\\w+)(\\s*=\\s*([^\\s\">]+|\"[^\">]*\"))?/',
+			array( 'Sanitizer', 'filterAttribute' ), $t );
+		$t = str_replace ( '<></>' , '' , $t ) ; # This should fix bug 980557
+
+		# Strip javascript "expression" from stylesheets. Brute force approach:
+		# If anything offensive is found, all attributes of the HTML tag are dropped
+		if( preg_match( '/style\\s*=.*(expression|tps*:\/\/|url\\s*\().*/is',
+			wfMungeToUtf8( $t ) ) ) $t = '';
+		return trim ( $t ) ;
+	}
+
+	/** Callback for fixTagAttributes(): $m is one matched name[=value] pair. @access private */
+	function filterAttribute ( $m ) {
+		$ok = in_array( strtolower( $m[1] ), Sanitizer::getHTMLattrs() );
+		if ( !$ok ) return '' ; # not whitelisted: drop the name and its value
+		return ( isset( $m[3] ) && $m[3] !== '' ) ? $m[1] . '=' . $m[3] : $m[1] ;
+	}
+
+}
+
+?>
\ No newline at end of file
-- 
2.20.1