From 3da7dcf91daa8d26b111853f144376c0b42c69c7 Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@users.mediawiki.org>
Date: Thu, 1 Jun 2006 19:38:14 +0000
Subject: [PATCH] Fix regressions in parser with incomplete tag stripping, plus
 some old bugs: * (bug 885) Pre-save transform no longer silently appends
 close tags * Pre-save transform no longer changes the case of close tags

---
 RELEASE-NOTES       |   2 +
 includes/Parser.php | 116 +++++++++++++++++++++-----------------------
 2 files changed, 57 insertions(+), 61 deletions(-)
diff --git a/RELEASE-NOTES b/RELEASE-NOTES
index a4525533fb..f16adbeb38 100644
--- a/RELEASE-NOTES
+++ b/RELEASE-NOTES
@@ -405,6 +405,8 @@ it from source control: http://www.mediawiki.org/wiki/Download_from_SVN
   further parsing (<ref>-style). There should no longer be surprise
   expansion of foreign extensions inside HTML output, or differences
   in behavior based on the order tags are loaded.
+* (bug 885) Pre-save transform no longer silently appends close tags
+* Pre-save transform no longer changes the case of close tags
 
 
 == Compatibility ==
diff --git a/includes/Parser.php b/includes/Parser.php
index bc08e4da89..f969f7b910 100644
--- a/includes/Parser.php
+++ b/includes/Parser.php
@@ -311,20 +311,20 @@ class Parser
 	function getOptions() { return $this->mOptions; }
 
 	/**
-	 * Replaces all occurrences of <$tag>content</$tag> in the text
-	 * with a random marker and returns the new text. the output parameter
-	 * $content will be an associative array filled with data on the form
-	 * $unique_marker => content.
+	 * Replaces all occurrences of HTML-style comments and the given tags
+	 * in the text with a random marker and returns teh next text. The output
+	 * parameter $matches will be an associative array filled with data in
+	 * the form:
+	 *   'UNIQ-xxxxx' => array(
+	 *     'element',
+	 *     'tag content',
+	 *     array( 'param' => 'x' ),
+	 *     '<element param="x">tag content</element>' ) )
 	 *
-	 * If $content is already set, the additional entries will be appended
-	 * If $tag is set to STRIP_COMMENTS, the function will extract
-	 * <!-- HTML comments -->
+	 * @param $elements list of element names. Comments are always extracted.
+	 * @param $text Source text string.
+	 * @param $uniq_prefix
 	 *
-	 * $output: array( 'UNIQ-xxxxx' => array(
-	 *    'element',
-	 *    'tag content',
-	 *    array( 'param' => 'x' ),
-	 *    '<element param="x">' ) )
 	 * @private
 	 * @static
 	 */
@@ -334,58 +334,59 @@ class Parser
 		$stripped = '';
 		$matches = array();
 
-		if( $elements == STRIP_COMMENTS ) {
-			$start = '/<!--()()/';
-		} else {
-			$taglist = implode( '|', $elements );
-			$start = "/<($taglist)(\\s+[^>]*|\\s*\/?)>/i";
-		}
+		$taglist = implode( '|', $elements );
+		$start = "/<($taglist)(\\s+[^>]*?|\\s*?)(\/?>)|<(!--)/i";
 
 		while ( '' != $text ) {
 			$p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
 			$stripped .= $p[0];
-			if( count( $p ) < 4 ) {
+			if( count( $p ) < 5 ) {
 				break;
 			}
-			$element    = $p[1];
-			$attributes = $p[2];
-			$inside     = $p[3];
-
-			// If $attributes ends with '/', we have an empty element tag, <tag />
-			if( $element != '' && substr( $attributes, -1 ) == '/' ) {
-				$attributes = substr( $attributes, 0, -1);
-				$empty = '/';
+			if( count( $p ) > 5 ) {
+				// comment
+				$element    = $p[4];
+				$attributes = '';
+				$close      = '';
+				$inside     = $p[5];
 			} else {
-				$empty = '';
+				// tag
+				$element    = $p[1];
+				$attributes = $p[2];
+				$close      = $p[3];
+				$inside     = $p[4];
 			}
 
 			$marker = "$uniq_prefix-$element-$rand" . sprintf('%08X', $n++);
 			$stripped .= $marker;
 
-			if ( $empty === '/' ) {
+			if ( $close === '/>' ) {
 				// Empty element tag, <tag />
 				$content = null;
 				$text = $inside;
+				$tail = null;
 			} else {
-				if( $element ) {
-					$end = "/<\\/$element\\s*>/i";
+				if( $element == '!--' ) {
+					$end = '/(-->)/';
 				} else {
-					$end = '/-->/';
+					$end = "/(<\\/$element\\s*>)/i";
 				}
-				$q = preg_split( $end, $inside, 2 );
+				$q = preg_split( $end, $inside, 2, PREG_SPLIT_DELIM_CAPTURE );
 				$content = $q[0];
-				if( count( $q ) < 2 ) {
+				if( count( $q ) < 3 ) {
 					# No end tag -- let it run out to the end of the text.
+					$tail = '';
 					$text = '';
 				} else {
-					$text = $q[1];
+					$tail = $q[1];
+					$text = $q[2];
 				}
 			}
 			
 			$matches[$marker] = array( $element,
 				$content,
 				Sanitizer::decodeTagAttributes( $attributes ),
-				"<$element$attributes$empty>" );
+				"<$element$attributes$close$content$tail" );
 		}
 		return $stripped;
 	}
@@ -409,6 +410,7 @@ class Parser
 		# Replace any instances of the placeholders
 		$uniq_prefix = $this->mUniqPrefix;
 		#$text = str_replace( $uniq_prefix, wfHtmlEscapeFirst( $uniq_prefix ), $text );
+		$commentState = array();
 		
 		$elements = array_merge(
 			array( 'nowiki', 'pre', 'gallery' ),
@@ -422,27 +424,24 @@ class Parser
 		}
 		
 
-		// Strip comments in a first pass.
-		// This saves us from needlessly rendering extensions in comment text
-		$text = Parser::extractTagsAndParams(STRIP_COMMENTS, $text, $comment_matches, $uniq_prefix);
-		$commentState = array();
-		foreach( $comment_matches as $marker => $data ){
-			list( $element, $content, $params, $tag ) = $data;
-			$commentState[$marker] = '<!--' . $content . '-->';
-		}
-		
 		$matches = array();
 		$text = Parser::extractTagsAndParams( $elements, $text, $matches, $uniq_prefix );
 		
 		foreach( $matches as $marker => $data ) {
 			list( $element, $content, $params, $tag ) = $data;
-			// Restore any comments; the extension can deal with them.
-			if( $content !== null) {
-				$content = strtr( $content, $commentState );
-			}
 			if( $render ) {
 				$tagName = strtolower( $element );
 				switch( $tagName ) {
+				case '!--':
+					// Comment
+					if( substr( $tag, -3 ) == '-->' ) {
+						$output = $tag;
+					} else {
+						// Unclosed comment in input.
+						// Close it so later stripping can remove it
+						$output = "$tag-->";
+					}
+					break;
 				case 'html':
 					if( $wgRawHtml ) {
 						$output = $content;
@@ -473,25 +472,20 @@ class Parser
 				}
 			} else {
 				// Just stripping tags; keep the source
-				if( $content === null ) {
-					$output = $tag;
-				} else {
-					$output = "$tag$content</$element>";
-				}
+				$output = $tag;
+			}
+			if( !$stripcomments && $element == '!--' ) {
+				$commentState[$marker] = $output;
+			} else {
+				$state[$element][$marker] = $output;
 			}
-			$state[$element][$marker] = $output;
 		}
 
 		# Unstrip comments unless explicitly told otherwise.
 		# (The comments are always stripped prior to this point, so as to
 		# not invoke any extension tags / parser hooks contained within
 		# a comment.)
-		if ( $stripcomments ) {
-			// Add remaining comments to the state array
-			foreach( $commentState as $marker => $content ) {
-				$state['comment'][$marker] = $content;
-			}
-		} else {
+		if ( !$stripcomments ) {
 			// Put them all back and forget them
 			$text = strtr( $text, $commentState );
 		}
-- 
2.20.1