From 48b0722fd977cf9c2169d3d9e03b4e6935e0099e Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@users.mediawiki.org>
Date: Thu, 1 Jun 2006 06:16:55 +0000
Subject: [PATCH] * (bug 5384) Fix <!-- comments --> in <ref> extension *
 Nesting of different tag extensions and comments should now work more
 consistently and more safely. A cleaner, one-pass tag strip lets the
 'outer' tag either take source (<nowiki>-style) or pass it down to further
 parsing (<ref>-style). There should no longer be surprise expansion of
 foreign extensions inside HTML output, or differences in behavior based on
 the order tags are loaded.

---
 RELEASE-NOTES               |   7 +
 includes/Parser.php         | 268 +++++++++++++++---------------------
 maintenance/parserTests.txt |   1 +
 3 files changed, 120 insertions(+), 156 deletions(-)
diff --git a/RELEASE-NOTES b/RELEASE-NOTES
index 0eb51eded3..a4525533fb 100644
--- a/RELEASE-NOTES
+++ b/RELEASE-NOTES
@@ -398,6 +398,13 @@ it from source control: http://www.mediawiki.org/wiki/Download_from_SVN
 * parserTests.php accepts a --file parameter to run an alternate test sutie
 * parser tests can now test extensions using !!hooks sections
 * Fix oddity with open tag parameters getting stuck on </li>
+* (bug 5384) Fix <!-- comments --> in <ref> extension
+* Nesting of different tag extensions and comments should now work more
+  consistently and more safely. A cleaner, one-pass tag strip lets the
+  'outer' tag either take source (<nowiki>-style) or pass it down to
+  further parsing (<ref>-style). There should no longer be surprise
+  expansion of foreign extensions inside HTML output, or differences
+  in behavior based on the order tags are loaded.
 
 
 == Compatibility ==
diff --git a/includes/Parser.php b/includes/Parser.php
index c6029d03a0..6dacd60dcc 100644
--- a/includes/Parser.php
+++ b/includes/Parser.php
@@ -9,6 +9,7 @@
 /** */
 require_once( 'Sanitizer.php' );
 require_once( 'HttpFunctions.php' );
+require_once( 'ImageGallery.php' );
 
 /**
  * Update this version number when the ParserOutput format
@@ -319,63 +320,60 @@ class Parser
 	 * If $tag is set to STRIP_COMMENTS, the function will extract
 	 * <!-- HTML comments -->
 	 *
+	 * $matches: array( 'UNIQ-xxxxx' => array(
+	 *    'element',
+	 *    'tag content',
+	 *    array( 'param' => 'x' ),
+	 *    '<element param="x">' ) )
 	 * @private
 	 * @static
 	 */
-	function extractTagsAndParams($tag, $text, &$content, &$tags, &$params, $uniq_prefix = ''){
-		$rnd = $uniq_prefix . '-' . $tag . Parser::getRandomString();
-		if ( !$content ) {
-			$content = array( );
-		}
+	function extractTagsAndParams($elements, $text, &$matches, $uniq_prefix = ''){
+		$rand = Parser::getRandomString();
 		$n = 1;
 		$stripped = '';
+		$matches = array();
 
-		if ( !$tags ) {
-			$tags = array( );
-		}
-
-		if ( !$params ) {
-			$params = array( );
-		}
-
-		if( $tag == STRIP_COMMENTS ) {
-			$start = '/<!--()/';
-			$end   = '/-->/';
+		if( $elements == STRIP_COMMENTS ) {
+			$start = '/<!--()()/';
 		} else {
-			$start = "/<$tag(\\s+[^>]*|\\s*\/?)>/i";
-			$end   = "/<\\/$tag\\s*>/i";
+			$taglist = implode( '|', $elements );
+			$start = "/<($taglist)(\\s+[^>]*|\\s*\/?)>/i";
 		}
 
 		while ( '' != $text ) {
 			$p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
 			$stripped .= $p[0];
-			if( count( $p ) < 3 ) {
+			if( count( $p ) < 4 ) {
 				break;
 			}
-			$attributes = $p[1];
-			$inside     = $p[2];
+			$element    = $p[1];
+			$attributes = $p[2];
+			$inside     = $p[3];
 
 			// If $attributes ends with '/', we have an empty element tag, <tag />
-			if( $tag != STRIP_COMMENTS && substr( $attributes, -1 ) == '/' ) {
+			if( $element != '' && substr( $attributes, -1 ) == '/' ) {
 				$attributes = substr( $attributes, 0, -1);
 				$empty = '/';
 			} else {
 				$empty = '';
 			}
 
-			$marker = $rnd . sprintf('%08X', $n++);
+			$marker = "$uniq_prefix-$element-$rand" . sprintf('%08X', $n++);
 			$stripped .= $marker;
 
-			$tags[$marker] = "<$tag$attributes$empty>";
-			$params[$marker] = Sanitizer::decodeTagAttributes( $attributes );
-
 			if ( $empty === '/' ) {
 				// Empty element tag, <tag />
-				$content[$marker] = null;
+				$content = null;
 				$text = $inside;
 			} else {
+				if( $element ) {
+					$end = "/<\\/$element\\s*>/i";
+				} else {
+					$end = '/-->/';
+				}
 				$q = preg_split( $end, $inside, 2 );
-				$content[$marker] = $q[0];
+				$content = $q[0];
 				if( count( $q ) < 2 ) {
 					# No end tag -- let it run out to the end of the text.
 					break;
@@ -383,26 +381,15 @@ class Parser
 					$text = $q[1];
 				}
 			}
+			
+			$matches[$marker] = array( $element,
+				$content,
+				Sanitizer::decodeTagAttributes( $attributes ),
+				"<$element$attributes$empty>" );
 		}
 		return $stripped;
 	}
 
-	/**
-	 * Wrapper function for extractTagsAndParams
-	 * for cases where $tags and $params isn't needed
-	 * i.e. where tags will never have params, like <nowiki>
-	 *
-	 * @private
-	 * @static
-	 */
-	function extractTags( $tag, $text, &$content, $uniq_prefix = '' ) {
-		$dummy_tags = array();
-		$dummy_params = array();
-
-		return Parser::extractTagsAndParams( $tag, $text, $content,
-			$dummy_tags, $dummy_params, $uniq_prefix );
-	}
-
 	/**
 	 * Strips and renders nowiki, pre, math, hiero
 	 * If $render is set, performs necessary rendering operations on plugins
@@ -418,124 +405,102 @@ class Parser
 	 */
 	function strip( $text, &$state, $stripcomments = false ) {
 		$render = ($this->mOutputType == OT_HTML);
-		$html_content = array();
-		$nowiki_content = array();
-		$math_content = array();
-		$pre_content = array();
-		$comment_content = array();
-		$ext_content = array();
-		$ext_tags = array();
-		$ext_params = array();
-		$gallery_content = array();
 
 		# Replace any instances of the placeholders
 		$uniq_prefix = $this->mUniqPrefix;
 		#$text = str_replace( $uniq_prefix, wfHtmlEscapeFirst( $uniq_prefix ), $text );
-
-		# html
+		
+		$elements = array_merge(
+			array( 'nowiki', 'pre', 'gallery' ),
+			array_keys( $this->mTagHooks ) );
 		global $wgRawHtml;
 		if( $wgRawHtml ) {
-			$text = Parser::extractTags('html', $text, $html_content, $uniq_prefix);
-			foreach( $html_content as $marker => $content ) {
-				if ($render ) {
-					# Raw and unchecked for validity.
-					$state['html'][$marker] = $content;
-				} else {
-					$state['html'][$marker] = '<html>'.$content.'</html>';
-				}
-			}
+			$elements[] = 'html';
 		}
-
-		# nowiki
-		$text = Parser::extractTags('nowiki', $text, $nowiki_content, $uniq_prefix);
-		foreach( $nowiki_content as $marker => $content ) {
-			if( $render ){
-				$state['nowiki'][$marker] = wfEscapeHTMLTagsOnly( $content );
-			} else {
-				$state['nowiki'][$marker] = '<nowiki>'.$content.'</nowiki>';
-			}
-		}
-
-		# math
 		if( $this->mOptions->getUseTeX() ) {
-			$text = Parser::extractTags('math', $text, $math_content, $uniq_prefix);
-			foreach( $math_content as $marker => $content ){
-				if( $render ) {
-					$state['math'][$marker] = renderMath( $content );
-				} else {
-					$state['math'][$marker] = '<math>'.$content.'</math>';
-				}
-			}
+			$elements[] = 'math';
 		}
+		
 
-		# pre
-		$text = Parser::extractTags('pre', $text, $pre_content, $uniq_prefix);
-		foreach( $pre_content as $marker => $content ){
-			if( $render ){
-				$state['pre'][$marker] = '<pre>' . wfEscapeHTMLTagsOnly( $content ) . '</pre>';
-			} else {
-				$state['pre'][$marker] = '<pre>'.$content.'</pre>';
-			}
+		// Strip comments in a first pass.
+		// This saves us from needlessly rendering extensions in comment text
+		$text = Parser::extractTagsAndParams(STRIP_COMMENTS, $text, $comment_matches, $uniq_prefix);
+		$commentState = array();
+		foreach( $comment_matches as $marker => $data ){
+			list( $element, $content, $params, $tag ) = $data;
+			$commentState[$marker] = '<!--' . $content . '-->';
 		}
-
-		# gallery
-		$text = Parser::extractTags('gallery', $text, $gallery_content, $uniq_prefix);
-		foreach( $gallery_content as $marker => $content ) {
-			require_once( 'ImageGallery.php' );
-			if ( $render ) {
-				$state['gallery'][$marker] = $this->renderImageGallery( $content );
-			} else {
-				$state['gallery'][$marker] = '<gallery>'.$content.'</gallery>';
+		
+		$matches = array();
+		$text = Parser::extractTagsAndParams( $elements, $text, $matches, $uniq_prefix );
+		
+		foreach( $matches as $marker => $data ) {
+			list( $element, $content, $params, $tag ) = $data;
+			// Restore any comments; the extension can deal with them.
+			if( $content !== null) {
+				$content = strtr( $content, $commentState );
 			}
-		}
-
-		# Comments
-		$text = Parser::extractTags(STRIP_COMMENTS, $text, $comment_content, $uniq_prefix);
-		foreach( $comment_content as $marker => $content ){
-			$comment_content[$marker] = '<!--'.$content.'-->';
-		}
-
-		# Extensions
-		foreach ( $this->mTagHooks as $tag => $callback ) {
-			$ext_content[$tag] = array();
-			$text = Parser::extractTagsAndParams( $tag, $text, $ext_content[$tag],
-				$ext_tags[$tag], $ext_params[$tag], $uniq_prefix );
-			foreach( $ext_content[$tag] as $marker => $content ) {
-				$full_tag = $ext_tags[$tag][$marker];
-				$params = $ext_params[$tag][$marker];
-				if ( $render )
-					$state[$tag][$marker] = call_user_func_array( $callback, array( $content, $params, $this ) );
-				else {
-					if ( is_null( $content ) ) {
-						// Empty element tag
-						$state[$tag][$marker] = $full_tag;
+			if( $render ) {
+				switch( $element ) {
+				case 'html':
+					if( $wgRawHtml ) {
+						$output = $content;
+						break;
+					}
+					// Shouldn't happen otherwise; if it does, fall through and escape the content like <nowiki>.
+				case 'nowiki':
+					$output = wfEscapeHTMLTagsOnly( $content );
+					break;
+				case 'math':
+					$output = renderMath( $content );
+					break;
+				case 'pre':
+					// Backwards-compatibility hack
+					$content = preg_replace( '!<nowiki>(.*?)</nowiki>!is', '\\1', $content );
+					$output = '<pre>' . wfEscapeHTMLTagsOnly( $content ) . '</pre>';
+					break;
+				case 'gallery':
+					$output = $this->renderImageGallery( $content );
+					break;
+				default:
+					$tagName = strtolower( $element );
+					if( isset( $this->mTagHooks[$tagName] ) ) {
+						$output = call_user_func_array( $this->mTagHooks[$tagName],
+							array( $content, $params, $this ) );
 					} else {
-						$state[$tag][$marker] = "$full_tag$content</$tag>";
+						wfDebugDieBacktrace( "Invalid call hook $element" );
 					}
 				}
+			} else {
+				// Just stripping tags; keep the source
+				if( $content === null ) {
+					$output = $tag;
+				} else {
+					$output = "$tag$content</$element>";
+				}
 			}
+			$state[$element][$marker] = $output;
 		}
 
 		# Unstrip comments unless explicitly told otherwise.
 		# (The comments are always stripped prior to this point, so as to
 		# not invoke any extension tags / parser hooks contained within
 		# a comment.)
-		if ( !$stripcomments ) {
-			$tempstate = array( 'comment' => $comment_content );
-			$text = $this->unstrip( $text, $tempstate );
-			$comment_content = array();
-		} else {
-			if( !isset( $state['comment'] ) ) {
-				$state['comment'] = array();
+		if ( $stripcomments ) {
+			// Add remaining comments to the state array
+			foreach( $commentState as $marker => $content ) {
+				$state['comment'][$marker] = $content;
 			}
-			$state['comment'] += $comment_content;
+		} else {
+			// Put them all back and forget them
+			$text = strtr( $text, $commentState );
 		}
 
 		return $text;
 	}
 
 	/**
-	 * restores pre, math, and hiero removed by strip()
+	 * Restores pre, math, and other extensions removed by strip()
 	 *
 	 * always call unstripNoWiki() after this one
 	 * @private
@@ -545,20 +510,21 @@ class Parser
 			return $text;
 		}
 
-		# Must expand in reverse order, otherwise nested tags will be corrupted
-		foreach( array_reverse( $state, true ) as $tag => $contentDict ) {
+		$replacements = array();
+		foreach( $state as $tag => $contentDict ) {
 			if( $tag != 'nowiki' && $tag != 'html' ) {
-				foreach( array_reverse( $contentDict, true ) as $uniq => $content ) {
-					$text = str_replace( $uniq, $content, $text );
+				foreach( $contentDict as $uniq => $content ) {
+					$replacements[$uniq] = $content;
 				}
 			}
 		}
+		$text = strtr( $text, $replacements );
 
 		return $text;
 	}
 
 	/**
-	 * always call this after unstrip() to preserve the order
+	 * Always call this after unstrip() to preserve the order
 	 *
 	 * @private
 	 */
@@ -567,18 +533,15 @@ class Parser
 			return $text;
 		}
 
-		# Must expand in reverse order, otherwise nested tags will be corrupted
-		if( isset( $state['nowiki'] ) )
-			foreach( array_reverse( $state['nowiki'], true ) as $uniq => $content ) {
-				$text = str_replace( $uniq, $content, $text );
-			}
-
-		global $wgRawHtml;
-		if ($wgRawHtml && isset( $state['html'] ) ) {
-			foreach( array_reverse( $state['html'], true ) as $uniq => $content ) {
-				$text = str_replace( $uniq, $content, $text );
+		$replacements = array();
+		foreach( $state as $tag => $contentDict ) {
+			if( $tag == 'nowiki' || $tag == 'html' ) {
+				foreach( $contentDict as $uniq => $content ) {
+					$replacements[$uniq] = $content;
+				}
 			}
 		}
+		$text = strtr( $text, $replacements );
 
 		return $text;
 	}
@@ -593,14 +556,7 @@ class Parser
 	function insertStripItem( $text, &$state ) {
 		$rnd = $this->mUniqPrefix . '-item' . Parser::getRandomString();
 		if ( !$state ) {
-			$state = array(
-			  'html' => array(),
-			  'nowiki' => array(),
-			  'math' => array(),
-			  'pre' => array(),
-			  'comment' => array(),
-			  'gallery' => array(),
-			);
+			$state = array();
 		}
 		$state['item'][$rnd] = $text;
 		return $rnd;
diff --git a/maintenance/parserTests.txt b/maintenance/parserTests.txt
index 4ff78f0430..553c8ff3fe 100644
--- a/maintenance/parserTests.txt
+++ b/maintenance/parserTests.txt
@@ -301,6 +301,7 @@ And a <a href="/wiki/Main_Page" title="Main Page">link</a>
 &lt;cite&gt;
 &lt;em&gt;
 </pre>
+
 !! end
 
 ###
-- 
2.20.1