From 565e9fa077f127449a7141552546e889869fe09a Mon Sep 17 00:00:00 2001
From: =?utf8?q?Bartosz=20Dziewo=C5=84ski?= <matma.rex@gmail.com>
Date: Sun, 2 Nov 2014 19:14:53 +0100
Subject: [PATCH] Correctly parse <indicator/> contents, Parser rejiggering

includes/parser/Parser.php
  * Pull out a chunk of code we need to reuse from parse() to
    internalParseHalfParsed(). This is a fully backwards-compatible
    change.

    Code changes:
    * Add a guard for running ParserBeforeTidy and ParserAfterTidy
      hooks, as extensions might not expect them to be called for
      snippets, only full page content.
    * Change $options to $this->mOptions.

    The bulk of parsing work is now done in internalParse() and
    internalParseHalfParsed(); parse() only handles four things:
    * Resetting parser state when a parse starts/finishes
    * Page title language conversion
    * Outputting limit report and limitation warnings
    * Running ParserAfterParse hook (dunno why, but it's documented)

  * Expand documentation for recursiveTagParse(), with some uppercase
    warnings so that no one does the stupid thing I did ever again.

  * Add new public method recursiveTagParseFully(), which is a
    recursive parser entry point that produces fully parsed HTML ready
    for inclusion in HTML output. Compared to Parser::parse(), it
    doesn't produce limit reports and doesn't run the ParserAfterParse
    hook.

includes/parser/CoreTagHooks.php
  * Use the new recursiveTagParseFully() method.
  * Use Parser::stripOuterParagraph() to strip the outer <p> wrapper
    from the parsed indicator content.

Bug: 72887
Change-Id: I89ae9a50b82245f9a9e4a903563aeb1c51b6103e
---
 includes/parser/CoreTagHooks.php |   2 +-
 includes/parser/Parser.php       | 221 +++++++++++++++++++------------
 2 files changed, 140 insertions(+), 83 deletions(-)
diff --git a/includes/parser/CoreTagHooks.php b/includes/parser/CoreTagHooks.php
index df868eabd5..9755ea93f6 100644
--- a/includes/parser/CoreTagHooks.php
+++ b/includes/parser/CoreTagHooks.php
@@ -141,7 +141,7 @@ class CoreTagHooks {
 
 		$parser->getOutput()->setIndicator(
 			trim( $attributes['name'] ),
-			$parser->recursiveTagParse( $content, $frame )
+			Parser::stripOuterParagraph( $parser->recursiveTagParseFully( $content, $frame ) )
 		);
 
 		return '';
diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php
index fe0c81f338..e6486ff79f 100644
--- a/includes/parser/Parser.php
+++ b/includes/parser/Parser.php
@@ -389,7 +389,7 @@ class Parser {
 		 * to internalParse() which does all the real work.
 		 */
 
-		global $wgUseTidy, $wgAlwaysUseTidy, $wgShowHostnames;
+		global $wgShowHostnames;
 		$fname = __METHOD__ . '-' . wfGetCaller();
 		wfProfileIn( __METHOD__ );
 		wfProfileIn( $fname );
@@ -430,40 +430,7 @@ class Parser {
 		$text = $this->internalParse( $text );
 		wfRunHooks( 'ParserAfterParse', array( &$this, &$text, &$this->mStripState ) );
 
-		$text = $this->mStripState->unstripGeneral( $text );
-
-		# Clean up special characters, only run once, next-to-last before doBlockLevels
-		$fixtags = array(
-			# french spaces, last one Guillemet-left
-			# only if there is something before the space
-			'/(.) (?=\\?|:|;|!|%|\\302\\273)/' => '\\1&#160;',
-			# french spaces, Guillemet-right
-			'/(\\302\\253) /' => '\\1&#160;',
-			'/&#160;(!\s*important)/' => ' \\1', # Beware of CSS magic word !important, bug #11874.
-		);
-		$text = preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
-
-		$text = $this->doBlockLevels( $text, $linestart );
-
-		$this->replaceLinkHolders( $text );
-
-		/**
-		 * The input doesn't get language converted if
-		 * a) It's disabled
-		 * b) Content isn't converted
-		 * c) It's a conversion table
-		 * d) it is an interface message (which is in the user language)
-		 */
-		if ( !( $options->getDisableContentConversion()
-			|| isset( $this->mDoubleUnderscores['nocontentconvert'] ) )
-		) {
-			if ( !$this->mOptions->getInterfaceMessage() ) {
-				# The position of the convert() call should not be changed. it
-				# assumes that the links are all replaced and the only thing left
-				# is the <nowiki> mark.
-				$text = $this->getConverterLanguage()->convert( $text );
-			}
-		}
+		$text = $this->internalParseHalfParsed( $text, true, $linestart );
 
 		/**
 		 * A converted title will be provided in the output object if title and
@@ -486,45 +453,6 @@ class Parser {
 			}
 		}
 
-		$text = $this->mStripState->unstripNoWiki( $text );
-
-		wfRunHooks( 'ParserBeforeTidy', array( &$this, &$text ) );
-
-		$text = $this->replaceTransparentTags( $text );
-		$text = $this->mStripState->unstripGeneral( $text );
-
-		$text = Sanitizer::normalizeCharReferences( $text );
-
-		if ( ( $wgUseTidy && $this->mOptions->getTidy() ) || $wgAlwaysUseTidy ) {
-			$text = MWTidy::tidy( $text );
-		} else {
-			# attempt to sanitize at least some nesting problems
-			# (bug #2702 and quite a few others)
-			$tidyregs = array(
-				# ''Something [http://www.cool.com cool''] -->
-				# <i>Something</i><a href="http://www.cool.com"..><i>cool></i></a>
-				'/(<([bi])>)(<([bi])>)?([^<]*)(<\/?a[^<]*>)([^<]*)(<\/\\4>)?(<\/\\2>)/' =>
-				'\\1\\3\\5\\8\\9\\6\\1\\3\\7\\8\\9',
-				# fix up an anchor inside another anchor, only
-				# at least for a single single nested link (bug 3695)
-				'/(<a[^>]+>)([^<]*)(<a[^>]+>[^<]*)<\/a>(.*)<\/a>/' =>
-				'\\1\\2</a>\\3</a>\\1\\4</a>',
-				# fix div inside inline elements- doBlockLevels won't wrap a line which
-				# contains a div, so fix it up here; replace
-				# div with escaped text
-				'/(<([aib]) [^>]+>)([^<]*)(<div([^>]*)>)(.*)(<\/div>)([^<]*)(<\/\\2>)/' =>
-				'\\1\\3&lt;div\\5&gt;\\6&lt;/div&gt;\\8\\9',
-				# remove empty italic or bold tag pairs, some
-				# introduced by rules above
-				'/<([bi])><\/\\1>/' => '',
-			);
-
-			$text = preg_replace(
-				array_keys( $tidyregs ),
-				array_values( $tidyregs ),
-				$text );
-		}
-
 		if ( $this->mExpensiveFunctionCount > $this->mOptions->getExpensiveParserFunctionLimit() ) {
 			$this->limitationWarn( 'expensive-parserfunction',
 				$this->mExpensiveFunctionCount,
@@ -532,8 +460,6 @@ class Parser {
 			);
 		}
 
-		wfRunHooks( 'ParserAfterTidy', array( &$this, &$text ) );
-
 		# Information on include size limits, for the benefit of users who try to skirt them
 		if ( $this->mOptions->getEnableLimitReport() ) {
 			$max = $this->mOptions->getMaxIncludeSize();
@@ -621,15 +547,26 @@ class Parser {
 	}
 
 	/**
-	 * Recursive parser entry point that can be called from an extension tag
-	 * hook.
+	 * Half-parse wikitext to half-parsed HTML. This recursive parser entry point
+	 * can be called from an extension tag hook.
 	 *
-	 * If $frame is not provided, then template variables (e.g., {{{1}}}) within $text are not expanded
+	 * The output of this function IS NOT SAFE PARSED HTML; it is "half-parsed"
+	 * instead, which means that lists and links have not been fully parsed yet,
+	 * and strip markers are still present.
+	 *
+	 * Use recursiveTagParseFully() to fully parse wikitext to output-safe HTML.
+	 *
+	 * Use this function if you're a parser tag hook and you want to parse
+	 * wikitext before or after applying additional transformations, and you
+	 * intend to *return the result as hook output*, which will cause it to go
+	 * through the rest of parsing process automatically.
+	 *
+	 * If $frame is not provided, then template variables (e.g., {{{1}}}) within
+	 * $text are not expanded
 	 *
 	 * @param string $text Text extension wants to have parsed
 	 * @param bool|PPFrame $frame The frame to use for expanding any template variables
-	 *
-	 * @return string
+	 * @return string UNSAFE half-parsed HTML
 	 */
 	public function recursiveTagParse( $text, $frame = false ) {
 		wfProfileIn( __METHOD__ );
@@ -640,6 +577,31 @@ class Parser {
 		return $text;
 	}
 
+	/**
+	 * Fully parse wikitext to fully parsed HTML. This recursive parser entry
+	 * point can be called from an extension tag hook.
+	 *
+	 * The output of this function is fully-parsed HTML that is safe for output.
+	 * If you're a parser tag hook, you might want to use recursiveTagParse()
+	 * instead.
+	 *
+	 * If $frame is not provided, then template variables (e.g., {{{1}}}) within
+	 * $text are not expanded
+	 *
+	 * @since 1.25
+	 *
+	 * @param string $text Text extension wants to have parsed
+	 * @param bool|PPFrame $frame The frame to use for expanding any template variables
+	 * @return string Fully parsed HTML
+	 */
+	public function recursiveTagParseFully( $text, $frame = false ) {
+		wfProfileIn( __METHOD__ );
+		$text = $this->recursiveTagParse( $text, $frame );
+		$text = $this->internalParseHalfParsed( $text, false );
+		wfProfileOut( __METHOD__ );
+		return $text;
+	}
+
 	/**
 	 * Expand templates and variables in the text, producing valid, static wikitext.
 	 * Also removes comments.
@@ -1227,7 +1189,7 @@ class Parser {
 	}
 
 	/**
-	 * Helper function for parse() that transforms wiki markup into
+	 * Helper function for parse() that transforms wiki markup into half-parsed
 	 * HTML. Only called for $mOutputType == self::OT_HTML.
 	 *
 	 * @private
@@ -1300,6 +1262,101 @@ class Parser {
 		return $text;
 	}
 
+	/**
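+	 * Helper function for parse() and recursiveTagParseFully() that transforms
+	 * half-parsed HTML into fully parsed HTML.
+	 *
+	 * @param string $text Half-parsed HTML
+	 * @param bool $isMain Whether to run the ParserBeforeTidy and ParserAfterTidy hooks
+	 * @param bool $linestart Passed through to doBlockLevels()
+	 * @return string Fully parsed HTML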
+	 */
+	private function internalParseHalfParsed( $text, $isMain = true, $linestart = true ) {
+		global $wgUseTidy, $wgAlwaysUseTidy;
+
+		$text = $this->mStripState->unstripGeneral( $text );
+
+		# Clean up special characters, only run once, next-to-last before doBlockLevels
+		$fixtags = array(
+			# french spaces, last one Guillemet-left
+			# only if there is something before the space
+			'/(.) (?=\\?|:|;|!|%|\\302\\273)/' => '\\1&#160;',
+			# french spaces, Guillemet-right
+			'/(\\302\\253) /' => '\\1&#160;',
+			'/&#160;(!\s*important)/' => ' \\1', # Beware of CSS magic word !important, bug #11874.
+		);
+		$text = preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text );
+
+		$text = $this->doBlockLevels( $text, $linestart );
+
+		$this->replaceLinkHolders( $text );
+
+		/**
+		 * The input doesn't get language converted if
+		 * a) It's disabled
+		 * b) Content isn't converted
+		 * c) It's a conversion table
+		 * d) it is an interface message (which is in the user language)
+		 */
+		if ( !( $this->mOptions->getDisableContentConversion()
+			|| isset( $this->mDoubleUnderscores['nocontentconvert'] ) )
+		) {
+			if ( !$this->mOptions->getInterfaceMessage() ) {
+				# The position of the convert() call should not be changed. It
+				# assumes that the links are all replaced and the only thing left
+				# is the <nowiki> mark.
+				$text = $this->getConverterLanguage()->convert( $text );
+			}
+		}
+
+		$text = $this->mStripState->unstripNoWiki( $text );
+
+		if ( $isMain ) {
+			wfRunHooks( 'ParserBeforeTidy', array( &$this, &$text ) );
+		}
+
+		$text = $this->replaceTransparentTags( $text );
+		$text = $this->mStripState->unstripGeneral( $text );
+
+		$text = Sanitizer::normalizeCharReferences( $text );
+
+		if ( ( $wgUseTidy && $this->mOptions->getTidy() ) || $wgAlwaysUseTidy ) {
+			$text = MWTidy::tidy( $text );
+		} else {
+			# attempt to sanitize at least some nesting problems
+			# (bug #2702 and quite a few others)
+			$tidyregs = array(
+				# ''Something [http://www.cool.com cool''] -->
+				# <i>Something</i><a href="http://www.cool.com"..><i>cool</i></a>
+				'/(<([bi])>)(<([bi])>)?([^<]*)(<\/?a[^<]*>)([^<]*)(<\/\\4>)?(<\/\\2>)/' =>
+				'\\1\\3\\5\\8\\9\\6\\1\\3\\7\\8\\9',
+				# fix up an anchor inside another anchor, only
+				# at least for a single nested link (bug 3695)
+				'/(<a[^>]+>)([^<]*)(<a[^>]+>[^<]*)<\/a>(.*)<\/a>/' =>
+				'\\1\\2</a>\\3</a>\\1\\4</a>',
+				# fix div inside inline elements- doBlockLevels won't wrap a line which
+				# contains a div, so fix it up here; replace
+				# div with escaped text
+				'/(<([aib]) [^>]+>)([^<]*)(<div([^>]*)>)(.*)(<\/div>)([^<]*)(<\/\\2>)/' =>
+				'\\1\\3&lt;div\\5&gt;\\6&lt;/div&gt;\\8\\9',
+				# remove empty italic or bold tag pairs, some
+				# introduced by rules above
+				'/<([bi])><\/\\1>/' => '',
+			);
+
+			$text = preg_replace(
+				array_keys( $tidyregs ),
+				array_values( $tidyregs ),
+				$text );
+		}
+
+		if ( $isMain ) {
+			wfRunHooks( 'ParserAfterTidy', array( &$this, &$text ) );
+		}
+
+		return $text;
+	}
+
 	/**
 	 * Replace special strings like "ISBN xxx" and "RFC xxx" with
 	 * magic external links.
-- 
2.20.1