From 565e9fa077f127449a7141552546e889869fe09a Mon Sep 17 00:00:00 2001 From: =?utf8?q?Bartosz=20Dziewo=C5=84ski?= Date: Sun, 2 Nov 2014 19:14:53 +0100 Subject: [PATCH] Correctly parse <indicator> contents, Parser rejiggering includes/parser/Parser.php * Pull out a chunk of code we need to reuse from parse() to internalParseHalfParsed(). This is a fully backwards-compatible change. Code changes: * Add a guard for running ParserBeforeTidy and ParserAfterTidy hooks, as extensions might not expect them to be called for snippets, only full page content. * Change $options to $this->mOptions. The bulk of parsing work is now done in internalParse() and internalParseHalfParsed(); parse() only handles four things: * Resetting parser state when a parse starts/finishes * Page title language conversion * Outputting limit report and limitation warnings * Running ParserAfterParse hook (dunno why, but it's documented) * Expand documentation for recursiveTagParse(), with some uppercase warnings so that no one does the stupid thing I did ever again. * Add new public method recursiveTagParseFully(), which is a recursive parser entry point that produces fully parsed HTML ready for inclusion in HTML output. Compared to Parser::parse(), it doesn't produce limit reports and doesn't run the ParserAfterParse hook. includes/parser/CoreTagHooks.php * Use the new recursiveTagParseFully() method. * Use Parser::stripOuterParagraph() to remove silly <p> tags. 
Bug: 72887 Change-Id: I89ae9a50b82245f9a9e4a903563aeb1c51b6103e --- includes/parser/CoreTagHooks.php | 2 +- includes/parser/Parser.php | 221 +++++++++++++++++++------------ 2 files changed, 140 insertions(+), 83 deletions(-) diff --git a/includes/parser/CoreTagHooks.php b/includes/parser/CoreTagHooks.php index df868eabd5..9755ea93f6 100644 --- a/includes/parser/CoreTagHooks.php +++ b/includes/parser/CoreTagHooks.php @@ -141,7 +141,7 @@ class CoreTagHooks { $parser->getOutput()->setIndicator( trim( $attributes['name'] ), - $parser->recursiveTagParse( $content, $frame ) + Parser::stripOuterParagraph( $parser->recursiveTagParseFully( $content, $frame ) ) ); return ''; diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php index fe0c81f338..e6486ff79f 100644 --- a/includes/parser/Parser.php +++ b/includes/parser/Parser.php @@ -389,7 +389,7 @@ class Parser { * to internalParse() which does all the real work. */ - global $wgUseTidy, $wgAlwaysUseTidy, $wgShowHostnames; + global $wgShowHostnames; $fname = __METHOD__ . '-' . wfGetCaller(); wfProfileIn( __METHOD__ ); wfProfileIn( $fname ); @@ -430,40 +430,7 @@ class Parser { $text = $this->internalParse( $text ); wfRunHooks( 'ParserAfterParse', array( &$this, &$text, &$this->mStripState ) ); - $text = $this->mStripState->unstripGeneral( $text ); - - # Clean up special characters, only run once, next-to-last before doBlockLevels - $fixtags = array( - # french spaces, last one Guillemet-left - # only if there is something before the space - '/(.) (?=\\?|:|;|!|%|\\302\\273)/' => '\\1 ', - # french spaces, Guillemet-right - '/(\\302\\253) /' => '\\1 ', - '/ (!\s*important)/' => ' \\1', # Beware of CSS magic word !important, bug #11874. 
- ); - $text = preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text ); - - $text = $this->doBlockLevels( $text, $linestart ); - - $this->replaceLinkHolders( $text ); - - /** - * The input doesn't get language converted if - * a) It's disabled - * b) Content isn't converted - * c) It's a conversion table - * d) it is an interface message (which is in the user language) - */ - if ( !( $options->getDisableContentConversion() - || isset( $this->mDoubleUnderscores['nocontentconvert'] ) ) - ) { - if ( !$this->mOptions->getInterfaceMessage() ) { - # The position of the convert() call should not be changed. it - # assumes that the links are all replaced and the only thing left - # is the mark. - $text = $this->getConverterLanguage()->convert( $text ); - } - } + $text = $this->internalParseHalfParsed( $text, true, $linestart ); /** * A converted title will be provided in the output object if title and @@ -486,45 +453,6 @@ class Parser { } } - $text = $this->mStripState->unstripNoWiki( $text ); - - wfRunHooks( 'ParserBeforeTidy', array( &$this, &$text ) ); - - $text = $this->replaceTransparentTags( $text ); - $text = $this->mStripState->unstripGeneral( $text ); - - $text = Sanitizer::normalizeCharReferences( $text ); - - if ( ( $wgUseTidy && $this->mOptions->getTidy() ) || $wgAlwaysUseTidy ) { - $text = MWTidy::tidy( $text ); - } else { - # attempt to sanitize at least some nesting problems - # (bug #2702 and quite a few others) - $tidyregs = array( - # ''Something [http://www.cool.com cool''] --> - # Somethingcool> - '/(<([bi])>)(<([bi])>)?([^<]*)(<\/?a[^<]*>)([^<]*)(<\/\\4>)?(<\/\\2>)/' => - '\\1\\3\\5\\8\\9\\6\\1\\3\\7\\8\\9', - # fix up an anchor inside another anchor, only - # at least for a single single nested link (bug 3695) - '/(]+>)([^<]*)(]+>[^<]*)<\/a>(.*)<\/a>/' => - '\\1\\2\\3\\1\\4', - # fix div inside inline elements- doBlockLevels won't wrap a line which - # contains a div, so fix it up here; replace - # div with escaped text - '/(<([aib]) 
[^>]+>)([^<]*)(]*)>)(.*)(<\/div>)([^<]*)(<\/\\2>)/' => - '\\1\\3<div\\5>\\6</div>\\8\\9', - # remove empty italic or bold tag pairs, some - # introduced by rules above - '/<([bi])><\/\\1>/' => '', - ); - - $text = preg_replace( - array_keys( $tidyregs ), - array_values( $tidyregs ), - $text ); - } - if ( $this->mExpensiveFunctionCount > $this->mOptions->getExpensiveParserFunctionLimit() ) { $this->limitationWarn( 'expensive-parserfunction', $this->mExpensiveFunctionCount, @@ -532,8 +460,6 @@ class Parser { ); } - wfRunHooks( 'ParserAfterTidy', array( &$this, &$text ) ); - # Information on include size limits, for the benefit of users who try to skirt them if ( $this->mOptions->getEnableLimitReport() ) { $max = $this->mOptions->getMaxIncludeSize(); @@ -621,15 +547,26 @@ class Parser { } /** - * Recursive parser entry point that can be called from an extension tag - * hook. + * Half-parse wikitext to half-parsed HTML. This recursive parser entry point + * can be called from an extension tag hook. * - * If $frame is not provided, then template variables (e.g., {{{1}}}) within $text are not expanded + * The output of this function IS NOT SAFE PARSED HTML; it is "half-parsed" + * instead, which means that lists and links have not been fully parsed yet, + * and strip markers are still present. + * + * Use recursiveTagParseFully() to fully parse wikitext to output-safe HTML. + * + * Use this function if you're a parser tag hook and you want to parse + * wikitext before or after applying additional transformations, and you + * intend to *return the result as hook output*, which will cause it to go + * through the rest of parsing process automatically. 
+ * + * If $frame is not provided, then template variables (e.g., {{{1}}}) within + * $text are not expanded * * @param string $text Text extension wants to have parsed * @param bool|PPFrame $frame The frame to use for expanding any template variables - * - * @return string + * @return string UNSAFE half-parsed HTML */ public function recursiveTagParse( $text, $frame = false ) { wfProfileIn( __METHOD__ ); @@ -640,6 +577,31 @@ class Parser { return $text; } + /** + * Fully parse wikitext to fully parsed HTML. This recursive parser entry + * point can be called from an extension tag hook. + * + * The output of this function is fully-parsed HTML that is safe for output. + * If you're a parser tag hook, you might want to use recursiveTagParse() + * instead. + * + * If $frame is not provided, then template variables (e.g., {{{1}}}) within + * $text are not expanded + * + * @since 1.25 + * + * @param string $text Text extension wants to have parsed + * @param bool|PPFrame $frame The frame to use for expanding any template variables + * @return string Fully parsed HTML + */ + public function recursiveTagParseFully( $text, $frame = false ) { + wfProfileIn( __METHOD__ ); + $text = $this->recursiveTagParse( $text, $frame ); + $text = $this->internalParseHalfParsed( $text, false ); + wfProfileOut( __METHOD__ ); + return $text; + } + /** * Expand templates and variables in the text, producing valid, static wikitext. * Also removes comments. @@ -1227,7 +1189,7 @@ class Parser { } /** - * Helper function for parse() that transforms wiki markup into + * Helper function for parse() that transforms wiki markup into half-parsed * HTML. Only called for $mOutputType == self::OT_HTML. * * @private @@ -1300,6 +1262,101 @@ class Parser { return $text; } + /** + * Helper function for parse() that transforms half-parsed HTML into fully + * parsed HTML. 
+ * + * @param string $text + * @param bool $isMain + * @param bool $linestart + * @return string + */ + private function internalParseHalfParsed( $text, $isMain = true, $linestart = true ) { + global $wgUseTidy, $wgAlwaysUseTidy; + + $text = $this->mStripState->unstripGeneral( $text ); + + # Clean up special characters, only run once, next-to-last before doBlockLevels + $fixtags = array( + # french spaces, last one Guillemet-left + # only if there is something before the space + '/(.) (?=\\?|:|;|!|%|\\302\\273)/' => '\\1 ', + # french spaces, Guillemet-right + '/(\\302\\253) /' => '\\1 ', + '/ (!\s*important)/' => ' \\1', # Beware of CSS magic word !important, bug #11874. + ); + $text = preg_replace( array_keys( $fixtags ), array_values( $fixtags ), $text ); + + $text = $this->doBlockLevels( $text, $linestart ); + + $this->replaceLinkHolders( $text ); + + /** + * The input doesn't get language converted if + * a) It's disabled + * b) Content isn't converted + * c) It's a conversion table + * d) it is an interface message (which is in the user language) + */ + if ( !( $this->mOptions->getDisableContentConversion() + || isset( $this->mDoubleUnderscores['nocontentconvert'] ) ) + ) { + if ( !$this->mOptions->getInterfaceMessage() ) { + # The position of the convert() call should not be changed. it + # assumes that the links are all replaced and the only thing left + # is the mark. 
+ $text = $this->getConverterLanguage()->convert( $text ); + } + } + + $text = $this->mStripState->unstripNoWiki( $text ); + + if ( $isMain ) { + wfRunHooks( 'ParserBeforeTidy', array( &$this, &$text ) ); + } + + $text = $this->replaceTransparentTags( $text ); + $text = $this->mStripState->unstripGeneral( $text ); + + $text = Sanitizer::normalizeCharReferences( $text ); + + if ( ( $wgUseTidy && $this->mOptions->getTidy() ) || $wgAlwaysUseTidy ) { + $text = MWTidy::tidy( $text ); + } else { + # attempt to sanitize at least some nesting problems + # (bug #2702 and quite a few others) + $tidyregs = array( + # ''Something [http://www.cool.com cool''] --> + # Somethingcool> + '/(<([bi])>)(<([bi])>)?([^<]*)(<\/?a[^<]*>)([^<]*)(<\/\\4>)?(<\/\\2>)/' => + '\\1\\3\\5\\8\\9\\6\\1\\3\\7\\8\\9', + # fix up an anchor inside another anchor, only + # at least for a single single nested link (bug 3695) + '/(]+>)([^<]*)(]+>[^<]*)<\/a>(.*)<\/a>/' => + '\\1\\2\\3\\1\\4', + # fix div inside inline elements- doBlockLevels won't wrap a line which + # contains a div, so fix it up here; replace + # div with escaped text + '/(<([aib]) [^>]+>)([^<]*)(]*)>)(.*)(<\/div>)([^<]*)(<\/\\2>)/' => + '\\1\\3<div\\5>\\6</div>\\8\\9', + # remove empty italic or bold tag pairs, some + # introduced by rules above + '/<([bi])><\/\\1>/' => '', + ); + + $text = preg_replace( + array_keys( $tidyregs ), + array_values( $tidyregs ), + $text ); + } + + if ( $isMain ) { + wfRunHooks( 'ParserAfterTidy', array( &$this, &$text ) ); + } + + return $text; + } + /** * Replace special strings like "ISBN xxx" and "RFC xxx" with * magic external links. -- 2.20.1