From: Ori Livneh Date: Tue, 26 May 2015 20:48:33 +0000 (-0700) Subject: Use a fixed marker prefix string in the Parser and MWTidy X-Git-Tag: 1.31.0-rc.0~11238^2 X-Git-Url: http://git.cyclocoop.org/%24image?a=commitdiff_plain;h=12571bde26d89fd8a86b26e2d8d4012803ddb3d7;p=lhc%2Fweb%2Fwiklou.git Use a fixed marker prefix string in the Parser and MWTidy Generating one-time, unique strip markers hurts us in multiple ways: * The strip marker regexes don't benefit from JIT compilation, so they are slower to execute than they could be. * Although the regexes don't benefit from JIT compilation, they are still compiled, because HHVM bets on regexes getting reused. This extra work is fairly costly (1-2% of CPU usage on the app servers) and doesn't pay off. * The size of the PCRE JIT cache is finite, and the caching of one-off regexes displaces from the cache regexes which are in fact reused. Tim's preferred solution (per his review comment on https://gerrit.wikimedia.org/r/167530/) is to use fixed strip markers. So: * Replace usage of $parser->mUniqPrefix with Parser::MARKER_PREFIX, which complements the existing Parser::MARKER_SUFFIX. * Deprecate Parser::mUniqPrefix and its accessor, Parser::uniqPrefix(). * Deprecate Parser::getRandomString(), since it is no longer useful. * In Preprocessor_*:preprocessToObj() and Parser::fetchTemplateAndTitle, replace any occurences of \x7f with '?', to prevent strip marker forgery. \x7f is not valid input anyway. * Deprecate the $prefix parameter for StripState::__construct, since a custom prefix may no longer be specified. Change-Id: I31d4556bbb07acb72c33fda335fa5a230379a03f --- diff --git a/RELEASE-NOTES-1.26 b/RELEASE-NOTES-1.26 index e51234eb8f..af9e9d2b64 100644 --- a/RELEASE-NOTES-1.26 +++ b/RELEASE-NOTES-1.26 @@ -56,6 +56,17 @@ changes to languages because of Bugzilla reports. * Removed maintenance script deleteImageMemcached.php. * MWFunction::newObj() was removed (deprecated in 1.25). ObjectFactory::getObjectFromSpec() should be used instead. +* The parser will no longer randomize the string it uses to mark the place of + items that were stripped during parsing. It will use a fixed string instead. + This causes the parser to re-use the regular expressions it uses to search + and replace markers rather than generate novel expressions on each parse. + Re-using regular expressions will improve performance on HHVM and the + forthcoming PHP 7. The interfaces changes accompanying this change are: + - Parser::getRandomString() and Parser::uniqPrefix() have been deprecated. + - The $uniq_prefix argument for Parser::extractTagsAndParams() and the + $prefix argument for StripState::_construct() are deprecated and their + value is ignored. + == Compatibility == diff --git a/includes/parser/MWTidy.php b/includes/parser/MWTidy.php index d446ccf66c..e29ee88f6d 100644 --- a/includes/parser/MWTidy.php +++ b/includes/parser/MWTidy.php @@ -40,13 +40,10 @@ class MWTidyWrapper { */ protected $mTokens; - protected $mUniqPrefix; - protected $mMarkerIndex; public function __construct() { $this->mTokens = null; - $this->mUniqPrefix = null; } /** @@ -55,8 +52,6 @@ class MWTidyWrapper { */ public function getWrapped( $text ) { $this->mTokens = new ReplacementArray; - $this->mUniqPrefix = "\x7fUNIQ" . - dechex( mt_rand( 0, 0x7fffffff ) ) . dechex( mt_rand( 0, 0x7fffffff ) ); $this->mMarkerIndex = 0; // Replace elements with placeholders @@ -86,7 +81,7 @@ class MWTidyWrapper { * @return string */ public function replaceCallback( $m ) { - $marker = "{$this->mUniqPrefix}-item-{$this->mMarkerIndex}" . Parser::MARKER_SUFFIX; + $marker = Parser::MARKER_PREFIX . "-item-{$this->mMarkerIndex}" . Parser::MARKER_SUFFIX; $this->mMarkerIndex++; $this->mTokens->setPair( $marker, $m[0] ); return $marker; diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php index 27de03929d..f86e73194a 100644 --- a/includes/parser/Parser.php +++ b/includes/parser/Parser.php @@ -114,8 +114,20 @@ class Parser { const OT_MSG = 3; const OT_PLAIN = 4; # like extractSections() - portions of the original are returned unchanged. - # Marker Suffix needs to be accessible staticly. + /** + * @var string Prefix and suffix for temporary replacement strings + * for the multipass parser. + * + * \x7f should never appear in input as it's disallowed in XML. + * Using it at the front also gives us a little extra robustness + * since it shouldn't match when butted up against identifier-like + * string constructs. + * + * Must not consist of all title characters, or else it will change + * the behavior of in a link. + */ const MARKER_SUFFIX = "-QINU\x7f"; + const MARKER_PREFIX = "\x7fUNIQ-"; # Markers used for wrapping the table of contents const TOC_START = ''; @@ -206,9 +218,10 @@ class Parser { public $mInputSize = false; # For {{PAGESIZE}} on current page. /** - * @var string - */ - public $mUniqPrefix; + * @var string Deprecated accessor for the strip marker prefix. + * @deprecated since 1.26; use Parser::MARKER_PREFIX instead. + **/ + public $mUniqPrefix = Parser::MARKER_PREFIX; /** * @var array Array with the language name of each language link (i.e. the @@ -336,18 +349,7 @@ class Parser { $this->mLangLinkLanguages = array(); $this->currentRevisionCache = null; - /** - * Prefix for temporary replacement strings for the multipass parser. - * \x07 should never appear in input as it's disallowed in XML. - * Using it at the front also gives us a little extra robustness - * since it shouldn't match when butted up against identifier-like - * string constructs. - * - * Must not consist of all title characters, or else it will change - * the behavior of in a link. - */ - $this->mUniqPrefix = "\x7fUNIQ" . self::getRandomString(); - $this->mStripState = new StripState( $this->mUniqPrefix ); + $this->mStripState = new StripState; # Clear these on every parse, bug 4549 $this->mTplRedirCache = $this->mTplDomCache = array(); @@ -399,6 +401,9 @@ class Parser { global $wgShowHostnames; if ( $clearState ) { + // We use U+007F DELETE to construct strip markers, so we have to make + // sure that this character does not occur in the input text. + $text = strtr( $text, "\x7f", "?" ); $magicScopeVariable = $this->lock(); } @@ -410,11 +415,6 @@ class Parser { $this->mOutput->resetParseStartTime(); } - # Remove the strip marker tag prefix from the input, if present. - if ( $clearState ) { - $text = str_replace( $this->mUniqPrefix, '', $text ); - } - $oldRevisionId = $this->mRevisionId; $oldRevisionObject = $this->mRevisionObject; $oldRevisionTimestamp = $this->mRevisionTimestamp; @@ -686,8 +686,10 @@ class Parser { * Get a random string * * @return string + * @deprecated since 1.26; use wfRandomString() instead. */ public static function getRandomString() { + wfDeprecated( __METHOD__, '1.26' ); return wfRandomString( 16 ); } @@ -705,18 +707,11 @@ class Parser { * Accessor for mUniqPrefix. * * @return string + * @deprecated since 1.26; use Parser::MARKER_PREFIX instead. */ public function uniqPrefix() { - if ( !isset( $this->mUniqPrefix ) ) { - # @todo FIXME: This is probably *horribly wrong* - # LanguageConverter seems to want $wgParser's uniqPrefix, however - # if this is called for a parser cache hit, the parser may not - # have ever been initialized in the first place. - # Not really sure what the heck is supposed to be going on here. - return ''; - # throw new MWException( "Accessing uninitialized mUniqPrefix" ); - } - return $this->mUniqPrefix; + wfDeprecated( __METHOD__, '1.26' ); + return self::MARKER_PREFIX; } /** @@ -907,10 +902,14 @@ class Parser { * @param array $elements List of element names. Comments are always extracted. * @param string $text Source text string. * @param array $matches Out parameter, Array: extracted tags - * @param string $uniq_prefix + * @param string|null $uniq_prefix * @return string Stripped text + * @since 1.26 The uniq_prefix argument is deprecated. */ - public static function extractTagsAndParams( $elements, $text, &$matches, $uniq_prefix = '' ) { + public static function extractTagsAndParams( $elements, $text, &$matches, $uniq_prefix = null ) { + if ( $uniq_prefix !== null ) { + wfDeprecated( __METHOD__ . ' called with $prefix argument', '1.26' ); + } static $n = 1; $stripped = ''; $matches = array(); @@ -938,7 +937,7 @@ class Parser { $inside = $p[4]; } - $marker = "$uniq_prefix-$element-" . sprintf( '%08X', $n++ ) . self::MARKER_SUFFIX; + $marker = self::MARKER_PREFIX . "-$element-" . sprintf( '%08X', $n++ ) . self::MARKER_SUFFIX; $stripped .= $marker; if ( $close === '/>' ) { @@ -991,10 +990,10 @@ class Parser { * @return string */ public function insertStripItem( $text ) { - $rnd = "{$this->mUniqPrefix}-item-{$this->mMarkerIndex}-" . self::MARKER_SUFFIX; + $marker = self::MARKER_PREFIX . "-item-{$this->mMarkerIndex}-" . self::MARKER_SUFFIX; $this->mMarkerIndex++; - $this->mStripState->addGeneral( $rnd, $text ); - return $rnd; + $this->mStripState->addGeneral( $marker, $text ); + return $marker; } /** @@ -1257,7 +1256,7 @@ class Parser { # replaceInternalLinks may sometimes leave behind # absolute URLs, which have to be masked to hide them from replaceExternalLinks - $text = str_replace( $this->mUniqPrefix . 'NOPARSE', '', $text ); + $text = str_replace( self::MARKER_PREFIX . 'NOPARSE', '', $text ); $text = $this->doMagicLinks( $text ); $text = $this->formatHeadings( $text, $origText, $isMain ); @@ -2355,7 +2354,7 @@ class Parser { */ public function armorLinks( $text ) { return preg_replace( '/\b((?i)' . $this->mUrlProtocols . ')/', - "{$this->mUniqPrefix}NOPARSE$1", $text ); + self::MARKER_PREFIX . "NOPARSE$1", $text ); } /** @@ -2627,7 +2626,7 @@ class Parser { $closematch = preg_match( '/(?:<\\/table|<\\/h1|<\\/h2|<\\/h3|<\\/h4|<\\/h5|<\\/h6|' . 'mUniqPrefix + . self::MARKER_PREFIX . '-pre|<\\/li|<\\/ul|<\\/ol|<\\/dl|<\\/?center)/iS', $t ); @@ -3896,7 +3895,11 @@ class Parser { // Defaults to Parser::statelessFetchTemplate() $templateCb = $this->mOptions->getTemplateCallback(); $stuff = call_user_func( $templateCb, $title, $this ); + // We use U+007F DELETE to distinguish strip markers from regular text. $text = $stuff['text']; + if ( is_string( $stuff['text'] ) ) { + $text = strtr( $text, "\x7f", "?" ); + } $finalTitle = isset( $stuff['finalTitle'] ) ? $stuff['finalTitle'] : $title; if ( isset( $stuff['deps'] ) ) { foreach ( $stuff['deps'] as $dep ) { @@ -4190,7 +4193,7 @@ class Parser { $name = $frame->expand( $params['name'] ); $attrText = !isset( $params['attr'] ) ? null : $frame->expand( $params['attr'] ); $content = !isset( $params['inner'] ) ? null : $frame->expand( $params['inner'] ); - $marker = "{$this->mUniqPrefix}-$name-" + $marker = self::MARKER_PREFIX . "-$name-" . sprintf( '%08X', $this->mMarkerIndex++ ) . self::MARKER_SUFFIX; $isFunctionTag = isset( $this->mFunctionTagHooks[strtolower( $name )] ) && @@ -4435,7 +4438,7 @@ class Parser { $prevlevel = 0; $toclevel = 0; $prevtoclevel = 0; - $markerRegex = "{$this->mUniqPrefix}-h-(\d+)-" . self::MARKER_SUFFIX; + $markerRegex = self::MARKER_PREFIX . "-h-(\d+)-" . self::MARKER_SUFFIX; $baseTitleText = $this->mTitle->getPrefixedDBkey(); $oldType = $this->mOutputType; $this->setOutputType( self::OT_WIKI ); @@ -5774,7 +5777,7 @@ class Parser { public function replaceTransparentTags( $text ) { $matches = array(); $elements = array_keys( $this->mTransparentTagHooks ); - $text = self::extractTagsAndParams( $elements, $text, $matches, $this->mUniqPrefix ); + $text = self::extractTagsAndParams( $elements, $text, $matches ); $replacements = array(); foreach ( $matches as $marker => $data ) { @@ -6233,7 +6236,7 @@ class Parser { $i = 0; $out = ''; while ( $i < strlen( $s ) ) { - $markerStart = strpos( $s, $this->mUniqPrefix, $i ); + $markerStart = strpos( $s, self::MARKER_PREFIX, $i ); if ( $markerStart === false ) { $out .= call_user_func( $callback, substr( $s, $i ) ); break; diff --git a/includes/parser/ParserDiffTest.php b/includes/parser/ParserDiffTest.php index 174c1d61ac..32f5d06826 100644 --- a/includes/parser/ParserDiffTest.php +++ b/includes/parser/ParserDiffTest.php @@ -29,7 +29,6 @@ class ParserDiffTest public $parsers; public $conf; public $shortOutput = false; - public $dtUniqPrefix; public function __construct( $conf ) { if ( !isset( $conf['parsers'] ) ) { @@ -43,12 +42,6 @@ class ParserDiffTest return; } - global $wgHooks; - static $doneHook = false; - if ( !$doneHook ) { - $doneHook = true; - $wgHooks['ParserClearState'][] = array( $this, 'onClearState' ); - } if ( isset( $this->conf['shortOutput'] ) ) { $this->shortOutput = $this->conf['shortOutput']; } @@ -126,18 +119,4 @@ class ParserDiffTest $parser->setFunctionHook( $id, $callback, $flags ); } } - - /** - * @param Parser $parser - * @return bool - */ - public function onClearState( &$parser ) { - // hack marker prefixes to get identical output - if ( !isset( $this->dtUniqPrefix ) ) { - $this->dtUniqPrefix = $parser->uniqPrefix(); - } else { - $parser->mUniqPrefix = $this->dtUniqPrefix; - } - return true; - } } diff --git a/includes/parser/Preprocessor_DOM.php b/includes/parser/Preprocessor_DOM.php index caef648942..ff34d9b535 100644 --- a/includes/parser/Preprocessor_DOM.php +++ b/includes/parser/Preprocessor_DOM.php @@ -853,7 +853,8 @@ class PPDStackElement { $close, // Matching closing character $count, // Number of opening characters found (number of "=" for heading) $parts, // Array of PPDPart objects describing pipe-separated parts. - $lineStart; // True if the open char appeared at the start of the input line. Not set for headings. + $lineStart; // True if the open char appeared at the start of the input line. + // Not set for headings. public $partClass = 'PPDPart'; @@ -1271,7 +1272,7 @@ class PPFrame_DOM implements PPFrame { $titleText = $this->title->getPrefixedDBkey(); $this->parser->mHeadings[] = array( $titleText, $headingIndex ); $serial = count( $this->parser->mHeadings ) - 1; - $marker = "{$this->parser->mUniqPrefix}-h-$serial-" . Parser::MARKER_SUFFIX; + $marker = Parser::MARKER_PREFIX . "-h-$serial-" . Parser::MARKER_SUFFIX; $count = $contextNode->getAttribute( 'level' ); $s = substr( $s, 0, $count ) . $marker . substr( $s, $count ); $this->parser->mStripState->addGeneral( $marker, '' ); diff --git a/includes/parser/Preprocessor_Hash.php b/includes/parser/Preprocessor_Hash.php index d32fb574ef..308ef44c99 100644 --- a/includes/parser/Preprocessor_Hash.php +++ b/includes/parser/Preprocessor_Hash.php @@ -112,7 +112,6 @@ class Preprocessor_Hash implements Preprocessor { * @return PPNode_Hash_Tree */ public function preprocessToObj( $text, $flags = 0 ) { - // Check cache. global $wgMemc, $wgPreprocessorCacheThreshold; @@ -1185,7 +1184,7 @@ class PPFrame_Hash implements PPFrame { $titleText = $this->title->getPrefixedDBkey(); $this->parser->mHeadings[] = array( $titleText, $bits['i'] ); $serial = count( $this->parser->mHeadings ) - 1; - $marker = "{$this->parser->mUniqPrefix}-h-$serial-" . Parser::MARKER_SUFFIX; + $marker = Parser::MARKER_PREFIX . "-h-$serial-" . Parser::MARKER_SUFFIX; $s = substr( $s, 0, $bits['level'] ) . $marker . substr( $s, $bits['level'] ); $this->parser->mStripState->addGeneral( $marker, '' ); $out .= $s; diff --git a/includes/parser/StripState.php b/includes/parser/StripState.php index 7e38acc70c..b11dc8c388 100644 --- a/includes/parser/StripState.php +++ b/includes/parser/StripState.php @@ -37,15 +37,20 @@ class StripState { const UNSTRIP_RECURSION_LIMIT = 20; /** - * @param string $prefix + * @param string|null $prefix + * @since 1.26 The prefix argument should be omitted, as the strip marker + * prefix string is now a constant. */ - public function __construct( $prefix ) { - $this->prefix = $prefix; + public function __construct( $prefix = null ) { + if ( $prefix !== null ) { + wfDeprecated( __METHOD__ . ' with called with $prefix argument' . + ' (call with no arguments instead)', '1.26' ); + } $this->data = array( 'nowiki' => array(), 'general' => array() ); - $this->regex = "/{$this->prefix}([^\x7f]+)" . Parser::MARKER_SUFFIX . '/'; + $this->regex = '/' . Parser::MARKER_PREFIX . "([^\x7f]+)" . Parser::MARKER_SUFFIX . '/'; $this->circularRefGuard = array(); } @@ -166,10 +171,10 @@ class StripState { * @return StripState */ public function getSubState( $text ) { - $subState = new StripState( $this->prefix ); + $subState = new StripState(); $pos = 0; while ( true ) { - $startPos = strpos( $text, $this->prefix, $pos ); + $startPos = strpos( $text, Parser::MARKER_PREFIX, $pos ); $endPos = strpos( $text, Parser::MARKER_SUFFIX, $pos ); if ( $startPos === false || $endPos === false ) { break; @@ -202,7 +207,7 @@ class StripState { * @return array */ public function merge( $otherState, $texts ) { - $mergePrefix = Parser::getRandomString(); + $mergePrefix = wfRandomString( 16 ); foreach ( $otherState->data as $type => $items ) { foreach ( $items as $key => $value ) { @@ -222,7 +227,7 @@ class StripState { */ protected function mergeCallback( $m ) { $key = $m[1]; - return "{$this->prefix}{$this->tempMergePrefix}-$key" . Parser::MARKER_SUFFIX; + return Parser::MARKER_PREFIX . $this->tempMergePrefix . '-' . $key . Parser::MARKER_SUFFIX; } /** diff --git a/languages/LanguageConverter.php b/languages/LanguageConverter.php index 844888ee7d..a6b687cfae 100644 --- a/languages/LanguageConverter.php +++ b/languages/LanguageConverter.php @@ -355,12 +355,7 @@ class LanguageConverter { 2. HTML entities 3. placeholders created by the parser */ - global $wgParser; - if ( isset( $wgParser ) && $wgParser->UniqPrefix() != '' ) { - $marker = '|' . $wgParser->UniqPrefix() . '[\-a-zA-Z0-9]+'; - } else { - $marker = ''; - } + $marker = '|' . Parser::MARKER_PREFIX . '[\-a-zA-Z0-9]+'; // this one is needed when the text is inside an HTML markup $htmlfix = '|<[^>]+$|^[^<>]*>';