Generating one-time, unique strip markers hurts us in multiple ways:
* The strip marker regexes don't benefit from JIT compilation, so they are
slower to execute than they could be.
* Although the regexes don't benefit from JIT compilation, they are still
compiled, because HHVM bets on regexes getting reused. This extra work is
fairly costly (1-2% of CPU usage on the app servers) and doesn't pay off.
* The size of the PCRE JIT cache is finite, and the caching of one-off regexes
displaces from the cache regexes which are in fact reused.
Tim's preferred solution (per his review comment on
https://gerrit.wikimedia.org/r/167530/) is to use fixed strip markers.
So:
* Replace usage of $parser->mUniqPrefix with Parser::MARKER_PREFIX, which
complements the existing Parser::MARKER_SUFFIX.
* Deprecate Parser::mUniqPrefix and its accessor, Parser::uniqPrefix().
* Deprecate Parser::getRandomString(), since it is no longer useful.
* In Preprocessor_*:preprocessToObj() and Parser::fetchTemplateAndTitle,
replace any occurences of \x7f with '?', to prevent strip marker forgery.
\x7f is not valid input anyway.
* Deprecate the $prefix parameter for StripState::__construct, since a custom
prefix may no longer be specified.
Change-Id: I31d4556bbb07acb72c33fda335fa5a230379a03f
* Removed maintenance script deleteImageMemcached.php.
* MWFunction::newObj() was removed (deprecated in 1.25).
ObjectFactory::getObjectFromSpec() should be used instead.
* Removed maintenance script deleteImageMemcached.php.
* MWFunction::newObj() was removed (deprecated in 1.25).
ObjectFactory::getObjectFromSpec() should be used instead.
+* The parser will no longer randomize the string it uses to mark the place of
+ items that were stripped during parsing. It will use a fixed string instead.
+ This causes the parser to re-use the regular expressions it uses to search
+ and replace markers rather than generate novel expressions on each parse.
+ Re-using regular expressions will improve performance on HHVM and the
+ forthcoming PHP 7. The interfaces changes accompanying this change are:
+ - Parser::getRandomString() and Parser::uniqPrefix() have been deprecated.
+ - The $uniq_prefix argument for Parser::extractTagsAndParams() and the
+ $prefix argument for StripState::_construct() are deprecated and their
+ value is ignored.
+
- protected $mUniqPrefix;
-
protected $mMarkerIndex;
public function __construct() {
$this->mTokens = null;
protected $mMarkerIndex;
public function __construct() {
$this->mTokens = null;
- $this->mUniqPrefix = null;
*/
public function getWrapped( $text ) {
$this->mTokens = new ReplacementArray;
*/
public function getWrapped( $text ) {
$this->mTokens = new ReplacementArray;
- $this->mUniqPrefix = "\x7fUNIQ" .
- dechex( mt_rand( 0, 0x7fffffff ) ) . dechex( mt_rand( 0, 0x7fffffff ) );
$this->mMarkerIndex = 0;
// Replace <mw:editsection> elements with placeholders
$this->mMarkerIndex = 0;
// Replace <mw:editsection> elements with placeholders
* @return string
*/
public function replaceCallback( $m ) {
* @return string
*/
public function replaceCallback( $m ) {
- $marker = "{$this->mUniqPrefix}-item-{$this->mMarkerIndex}" . Parser::MARKER_SUFFIX;
+ $marker = Parser::MARKER_PREFIX . "-item-{$this->mMarkerIndex}" . Parser::MARKER_SUFFIX;
$this->mMarkerIndex++;
$this->mTokens->setPair( $marker, $m[0] );
return $marker;
$this->mMarkerIndex++;
$this->mTokens->setPair( $marker, $m[0] );
return $marker;
const OT_MSG = 3;
const OT_PLAIN = 4; # like extractSections() - portions of the original are returned unchanged.
const OT_MSG = 3;
const OT_PLAIN = 4; # like extractSections() - portions of the original are returned unchanged.
- # Marker Suffix needs to be accessible staticly.
+ /**
+ * @var string Prefix and suffix for temporary replacement strings
+ * for the multipass parser.
+ *
+ * \x7f should never appear in input as it's disallowed in XML.
+ * Using it at the front also gives us a little extra robustness
+ * since it shouldn't match when butted up against identifier-like
+ * string constructs.
+ *
+ * Must not consist of all title characters, or else it will change
+ * the behavior of <nowiki> in a link.
+ */
const MARKER_SUFFIX = "-QINU\x7f";
const MARKER_SUFFIX = "-QINU\x7f";
+ const MARKER_PREFIX = "\x7fUNIQ-";
# Markers used for wrapping the table of contents
const TOC_START = '<mw:toc>';
# Markers used for wrapping the table of contents
const TOC_START = '<mw:toc>';
public $mInputSize = false; # For {{PAGESIZE}} on current page.
/**
public $mInputSize = false; # For {{PAGESIZE}} on current page.
/**
- * @var string
- */
- public $mUniqPrefix;
+ * @var string Deprecated accessor for the strip marker prefix.
+ * @deprecated since 1.26; use Parser::MARKER_PREFIX instead.
+ **/
+ public $mUniqPrefix = Parser::MARKER_PREFIX;
/**
* @var array Array with the language name of each language link (i.e. the
/**
* @var array Array with the language name of each language link (i.e. the
$this->mLangLinkLanguages = array();
$this->currentRevisionCache = null;
$this->mLangLinkLanguages = array();
$this->currentRevisionCache = null;
- /**
- * Prefix for temporary replacement strings for the multipass parser.
- * \x07 should never appear in input as it's disallowed in XML.
- * Using it at the front also gives us a little extra robustness
- * since it shouldn't match when butted up against identifier-like
- * string constructs.
- *
- * Must not consist of all title characters, or else it will change
- * the behavior of <nowiki> in a link.
- */
- $this->mUniqPrefix = "\x7fUNIQ" . self::getRandomString();
- $this->mStripState = new StripState( $this->mUniqPrefix );
+ $this->mStripState = new StripState;
# Clear these on every parse, bug 4549
$this->mTplRedirCache = $this->mTplDomCache = array();
# Clear these on every parse, bug 4549
$this->mTplRedirCache = $this->mTplDomCache = array();
global $wgShowHostnames;
if ( $clearState ) {
global $wgShowHostnames;
if ( $clearState ) {
+ // We use U+007F DELETE to construct strip markers, so we have to make
+ // sure that this character does not occur in the input text.
+ $text = strtr( $text, "\x7f", "?" );
$magicScopeVariable = $this->lock();
}
$magicScopeVariable = $this->lock();
}
$this->mOutput->resetParseStartTime();
}
$this->mOutput->resetParseStartTime();
}
- # Remove the strip marker tag prefix from the input, if present.
- if ( $clearState ) {
- $text = str_replace( $this->mUniqPrefix, '', $text );
- }
-
$oldRevisionId = $this->mRevisionId;
$oldRevisionObject = $this->mRevisionObject;
$oldRevisionTimestamp = $this->mRevisionTimestamp;
$oldRevisionId = $this->mRevisionId;
$oldRevisionObject = $this->mRevisionObject;
$oldRevisionTimestamp = $this->mRevisionTimestamp;
* Get a random string
*
* @return string
* Get a random string
*
* @return string
+ * @deprecated since 1.26; use wfRandomString() instead.
*/
public static function getRandomString() {
*/
public static function getRandomString() {
+ wfDeprecated( __METHOD__, '1.26' );
return wfRandomString( 16 );
}
return wfRandomString( 16 );
}
* Accessor for mUniqPrefix.
*
* @return string
* Accessor for mUniqPrefix.
*
* @return string
+ * @deprecated since 1.26; use Parser::MARKER_PREFIX instead.
*/
public function uniqPrefix() {
*/
public function uniqPrefix() {
- if ( !isset( $this->mUniqPrefix ) ) {
- # @todo FIXME: This is probably *horribly wrong*
- # LanguageConverter seems to want $wgParser's uniqPrefix, however
- # if this is called for a parser cache hit, the parser may not
- # have ever been initialized in the first place.
- # Not really sure what the heck is supposed to be going on here.
- return '';
- # throw new MWException( "Accessing uninitialized mUniqPrefix" );
- }
- return $this->mUniqPrefix;
+ wfDeprecated( __METHOD__, '1.26' );
+ return self::MARKER_PREFIX;
* @param array $elements List of element names. Comments are always extracted.
* @param string $text Source text string.
* @param array $matches Out parameter, Array: extracted tags
* @param array $elements List of element names. Comments are always extracted.
* @param string $text Source text string.
* @param array $matches Out parameter, Array: extracted tags
- * @param string $uniq_prefix
+ * @param string|null $uniq_prefix
* @return string Stripped text
* @return string Stripped text
+ * @since 1.26 The uniq_prefix argument is deprecated.
- public static function extractTagsAndParams( $elements, $text, &$matches, $uniq_prefix = '' ) {
+ public static function extractTagsAndParams( $elements, $text, &$matches, $uniq_prefix = null ) {
+ if ( $uniq_prefix !== null ) {
+ wfDeprecated( __METHOD__ . ' called with $prefix argument', '1.26' );
+ }
static $n = 1;
$stripped = '';
$matches = array();
static $n = 1;
$stripped = '';
$matches = array();
- $marker = "$uniq_prefix-$element-" . sprintf( '%08X', $n++ ) . self::MARKER_SUFFIX;
+ $marker = self::MARKER_PREFIX . "-$element-" . sprintf( '%08X', $n++ ) . self::MARKER_SUFFIX;
$stripped .= $marker;
if ( $close === '/>' ) {
$stripped .= $marker;
if ( $close === '/>' ) {
* @return string
*/
public function insertStripItem( $text ) {
* @return string
*/
public function insertStripItem( $text ) {
- $rnd = "{$this->mUniqPrefix}-item-{$this->mMarkerIndex}-" . self::MARKER_SUFFIX;
+ $marker = self::MARKER_PREFIX . "-item-{$this->mMarkerIndex}-" . self::MARKER_SUFFIX;
- $this->mStripState->addGeneral( $rnd, $text );
- return $rnd;
+ $this->mStripState->addGeneral( $marker, $text );
+ return $marker;
# replaceInternalLinks may sometimes leave behind
# absolute URLs, which have to be masked to hide them from replaceExternalLinks
# replaceInternalLinks may sometimes leave behind
# absolute URLs, which have to be masked to hide them from replaceExternalLinks
- $text = str_replace( $this->mUniqPrefix . 'NOPARSE', '', $text );
+ $text = str_replace( self::MARKER_PREFIX . 'NOPARSE', '', $text );
$text = $this->doMagicLinks( $text );
$text = $this->formatHeadings( $text, $origText, $isMain );
$text = $this->doMagicLinks( $text );
$text = $this->formatHeadings( $text, $origText, $isMain );
*/
public function armorLinks( $text ) {
return preg_replace( '/\b((?i)' . $this->mUrlProtocols . ')/',
*/
public function armorLinks( $text ) {
return preg_replace( '/\b((?i)' . $this->mUrlProtocols . ')/',
- "{$this->mUniqPrefix}NOPARSE$1", $text );
+ self::MARKER_PREFIX . "NOPARSE$1", $text );
$closematch = preg_match(
'/(?:<\\/table|<\\/h1|<\\/h2|<\\/h3|<\\/h4|<\\/h5|<\\/h6|'
. '<td|<th|<\\/?blockquote|<\\/?div|<hr|<\\/pre|<\\/p|<\\/mw:|'
$closematch = preg_match(
'/(?:<\\/table|<\\/h1|<\\/h2|<\\/h3|<\\/h4|<\\/h5|<\\/h6|'
. '<td|<th|<\\/?blockquote|<\\/?div|<hr|<\\/pre|<\\/p|<\\/mw:|'
. '-pre|<\\/li|<\\/ul|<\\/ol|<\\/dl|<\\/?center)/iS',
$t
);
. '-pre|<\\/li|<\\/ul|<\\/ol|<\\/dl|<\\/?center)/iS',
$t
);
// Defaults to Parser::statelessFetchTemplate()
$templateCb = $this->mOptions->getTemplateCallback();
$stuff = call_user_func( $templateCb, $title, $this );
// Defaults to Parser::statelessFetchTemplate()
$templateCb = $this->mOptions->getTemplateCallback();
$stuff = call_user_func( $templateCb, $title, $this );
+ // We use U+007F DELETE to distinguish strip markers from regular text.
+ if ( is_string( $stuff['text'] ) ) {
+ $text = strtr( $text, "\x7f", "?" );
+ }
$finalTitle = isset( $stuff['finalTitle'] ) ? $stuff['finalTitle'] : $title;
if ( isset( $stuff['deps'] ) ) {
foreach ( $stuff['deps'] as $dep ) {
$finalTitle = isset( $stuff['finalTitle'] ) ? $stuff['finalTitle'] : $title;
if ( isset( $stuff['deps'] ) ) {
foreach ( $stuff['deps'] as $dep ) {
$name = $frame->expand( $params['name'] );
$attrText = !isset( $params['attr'] ) ? null : $frame->expand( $params['attr'] );
$content = !isset( $params['inner'] ) ? null : $frame->expand( $params['inner'] );
$name = $frame->expand( $params['name'] );
$attrText = !isset( $params['attr'] ) ? null : $frame->expand( $params['attr'] );
$content = !isset( $params['inner'] ) ? null : $frame->expand( $params['inner'] );
- $marker = "{$this->mUniqPrefix}-$name-"
+ $marker = self::MARKER_PREFIX . "-$name-"
. sprintf( '%08X', $this->mMarkerIndex++ ) . self::MARKER_SUFFIX;
$isFunctionTag = isset( $this->mFunctionTagHooks[strtolower( $name )] ) &&
. sprintf( '%08X', $this->mMarkerIndex++ ) . self::MARKER_SUFFIX;
$isFunctionTag = isset( $this->mFunctionTagHooks[strtolower( $name )] ) &&
$prevlevel = 0;
$toclevel = 0;
$prevtoclevel = 0;
$prevlevel = 0;
$toclevel = 0;
$prevtoclevel = 0;
- $markerRegex = "{$this->mUniqPrefix}-h-(\d+)-" . self::MARKER_SUFFIX;
+ $markerRegex = self::MARKER_PREFIX . "-h-(\d+)-" . self::MARKER_SUFFIX;
$baseTitleText = $this->mTitle->getPrefixedDBkey();
$oldType = $this->mOutputType;
$this->setOutputType( self::OT_WIKI );
$baseTitleText = $this->mTitle->getPrefixedDBkey();
$oldType = $this->mOutputType;
$this->setOutputType( self::OT_WIKI );
public function replaceTransparentTags( $text ) {
$matches = array();
$elements = array_keys( $this->mTransparentTagHooks );
public function replaceTransparentTags( $text ) {
$matches = array();
$elements = array_keys( $this->mTransparentTagHooks );
- $text = self::extractTagsAndParams( $elements, $text, $matches, $this->mUniqPrefix );
+ $text = self::extractTagsAndParams( $elements, $text, $matches );
$replacements = array();
foreach ( $matches as $marker => $data ) {
$replacements = array();
foreach ( $matches as $marker => $data ) {
$i = 0;
$out = '';
while ( $i < strlen( $s ) ) {
$i = 0;
$out = '';
while ( $i < strlen( $s ) ) {
- $markerStart = strpos( $s, $this->mUniqPrefix, $i );
+ $markerStart = strpos( $s, self::MARKER_PREFIX, $i );
if ( $markerStart === false ) {
$out .= call_user_func( $callback, substr( $s, $i ) );
break;
if ( $markerStart === false ) {
$out .= call_user_func( $callback, substr( $s, $i ) );
break;
public $parsers;
public $conf;
public $shortOutput = false;
public $parsers;
public $conf;
public $shortOutput = false;
public function __construct( $conf ) {
if ( !isset( $conf['parsers'] ) ) {
public function __construct( $conf ) {
if ( !isset( $conf['parsers'] ) ) {
- global $wgHooks;
- static $doneHook = false;
- if ( !$doneHook ) {
- $doneHook = true;
- $wgHooks['ParserClearState'][] = array( $this, 'onClearState' );
- }
if ( isset( $this->conf['shortOutput'] ) ) {
$this->shortOutput = $this->conf['shortOutput'];
}
if ( isset( $this->conf['shortOutput'] ) ) {
$this->shortOutput = $this->conf['shortOutput'];
}
$parser->setFunctionHook( $id, $callback, $flags );
}
}
$parser->setFunctionHook( $id, $callback, $flags );
}
}
-
- /**
- * @param Parser $parser
- * @return bool
- */
- public function onClearState( &$parser ) {
- // hack marker prefixes to get identical output
- if ( !isset( $this->dtUniqPrefix ) ) {
- $this->dtUniqPrefix = $parser->uniqPrefix();
- } else {
- $parser->mUniqPrefix = $this->dtUniqPrefix;
- }
- return true;
- }
$close, // Matching closing character
$count, // Number of opening characters found (number of "=" for heading)
$parts, // Array of PPDPart objects describing pipe-separated parts.
$close, // Matching closing character
$count, // Number of opening characters found (number of "=" for heading)
$parts, // Array of PPDPart objects describing pipe-separated parts.
- $lineStart; // True if the open char appeared at the start of the input line. Not set for headings.
+ $lineStart; // True if the open char appeared at the start of the input line.
+ // Not set for headings.
public $partClass = 'PPDPart';
public $partClass = 'PPDPart';
$titleText = $this->title->getPrefixedDBkey();
$this->parser->mHeadings[] = array( $titleText, $headingIndex );
$serial = count( $this->parser->mHeadings ) - 1;
$titleText = $this->title->getPrefixedDBkey();
$this->parser->mHeadings[] = array( $titleText, $headingIndex );
$serial = count( $this->parser->mHeadings ) - 1;
- $marker = "{$this->parser->mUniqPrefix}-h-$serial-" . Parser::MARKER_SUFFIX;
+ $marker = Parser::MARKER_PREFIX . "-h-$serial-" . Parser::MARKER_SUFFIX;
$count = $contextNode->getAttribute( 'level' );
$s = substr( $s, 0, $count ) . $marker . substr( $s, $count );
$this->parser->mStripState->addGeneral( $marker, '' );
$count = $contextNode->getAttribute( 'level' );
$s = substr( $s, 0, $count ) . $marker . substr( $s, $count );
$this->parser->mStripState->addGeneral( $marker, '' );
* @return PPNode_Hash_Tree
*/
public function preprocessToObj( $text, $flags = 0 ) {
* @return PPNode_Hash_Tree
*/
public function preprocessToObj( $text, $flags = 0 ) {
// Check cache.
global $wgMemc, $wgPreprocessorCacheThreshold;
// Check cache.
global $wgMemc, $wgPreprocessorCacheThreshold;
$titleText = $this->title->getPrefixedDBkey();
$this->parser->mHeadings[] = array( $titleText, $bits['i'] );
$serial = count( $this->parser->mHeadings ) - 1;
$titleText = $this->title->getPrefixedDBkey();
$this->parser->mHeadings[] = array( $titleText, $bits['i'] );
$serial = count( $this->parser->mHeadings ) - 1;
- $marker = "{$this->parser->mUniqPrefix}-h-$serial-" . Parser::MARKER_SUFFIX;
+ $marker = Parser::MARKER_PREFIX . "-h-$serial-" . Parser::MARKER_SUFFIX;
$s = substr( $s, 0, $bits['level'] ) . $marker . substr( $s, $bits['level'] );
$this->parser->mStripState->addGeneral( $marker, '' );
$out .= $s;
$s = substr( $s, 0, $bits['level'] ) . $marker . substr( $s, $bits['level'] );
$this->parser->mStripState->addGeneral( $marker, '' );
$out .= $s;
const UNSTRIP_RECURSION_LIMIT = 20;
/**
const UNSTRIP_RECURSION_LIMIT = 20;
/**
- * @param string $prefix
+ * @param string|null $prefix
+ * @since 1.26 The prefix argument should be omitted, as the strip marker
+ * prefix string is now a constant.
- public function __construct( $prefix ) {
- $this->prefix = $prefix;
+ public function __construct( $prefix = null ) {
+ if ( $prefix !== null ) {
+ wfDeprecated( __METHOD__ . ' with called with $prefix argument' .
+ ' (call with no arguments instead)', '1.26' );
+ }
$this->data = array(
'nowiki' => array(),
'general' => array()
);
$this->data = array(
'nowiki' => array(),
'general' => array()
);
- $this->regex = "/{$this->prefix}([^\x7f]+)" . Parser::MARKER_SUFFIX . '/';
+ $this->regex = '/' . Parser::MARKER_PREFIX . "([^\x7f]+)" . Parser::MARKER_SUFFIX . '/';
$this->circularRefGuard = array();
}
$this->circularRefGuard = array();
}
* @return StripState
*/
public function getSubState( $text ) {
* @return StripState
*/
public function getSubState( $text ) {
- $subState = new StripState( $this->prefix );
+ $subState = new StripState();
$pos = 0;
while ( true ) {
$pos = 0;
while ( true ) {
- $startPos = strpos( $text, $this->prefix, $pos );
+ $startPos = strpos( $text, Parser::MARKER_PREFIX, $pos );
$endPos = strpos( $text, Parser::MARKER_SUFFIX, $pos );
if ( $startPos === false || $endPos === false ) {
break;
$endPos = strpos( $text, Parser::MARKER_SUFFIX, $pos );
if ( $startPos === false || $endPos === false ) {
break;
* @return array
*/
public function merge( $otherState, $texts ) {
* @return array
*/
public function merge( $otherState, $texts ) {
- $mergePrefix = Parser::getRandomString();
+ $mergePrefix = wfRandomString( 16 );
foreach ( $otherState->data as $type => $items ) {
foreach ( $items as $key => $value ) {
foreach ( $otherState->data as $type => $items ) {
foreach ( $items as $key => $value ) {
*/
protected function mergeCallback( $m ) {
$key = $m[1];
*/
protected function mergeCallback( $m ) {
$key = $m[1];
- return "{$this->prefix}{$this->tempMergePrefix}-$key" . Parser::MARKER_SUFFIX;
+ return Parser::MARKER_PREFIX . $this->tempMergePrefix . '-' . $key . Parser::MARKER_SUFFIX;
2. HTML entities
3. placeholders created by the parser
*/
2. HTML entities
3. placeholders created by the parser
*/
- global $wgParser;
- if ( isset( $wgParser ) && $wgParser->UniqPrefix() != '' ) {
- $marker = '|' . $wgParser->UniqPrefix() . '[\-a-zA-Z0-9]+';
- } else {
- $marker = '';
- }
+ $marker = '|' . Parser::MARKER_PREFIX . '[\-a-zA-Z0-9]+';
// this one is needed when the text is inside an HTML markup
$htmlfix = '|<[^>]+$|^[^<>]*>';
// this one is needed when the text is inside an HTML markup
$htmlfix = '|<[^>]+$|^[^<>]*>';