From 4decbc29347f5b61cdab79a89f5d4e84e31f8a88 Mon Sep 17 00:00:00 2001
From: Tim Starling
Date: Sat, 5 Jan 2008 12:39:12 +0000
Subject: [PATCH] =?utf8?q?*=20Merged=20comment=20handling=20with=20the=20m?=
=?utf8?q?ain=20loop=20of=20preprocessToDom().=20This=20fixes=20a=20sectio?=
=?utf8?q?n=20numbering/marking=20regression=20introduced=20in=20r28588.?=
=?utf8?q?=20Added=20parser=20tests=20demonstrating=20the=20issue.=20*=20M?=
=?utf8?q?erged=20includeonly/noinclude/onlyinclude=20handling=20with=20pr?=
=?utf8?q?eprocessToDom(),=20and=20used=20the=20resulting=20=C3=BCberparse?=
=?utf8?q?r=20to=20fix=20another=20section=20numbering=20bug:=20bug=206563?=
=?utf8?q?.=20The=20fix=20involves=20putting=20a=20template=20flag=20"T"?=
=?utf8?q?=20into=20the=20section=20parameter=20of=20edit=20links.=20This?=
=?utf8?q?=20flag=20indicates=20to=20extractSections()=20how=20=20etc.=20should=20be=20handled.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit
If these two changes stick, I'll eventually describe the precise syntactic effects in RELEASE-NOTES.
* Added splitExtNode() for future use in LabeledSectionTransclusion.
* Added parser tests for bug 6563.
---
includes/Parser.php | 261 ++++++++++++++++++++++++++----------
maintenance/parserTests.txt | 64 ++++++++-
2 files changed, 251 insertions(+), 74 deletions(-)
diff --git a/includes/Parser.php b/includes/Parser.php
index 6e9813b567..a14cace280 100644
--- a/includes/Parser.php
+++ b/includes/Parser.php
@@ -71,6 +71,9 @@ class Parser
const COLON_STATE_COMMENTDASH = 6;
const COLON_STATE_COMMENTDASHDASH = 7;
+ // Flags for preprocessToDom
+ const PTD_FOR_INCLUSION = 1;
+
/**#@+
* @private
*/
@@ -928,11 +931,6 @@ class Parser
return $text ;
}
- # Remove tags and sections
- $text = strtr( $text, array( '' => '' , '' => '' ) );
- $text = strtr( $text, array( '' => '', '' => '') );
- $text = StringUtils::delimiterReplace( '', '', '', $text );
-
$text = $this->replaceVariables( $text );
$text = Sanitizer::removeHTMLtags( $text, array( &$this, 'attributeStripCallback' ), false, array_keys( $this->mTransparentTagHooks ) );
wfRunHooks( 'InternalParseBeforeLinks', array( &$this, &$text, &$this->mStripState ) );
@@ -2541,17 +2539,32 @@ class Parser
}
/**
- * Parse any parentheses in format ((title|part|part)} and return the document tree
+ * Preprocess some wikitext and return the document tree.
* This is the ghost of replace_variables().
*
* @param string $text The text to parse
+ * @param integer flags Bitwise combination of:
+ * self::PTD_FOR_INCLUSION Handle / as if the text is being
+ * included. Default is to assume a direct page view.
+ *
+ * The generated DOM tree must depend only on the input text, the flags, and $this->ot['msg'].
+ * The DOM tree must be the same in OT_HTML and OT_WIKI mode, to avoid a regression of bug 4899.
+ *
+ * Any flag added to the $flags parameter here, or any other parameter liable to cause a
+ * change in the DOM tree for a given text, must be passed through the section identifier
+ * in the section edit link and thus back to extractSections().
+ *
+ * The output of this function is currently only cached in process memory, but a persistent
+ * cache may be implemented at a later date which takes further advantage of these strict
+ * dependency requirements.
+ *
* @private
*/
- function preprocessToDom ( $text ) {
+ function preprocessToDom ( $text, $flags = 0 ) {
wfProfileIn( __METHOD__ );
wfProfileIn( __METHOD__.'-makexml' );
- static $msgRules, $normalRules;
+ static $msgRules, $normalRules, $inclusionSupertags, $nonInclusionSupertags;
if ( !$msgRules ) {
$msgRules = array(
'{' => array(
@@ -2592,19 +2605,32 @@ class Parser
} else {
$rules = $normalRules;
}
+ $forInclusion = $flags & self::PTD_FOR_INCLUSION;
- if ( $this->ot['html'] || ( $this->ot['pre'] && $this->mOptions->getRemoveComments() ) ) {
- $text = Sanitizer::removeHTMLcomments( $text );
+ $xmlishElements = $this->getStripList();
+ $enableOnlyinclude = false;
+ if ( $forInclusion ) {
+ $ignoredTags = array( 'includeonly', '/includeonly' );
+ $ignoredElements = array( 'noinclude' );
+ $xmlishElements[] = 'noinclude';
+ if ( strpos( $text, '' ) !== false && strpos( $text, '' ) !== false ) {
+ $enableOnlyinclude = true;
+ }
+ } else {
+ $ignoredTags = array( 'noinclude', '/noinclude', 'onlyinclude', '/onlyinclude' );
+ $ignoredElements = array( 'includeonly' );
+ $xmlishElements[] = 'includeonly';
}
+ $xmlishRegex = implode( '|', array_merge( $xmlishElements, $ignoredTags ) );
- $extElements = implode( '|', $this->getStripList() );
// Use "A" modifier (anchored) instead of "^", because ^ doesn't work with an offset
- $extElementsRegex = "/($extElements)(?:\s|\/>|>)|(!--)/iA";
+ $elementsRegex = "~($xmlishRegex)(?:\s|\/>|>)|(!--)~iA";
$stack = array(); # Stack of unclosed parentheses
$stackIndex = -1; # Stack read pointer
$searchBase = implode( '', array_keys( $rules ) ) . '<';
+ $revText = strrev( $text ); // For fast reverse searches
$i = -1; # Input pointer, starts out pointing to a pseudo-newline before the start
$topAccum = ''; # Top level text accumulator
@@ -2614,8 +2640,27 @@ class Parser
$findPipe = false; # True to take notice of pipe characters
$headingIndex = 1;
$noMoreGT = false; # True if there are no more greater-than (>) signs right of $i
+ $findOnlyinclude = $enableOnlyinclude; # True to ignore all input up to the next
+
+ if ( $enableOnlyinclude ) {
+ $i = 0;
+ }
+
+ while ( true ) {
+ if ( $findOnlyinclude ) {
+ // Ignore all input up to the next
+ $startPos = strpos( $text, '', $i );
+ if ( $startPos === false ) {
+ // Ignored section runs to the end
+ $accum .= '' . htmlspecialchars( substr( $text, $i ) ) . '';
+ break;
+ }
+ $tagEndPos = $startPos + strlen( '' ); // past-the-end
+ $accum .= '' . htmlspecialchars( substr( $text, $i, $tagEndPos - $i ) ) . '';
+ $i = $tagEndPos;
+ $findOnlyinclude = false;
+ }
- while ( $i < strlen( $text ) ) {
if ( $i == -1 ) {
$found = 'line-start';
$curChar = '';
@@ -2684,8 +2729,14 @@ class Parser
if ( $found == 'angle' ) {
$matches = false;
+ // Handle
+ if ( $enableOnlyinclude && substr( $text, $i, strlen( '' ) ) == '' ) {
+ $findOnlyinclude = true;
+ continue;
+ }
+
// Determine element name
- if ( !preg_match( $extElementsRegex, $text, $matches, 0, $i + 1 ) ) {
+ if ( !preg_match( $elementsRegex, $text, $matches, 0, $i + 1 ) ) {
// Element name missing or not listed
$accum .= '<';
++$i;
@@ -2693,21 +2744,37 @@ class Parser
}
// Handle comments
if ( isset( $matches[2] ) && $matches[2] == '!--' ) {
- // HTML comment, scan to end
- $endpos = strpos( $text, '-->', $i + 4 );
- if ( $endpos === false ) {
+ // To avoid leaving blank lines, when a comment is both preceded
+ // and followed by a newline (ignoring spaces), trim leading and
+ // trailing spaces and one of the newlines.
+
+ // Find the end
+ $endPos = strpos( $text, '-->', $i + 4 );
+ if ( $endPos === false ) {
// Unclosed comment in input, runs to end
$inner = substr( $text, $i );
- if ( $this->ot['html'] ) {
- // Close it so later stripping can remove it
- $inner .= '-->';
- }
$accum .= '' . htmlspecialchars( $inner ) . '';
$i = strlen( $text );
} else {
- $inner = substr( $text, $i, $endpos - $i + 3 );
+ // Search backwards for leading whitespace
+ $wsStart = $i ? ( $i - strspn( $revText, ' ', strlen( $text ) - $i - 1 ) ) : 0;
+ // Search forwards for trailing whitespace
+ // $wsEnd will be the position of the last space
+ $wsEnd = $endPos + 2 + strspn( $text, ' ', $endPos + 3 );
+ // Eat the line if possible
+ if ( $wsStart > 0 && substr( $text, $wsStart - 1, 1 ) == "\n"
+ && substr( $text, $wsEnd + 1, 1 ) == "\n" )
+ {
+ $startPos = $wsStart;
+ $endPos = $wsEnd + 1;
+ } else {
+ // No line to eat, just take the comment itself
+ $startPos = $i;
+ $endPos += 2;
+ }
+ $inner = substr( $text, $startPos, $endPos - $startPos + 1 );
$accum .= '' . htmlspecialchars( $inner ) . '';
- $i = $endpos + 3;
+ $i = $endPos + 1;
}
continue;
}
@@ -2724,6 +2791,15 @@ class Parser
++$i;
continue;
}
+
+ // Handle ignored tags
+ if ( in_array( $name, $ignoredTags ) ) {
+ $accum .= '' . htmlspecialchars( substr( $text, $i, $tagEndPos - $i + 1 ) ) . '';
+ $i = $tagEndPos + 1;
+ continue;
+ }
+
+ $tagStartPos = $i;
if ( $text[$tagEndPos-1] == '/' ) {
$attrEnd = $tagEndPos - 1;
$inner = null;
@@ -2743,6 +2819,13 @@ class Parser
$close = '';
}
}
+ // and just become tags
+ if ( in_array( $name, $ignoredElements ) ) {
+ $accum .= '' . htmlspecialchars( substr( $text, $tagStartPos, $i - $tagStartPos ) )
+ . '';
+ continue;
+ }
+
$accum .= '';
if ( $attrEnd <= $attrStart ) {
$attr = '';
@@ -2784,13 +2867,11 @@ class Parser
// A heading must be open, otherwise \n wouldn't have been in the search list
assert( $piece['open'] == "\n" );
assert( $stackIndex == 0 );
- // Search back through the accumulator to see if it has a proper close
- // No efficient way to do this in PHP AFAICT: strrev, PCRE search with $ anchor
- // and rtrim are all O(N) in total size. Optimal would be O(N) in trailing
- // whitespace size only.
+ // Search back through the input to see if it has a proper close
+ // Do this using the reversed string since the other solutions (end anchor, etc.) are inefficient
$m = false;
$count = $piece['count'];
- if ( preg_match( "/(={{$count}})\s*$/", $accum, $m, 0, $count ) ) {
+ if ( preg_match( "/\s*(={{$count}})/A", $revText, $m, 0, strlen( $text ) - $i ) ) {
// Found match, output
$count = min( strlen( $m[1] ), $count );
$element = "$accum";
@@ -3021,27 +3102,6 @@ class Parser
return array( $w1, $trimmed, $w2 );
}
- /**
- * Convert text to a document tree, like preprocessToDom(), but with some special handling
- * assuming the source text is from a template -- specifically noinclude/includeonly behaviour.
- */
- function preprocessTplToDom( $text ) {
- # If there are any tags, only include them
- if ( !$this->ot['msg'] ) {
- if ( in_string( '', $text ) && in_string( '', $text ) ) {
- $replacer = new OnlyIncludeReplacer;
- StringUtils::delimiterReplaceCallback( '', '',
- array( &$replacer, 'replace' ), $text );
- $text = $replacer->output;
- }
- # Remove sections and tags
- $text = StringUtils::delimiterReplace( '', '', '', $text );
- $text = strtr( $text, array( '' => '' , '' => '' ) );
- }
-
- return $this->preprocessToDom( $text );
- }
-
/**
* Replace magic variables, templates, and template arguments
* with the appropriate text. Templates are substituted recursively,
@@ -3311,7 +3371,7 @@ class Parser
} else {
$text = $this->interwikiTransclude( $title, 'raw' );
// Preprocess it like a template
- $text = $this->preprocessTplToDom( $text );
+ $text = $this->preprocessToDom( $text, self::PTD_FOR_INCLUSION );
$isDOM = true;
}
$found = true;
@@ -3404,7 +3464,7 @@ class Parser
return array( false, $title );
}
- $dom = $this->preprocessTplToDom( $text );
+ $dom = $this->preprocessToDom( $text, self::PTD_FOR_INCLUSION );
$this->mTplDomCache[ $titleText ] = $dom;
if (! $title->equals($cacheTitle)) {
@@ -3906,10 +3966,13 @@ class Parser
}
# give headline the correct tag
if( $showEditLink && $sectionIndex !== false ) {
- if( $isTemplate )
- $editlink = $sk->editSectionLinkForOther($titleText, $sectionIndex);
- else
+ if( $isTemplate ) {
+ # Put a T flag in the section identifier, to indicate to extractSections()
+ # that sections inside should be counted.
+ $editlink = $sk->editSectionLinkForOther($titleText, "T-$sectionIndex");
+ } else {
$editlink = $sk->editSectionLink($this->mTitle, $sectionIndex, $headlineHint);
+ }
} else {
$editlink = '';
}
@@ -4910,14 +4973,22 @@ class Parser
*
* External callers should use the getSection and replaceSection methods.
*
- * @param $text Page wikitext
- * @param $section Numbered section. 0 pulls the text before the first
- * heading; other numbers will pull the given section
- * along with its lower-level subsections. If the section is
- * not found, $mode=get will return $newtext, and
- * $mode=replace will return $text.
- * @param $mode One of "get" or "replace"
- * @param $newText Replacement text for section data.
+ * @param string $text Page wikitext
+ * @param string $section A section identifier string of the form:
+ * - - ... -
+ *
+ * Currently the only recognised flag is "T", which means the target section number
+ * was derived during a template inclusion parse, in other words this is a template
+ * section edit link. If no flags are given, it was an ordinary section edit link.
+ * This flag is required to avoid a section numbering mismatch when a section is
+ * enclosed by (bug 6563).
+ *
+ * The section number 0 pulls the text before the first heading; other numbers will
+ * pull the given section along with its lower-level subsections. If the section is
+ * not found, $mode=get will return $newtext, and $mode=replace will return $text.
+ *
+ * @param string $mode One of "get" or "replace"
+ * @param string $newText Replacement text for section data.
* @return string for "get", the extracted section text.
* for "replace", the whole page with the section replaced.
*/
@@ -4931,8 +5002,17 @@ class Parser
$outText = '';
$frame = new PPFrame( $this );
+ // Process section extraction flags
+ $flags = 0;
+ $sectionParts = explode( '-', $section );
+ $sectionIndex = array_pop( $sectionParts );
+ foreach ( $sectionParts as $part ) {
+ if ( $part == 'T' ) {
+ $flags |= self::PTD_FOR_INCLUSION;
+ }
+ }
// Preprocess the text
- $dom = $this->preprocessToDom( $text );
+ $dom = $this->preprocessToDom( $text, $flags );
$root = $dom->documentElement;
// nodes indicate section breaks
@@ -4940,13 +5020,13 @@ class Parser
$node = $root->firstChild;
// Find the target section
- if ( $section == 0 ) {
+ if ( $sectionIndex == 0 ) {
// Section zero doesn't nest, level=big
$targetLevel = 1000;
} else {
while ( $node ) {
if ( $node->nodeName == 'h' ) {
- if ( $curIndex + 1 == $section ) {
+ if ( $curIndex + 1 == $sectionIndex ) {
break;
}
$curIndex++;
@@ -4975,7 +5055,7 @@ class Parser
if ( $node->nodeName == 'h' ) {
$curIndex++;
$curLevel = $node->getAttribute( 'level' );
- if ( $curIndex != $section && $curLevel <= $targetLevel ) {
+ if ( $curIndex != $sectionIndex && $curLevel <= $targetLevel ) {
break;
}
}
@@ -5012,9 +5092,9 @@ class Parser
*
* If a section contains subsections, these are also returned.
*
- * @param $text String: text to look in
- * @param $section Integer: section number
- * @param $deftext: default to return if section is not found
+ * @param string $text text to look in
+ * @param string $section section identifier
+ * @param string $deftext default to return if section is not found
* @return string text of the requested section
*/
public function getSection( $text, $section, $deftext='' ) {
@@ -5217,8 +5297,9 @@ class PPFrame {
const NO_ARGS = 1;
const NO_TEMPLATES = 2;
const STRIP_COMMENTS = 4;
+ const NO_IGNORE = 8;
- const RECOVER_ORIG = 3;
+ const RECOVER_ORIG = 11;
/**
* Construct a new preprocessor frame.
@@ -5323,11 +5404,24 @@ class PPFrame {
}
} elseif ( $root->nodeName == 'comment' ) {
# HTML-style comment
- if ( $flags & self::STRIP_COMMENTS ) {
+ if ( $this->parser->ot['html']
+ || ( $this->parser->ot['pre'] && $this->mOptions->getRemoveComments() )
+ || ( $flags & self::STRIP_COMMENTS ) )
+ {
$s = '';
} else {
$s = $root->textContent;
}
+ } elseif ( $root->nodeName == 'ignore' ) {
+ # Output suppression used by etc.
+ # OT_WIKI will only respect in substed templates.
+ # The other output types respect it unless NO_IGNORE is set.
+ # extractSections() sets NO_IGNORE and so never respects it.
+ if ( ( !isset( $this->parent ) && $this->parser->ot['wiki'] ) || ( $flags & self::NO_IGNORE ) ) {
+ $s = $root->textContent;
+ } else {
+ $s = '';
+ }
} elseif ( $root->nodeName == 'ext' ) {
# Extension tag
$xpath = new DOMXPath( $root->ownerDocument );
@@ -5417,6 +5511,31 @@ class PPFrame {
return array( $name, $index, $values->item( 0 ) );
}
+ /**
+ * Split an node into an associative array containing name, attr, inner and close
+ * All values in the resulting array are DOMNodes. Inner and close are optional.
+ */
+ function splitExtNode( $node ) {
+ $xpath = new DOMXPath( $node->ownerDocument );
+ $names = $xpath->query( 'name', $node );
+ $attrs = $xpath->query( 'attr', $node );
+ $inners = $xpath->query( 'inner', $node );
+ $closes = $xpath->query( 'close', $node );
+ if ( !$names->length || !$attrs->length ) {
+ throw new MWException( 'Invalid ext node passed to ' . __METHOD__ );
+ }
+ $parts = array(
+ 'name' => $names->item( 0 ),
+ 'attr' => $attrs->item( 0 ) );
+ if ( $inners->length ) {
+ $parts['inner'] = $inners->item( 0 );
+ }
+ if ( $closes->length ) {
+ $parts['close'] = $closes->item( 0 );
+ }
+ return $parts;
+ }
+
function __toString() {
return 'frame{}';
}
diff --git a/maintenance/parserTests.txt b/maintenance/parserTests.txt
index 836ad4fc00..28f30231d7 100644
--- a/maintenance/parserTests.txt
+++ b/maintenance/parserTests.txt
@@ -2575,6 +2575,64 @@ Foozarbar
!! end
+!! article
+Template:Includeonly section
+!! text
+
+==Includeonly section==
+
+==Section T-1==
+!!endarticle
+
+!! test
+Bug 6563: Edit link generation for section shown by
+!! input
+{{includeonly section}}
+!! result
+[edit] Includeonly section
+[edit] Section T-1
+
+!! end
+
+# Uses same input as the contents of [[Template:Includeonly section]]
+!! test
+Bug 6563: Section extraction for section shown by
+!! options
+section=T-2
+!! input
+
+==Includeonly section==
+
+==Section T-2==
+!! result
+==Section T-2==
+!! end
+
+!! test
+Bug 6563: Edit link generation for section suppressed by
+!! input
+
+==Includeonly section==
+
+==Section 1==
+!! result
+[edit] Section 1
+
+!! end
+
+!! test
+Bug 6563: Section extraction for section suppressed by
+!! options
+section=1
+!! input
+
+==Includeonly section==
+
+==Section 1==
+!! result
+==Section 1==
+!! end
+
###
### Pre-save transform tests
###
@@ -3504,8 +3562,8 @@ __NOTOC__
==Section 4==
!! result
[edit] Section 0
-[edit] Section 1
-[edit] Section 2
+[edit] Section 1
+[edit] Section 2
[edit] Section 4
!! end
@@ -6223,7 +6281,7 @@ Inclusion of !userCanEdit() content
!! input
{{MediaWiki:Fake}}
!! result
-[edit] header
+[edit] header
!! end
--
2.20.1