From f089e20bc00f6f6e6cc15b4fd8f81bbe9c3dc102 Mon Sep 17 00:00:00 2001 From: "C. Scott Ananian" Date: Tue, 6 Aug 2013 20:21:00 -0400 Subject: [PATCH] Preprocessor: Don't treat a line containing multiple comments as a blank line. After this patch, 'a', 'b', and 'c' are all treated as members of the same list in the following wikitext: *a *b *c The old comment-removal rule was "trim a comment which is both preceded and followed by a newline (ignoring spaces)". This only works if there is a single comment on the line, and was often surprising to users. The new rule allows any number of whitespace-separated comments on the line. Bug: 41756 Change-Id: I6030086226e1eeece59643c29dbb4361668b4bd6 --- RELEASE-NOTES-1.22 | 1 + includes/parser/Preprocessor_DOM.php | 37 +++++++++++++++++++++++---- includes/parser/Preprocessor_Hash.php | 37 +++++++++++++++++++++++---- tests/parser/parserTests.txt | 23 +++++++++++------ 4 files changed, 80 insertions(+), 18 deletions(-) diff --git a/RELEASE-NOTES-1.22 b/RELEASE-NOTES-1.22 index 5637eb0b16..af467a0b90 100644 --- a/RELEASE-NOTES-1.22 +++ b/RELEASE-NOTES-1.22 @@ -252,6 +252,7 @@ production. * (bug 49694) $wgSpamRegex is now also applied on the new section headline text adding a new topic on a page * (bug 6200) line breaks in
are handled like they are in
+* (bug 41756) Improve treatment of multiple comments on a blank line. === API changes in 1.22 === * (bug 25553) The JSON output formatter now leaves forward slashes unescaped diff --git a/includes/parser/Preprocessor_DOM.php b/includes/parser/Preprocessor_DOM.php index 809e7f7000..c9e16b3559 100644 --- a/includes/parser/Preprocessor_DOM.php +++ b/includes/parser/Preprocessor_DOM.php @@ -361,9 +361,11 @@ class Preprocessor_DOM implements Preprocessor { } // Handle comments if ( isset( $matches[2] ) && $matches[2] == '!--' ) { - // To avoid leaving blank lines, when a comment is both preceded - // and followed by a newline (ignoring spaces), trim leading and - // trailing spaces and one of the newlines. + + // To avoid leaving blank lines, when a sequence of + // space-separated comments is both preceded and followed by + // a newline (ignoring spaces), then + // trim leading and trailing spaces and the trailing newline. // Find the end $endPos = strpos( $text, '-->', $i + 4 ); @@ -375,9 +377,24 @@ class Preprocessor_DOM implements Preprocessor { } else { // Search backwards for leading whitespace $wsStart = $i ? ( $i - strspn( $revText, ' ', $lengthText - $i ) ) : 0; + // Search forwards for trailing whitespace // $wsEnd will be the position of the last space (or the '>' if there's none) $wsEnd = $endPos + 2 + strspn( $text, ' ', $endPos + 3 ); + + // Keep looking forward as long as we're finding more + // comments. + $comments = array( array( $wsStart, $wsEnd ) ); + while ( substr( $text, $wsEnd + 1, 4 ) == '', $wsEnd + 4 ); + if ( $c === false ) { + break; + } + $c = $c + 2 + strspn( $text, ' ', $c + 3 ); + $comments[] = array( $wsEnd + 1, $c ); + $wsEnd = $c; + } + // Eat the line if possible // TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but @@ -385,14 +402,24 @@ class Preprocessor_DOM implements Preprocessor { if ( $wsStart > 0 && substr( $text, $wsStart - 1, 1 ) == "\n" && substr( $text, $wsEnd + 1, 1 ) == "\n" ) { - $startPos = $wsStart; - $endPos = $wsEnd + 1; // Remove leading whitespace from the end of the accumulator // Sanity check first though $wsLength = $i - $wsStart; if ( $wsLength > 0 && substr( $accum, -$wsLength ) === str_repeat( ' ', $wsLength ) ) { $accum = substr( $accum, 0, -$wsLength ); } + + // Dump all but the last comment to the accumulator + foreach ( $comments as $j => $com ) { + $startPos = $com[0]; + $endPos = $com[1] + 1; + if ( $j == ( count( $comments ) - 1) ) { + break; + } + $inner = substr( $text, $startPos, $endPos - $startPos); + $accum .= '' . htmlspecialchars( $inner ) . ''; + } + // Do a line-start run next time to look for headings after the comment $fakeLineStart = true; } else { diff --git a/includes/parser/Preprocessor_Hash.php b/includes/parser/Preprocessor_Hash.php index 654a66e797..333b70d786 100644 --- a/includes/parser/Preprocessor_Hash.php +++ b/includes/parser/Preprocessor_Hash.php @@ -287,9 +287,11 @@ class Preprocessor_Hash implements Preprocessor { } // Handle comments if ( isset( $matches[2] ) && $matches[2] == '!--' ) { - // To avoid leaving blank lines, when a comment is both preceded - // and followed by a newline (ignoring spaces), trim leading and - // trailing spaces and one of the newlines. + + // To avoid leaving blank lines, when a sequence of + // space-separated comments is both preceded and followed by + // a newline (ignoring spaces), then + // trim leading and trailing spaces and the trailing newline. // Find the end $endPos = strpos( $text, '-->', $i + 4 ); @@ -301,9 +303,24 @@ class Preprocessor_Hash implements Preprocessor { } else { // Search backwards for leading whitespace $wsStart = $i ? ( $i - strspn( $revText, ' ', $lengthText - $i ) ) : 0; + // Search forwards for trailing whitespace // $wsEnd will be the position of the last space (or the '>' if there's none) $wsEnd = $endPos + 2 + strspn( $text, ' ', $endPos + 3 ); + + // Keep looking forward as long as we're finding more + // comments. + $comments = array( array( $wsStart, $wsEnd ) ); + while ( substr( $text, $wsEnd + 1, 4 ) == '', $wsEnd + 4 ); + if ( $c === false ) { + break; + } + $c = $c + 2 + strspn( $text, ' ', $c + 3 ); + $comments[] = array( $wsEnd + 1, $c ); + $wsEnd = $c; + } + // Eat the line if possible // TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but @@ -311,8 +328,6 @@ class Preprocessor_Hash implements Preprocessor { if ( $wsStart > 0 && substr( $text, $wsStart - 1, 1 ) == "\n" && substr( $text, $wsEnd + 1, 1 ) == "\n" ) { - $startPos = $wsStart; - $endPos = $wsEnd + 1; // Remove leading whitespace from the end of the accumulator // Sanity check first though $wsLength = $i - $wsStart; @@ -322,6 +337,18 @@ class Preprocessor_Hash implements Preprocessor { { $accum->lastNode->value = substr( $accum->lastNode->value, 0, -$wsLength ); } + + // Dump all but the last comment to the accumulator + foreach ( $comments as $j => $com ) { + $startPos = $com[0]; + $endPos = $com[1] + 1; + if ( $j == ( count( $comments ) - 1) ) { + break; + } + $inner = substr( $text, $startPos, $endPos - $startPos); + $accum->addNodeWithText( 'comment', $inner ); + } + // Do a line-start run next time to look for headings after the comment $fakeLineStart = true; } else { diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt index cdbd7e05a9..128e25d155 100644 --- a/tests/parser/parserTests.txt +++ b/tests/parser/parserTests.txt @@ -190,7 +190,11 @@ a b ---- a - + +b +---- +a + b ---- a @@ -222,7 +226,11 @@ b


a -

b +b +

+
+

a +b


a @@ -282,7 +290,7 @@ a b ---- a - + b ---- a @@ -305,7 +313,7 @@ b


a -

b +b


a @@ -5424,21 +5432,20 @@ Multiple list tags generated by templates !!end !!test -Single-comment whitespace lines dont break lists, but multi-comment whitespace lines do +Single-comment whitespace lines dont break lists, and so do multi-comment whitespace lines !!input *a *b *c - + *d !!result

  • a
  • b
  • c -
-
  • d +
  • d
!!end -- 2.20.1