From f089e20bc00f6f6e6cc15b4fd8f81bbe9c3dc102 Mon Sep 17 00:00:00 2001
From: "C. Scott Ananian" <cscott@cscott.net>
Date: Tue, 6 Aug 2013 20:21:00 -0400
Subject: [PATCH] Preprocessor: Don't treat a line containing multiple comments
 as a blank line.

After this patch, 'a', 'b', and 'c' are all treated as members of the
same list in the following wikitext:

*a
 <!--x-->
*b
 <!--x--> <!--y-->
*c

The old comment-removal rule was "trim a comment which is both
preceded and followed by a newline (ignoring spaces)".  This only works
if there is a single comment on the line, and was often surprising
to users.  The new rule allows any number of whitespace-separated
comments on the line.

Bug: 41756
Change-Id: I6030086226e1eeece59643c29dbb4361668b4bd6
---
 RELEASE-NOTES-1.22                    |  1 +
 includes/parser/Preprocessor_DOM.php  | 37 +++++++++++++++++++++++----
 includes/parser/Preprocessor_Hash.php | 37 +++++++++++++++++++++++----
 tests/parser/parserTests.txt          | 23 +++++++++++------
 4 files changed, 80 insertions(+), 18 deletions(-)
diff --git a/RELEASE-NOTES-1.22 b/RELEASE-NOTES-1.22
index 5637eb0b16..af467a0b90 100644
--- a/RELEASE-NOTES-1.22
+++ b/RELEASE-NOTES-1.22
@@ -252,6 +252,7 @@ production.
 * (bug 49694) $wgSpamRegex is now also applied on the new section headline text
   adding a new topic on a page
 * (bug 6200) line breaks in <blockquote> are handled like they are in <div>
+* (bug 41756) Improve treatment of multiple comments on a blank line.
 
 === API changes in 1.22 ===
 * (bug 25553) The JSON output formatter now leaves forward slashes unescaped
diff --git a/includes/parser/Preprocessor_DOM.php b/includes/parser/Preprocessor_DOM.php
index 809e7f7000..c9e16b3559 100644
--- a/includes/parser/Preprocessor_DOM.php
+++ b/includes/parser/Preprocessor_DOM.php
@@ -361,9 +361,11 @@ class Preprocessor_DOM implements Preprocessor {
 				}
 				// Handle comments
 				if ( isset( $matches[2] ) && $matches[2] == '!--' ) {
-					// To avoid leaving blank lines, when a comment is both preceded
-					// and followed by a newline (ignoring spaces), trim leading and
-					// trailing spaces and one of the newlines.
+
+					// To avoid leaving blank lines, when a sequence of
+					// space-separated comments is both preceded and followed by
+					// a newline (ignoring spaces), then
+					// trim leading and trailing spaces and the trailing newline.
 
 					// Find the end
 					$endPos = strpos( $text, '-->', $i + 4 );
@@ -375,9 +377,24 @@ class Preprocessor_DOM implements Preprocessor {
 					} else {
 						// Search backwards for leading whitespace
 						$wsStart = $i ? ( $i - strspn( $revText, ' ', $lengthText - $i ) ) : 0;
+
 						// Search forwards for trailing whitespace
 						// $wsEnd will be the position of the last space (or the '>' if there's none)
 						$wsEnd = $endPos + 2 + strspn( $text, ' ', $endPos + 3 );
+
+						// Keep looking forward as long as we're finding more
+						// comments.
+						$comments = array( array( $wsStart, $wsEnd ) );
+						while ( substr( $text, $wsEnd + 1, 4 ) == '<!--' ) {
+							$c = strpos( $text, '-->', $wsEnd + 4 );
+							if ( $c === false ) {
+								break;
+							}
+							$c = $c + 2 + strspn( $text, ' ', $c + 3 );
+							$comments[] = array( $wsEnd + 1, $c );
+							$wsEnd = $c;
+						}
+
 						// Eat the line if possible
 						// TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at
 						// the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
@@ -385,14 +402,24 @@ class Preprocessor_DOM implements Preprocessor {
 						if ( $wsStart > 0 && substr( $text, $wsStart - 1, 1 ) == "\n"
 							&& substr( $text, $wsEnd + 1, 1 ) == "\n" )
 						{
-							$startPos = $wsStart;
-							$endPos = $wsEnd + 1;
 							// Remove leading whitespace from the end of the accumulator
 							// Sanity check first though
 							$wsLength = $i - $wsStart;
 							if ( $wsLength > 0 && substr( $accum, -$wsLength ) === str_repeat( ' ', $wsLength ) ) {
 								$accum = substr( $accum, 0, -$wsLength );
 							}
+
+							// Dump all but the last comment to the accumulator
+							foreach ( $comments as $j => $com ) {
+								$startPos = $com[0];
+								$endPos = $com[1] + 1;
+								if ( $j == ( count( $comments ) - 1) ) {
+									break;
+								}
+								$inner = substr( $text, $startPos, $endPos - $startPos);
+								$accum .= '<comment>' . htmlspecialchars( $inner ) . '</comment>';
+							}
+
 							// Do a line-start run next time to look for headings after the comment
 							$fakeLineStart = true;
 						} else {
diff --git a/includes/parser/Preprocessor_Hash.php b/includes/parser/Preprocessor_Hash.php
index 654a66e797..333b70d786 100644
--- a/includes/parser/Preprocessor_Hash.php
+++ b/includes/parser/Preprocessor_Hash.php
@@ -287,9 +287,11 @@ class Preprocessor_Hash implements Preprocessor {
 				}
 				// Handle comments
 				if ( isset( $matches[2] ) && $matches[2] == '!--' ) {
-					// To avoid leaving blank lines, when a comment is both preceded
-					// and followed by a newline (ignoring spaces), trim leading and
-					// trailing spaces and one of the newlines.
+
+					// To avoid leaving blank lines, when a sequence of
+					// space-separated comments is both preceded and followed by
+					// a newline (ignoring spaces), then
+					// trim leading and trailing spaces and the trailing newline.
 
 					// Find the end
 					$endPos = strpos( $text, '-->', $i + 4 );
@@ -301,9 +303,24 @@ class Preprocessor_Hash implements Preprocessor {
 					} else {
 						// Search backwards for leading whitespace
 						$wsStart = $i ? ( $i - strspn( $revText, ' ', $lengthText - $i ) ) : 0;
+
 						// Search forwards for trailing whitespace
 						// $wsEnd will be the position of the last space (or the '>' if there's none)
 						$wsEnd = $endPos + 2 + strspn( $text, ' ', $endPos + 3 );
+
+						// Keep looking forward as long as we're finding more
+						// comments.
+						$comments = array( array( $wsStart, $wsEnd ) );
+						while ( substr( $text, $wsEnd + 1, 4 ) == '<!--' ) {
+							$c = strpos( $text, '-->', $wsEnd + 4 );
+							if ( $c === false ) {
+								break;
+							}
+							$c = $c + 2 + strspn( $text, ' ', $c + 3 );
+							$comments[] = array( $wsEnd + 1, $c );
+							$wsEnd = $c;
+						}
+
 						// Eat the line if possible
 						// TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at
 						// the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
@@ -311,8 +328,6 @@ class Preprocessor_Hash implements Preprocessor {
 						if ( $wsStart > 0 && substr( $text, $wsStart - 1, 1 ) == "\n"
 							&& substr( $text, $wsEnd + 1, 1 ) == "\n" )
 						{
-							$startPos = $wsStart;
-							$endPos = $wsEnd + 1;
 							// Remove leading whitespace from the end of the accumulator
 							// Sanity check first though
 							$wsLength = $i - $wsStart;
@@ -322,6 +337,18 @@ class Preprocessor_Hash implements Preprocessor {
 							{
 								$accum->lastNode->value = substr( $accum->lastNode->value, 0, -$wsLength );
 							}
+
+							// Dump all but the last comment to the accumulator
+							foreach ( $comments as $j => $com ) {
+								$startPos = $com[0];
+								$endPos = $com[1] + 1;
+								if ( $j == ( count( $comments ) - 1) ) {
+									break;
+								}
+								$inner = substr( $text, $startPos, $endPos - $startPos);
+								$accum->addNodeWithText( 'comment', $inner );
+							}
+
 							// Do a line-start run next time to look for headings after the comment
 							$fakeLineStart = true;
 						} else {
diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt
index cdbd7e05a9..128e25d155 100644
--- a/tests/parser/parserTests.txt
+++ b/tests/parser/parserTests.txt
@@ -190,7 +190,11 @@ a
 b
 ----
 a
-<!--foo--><!--More than 1 comment disables stripping of this line!-->
+<!--foo--><!--More than 1 comment, still stripped-->
+b
+----
+a
+ <!--foo--> <!----> <!-- bar --> 
 b
 ----
 a
@@ -222,7 +226,11 @@ b
 </p>
 <hr />
 <p>a
-</p><p>b
+b
+</p>
+<hr />
+<p>a
+b
 </p>
 <hr />
 <p>a
@@ -282,7 +290,7 @@ a
 b
 ----
 a
- <!--foo--><!--More than 1 comment disables stripping of this line!-->
+ <!--foo--><!--More than 1 comment doesn't disable stripping of this line!-->
 b
 ----
 a
@@ -305,7 +313,7 @@ b
 </p>
 <hr />
 <p>a
-</p><p>b
+b
 </p>
 <hr />
 <p>a
@@ -5424,21 +5432,20 @@ Multiple list tags generated by templates
 !!end
 
 !!test
-Single-comment whitespace lines dont break lists, but multi-comment whitespace lines do
+Single-comment whitespace lines dont break lists, and so do multi-comment whitespace lines
 !!input
 *a
 <!--This line will NOT split the list-->
 *b
  <!--This line will NOT split the list either-->
 *c
- <!--foo--> <!--This line with more than 1 comment will split the list-->
+ <!--foo--> <!----> <!--This line NOT split the list either--> 
 *d
 !!result
 <ul><li>a
 </li><li>b
 </li><li>c
-</li></ul>
-<ul><li>d
+</li><li>d
 </li></ul>
 
 !!end
-- 
2.20.1