Preprocessor: Don't treat a line containing multiple comments as a blank line.

author C. Scott Ananian <cscott@cscott.net>

Wed, 7 Aug 2013 00:21:00 +0000 (20:21 -0400)

committer Tim Starling <tstarling@wikimedia.org>

Fri, 9 Aug 2013 00:28:28 +0000 (00:28 +0000)
author C. Scott Ananian <cscott@cscott.net>
Wed, 7 Aug 2013 00:21:00 +0000 (20:21 -0400)
committer Tim Starling <tstarling@wikimedia.org>
Fri, 9 Aug 2013 00:28:28 +0000 (00:28 +0000)
diff --git a/RELEASE-NOTES-1.22 b/RELEASE-NOTES-1.22

index 5637eb0..af467a0 100644 (file)
--- a/RELEASE-NOTES-1.22
+++ b/RELEASE-NOTES-1.22
@@ -252,6 +252,7 @@ production.
  * (bug 49694) $wgSpamRegex is now also applied on the new section headline text
    adding a new topic on a page
  * (bug 6200) line breaks in <blockquote> are handled like they are in <div>
+* (bug 41756) Improve treatment of multiple comments on a blank line.
  
  === API changes in 1.22 ===
  * (bug 25553) The JSON output formatter now leaves forward slashes unescaped
diff --git a/includes/parser/Preprocessor_DOM.php b/includes/parser/Preprocessor_DOM.php

index 809e7f7..c9e16b3 100644 (file)
--- a/includes/parser/Preprocessor_DOM.php
+++ b/includes/parser/Preprocessor_DOM.php
@@ -361,9 +361,11 @@ class Preprocessor_DOM implements Preprocessor {
                                 }
                                 // Handle comments
                                 if ( isset( $matches[2] ) && $matches[2] == '!--' ) {
-                                       // To avoid leaving blank lines, when a comment is both preceded
-                                       // and followed by a newline (ignoring spaces), trim leading and
-                                       // trailing spaces and one of the newlines.
+
+                                       // To avoid leaving blank lines, when a sequence of
+                                       // space-separated comments is both preceded and followed by
+                                       // a newline (ignoring spaces), then
+                                       // trim leading and trailing spaces and the trailing newline.
  
                                         // Find the end
                                         $endPos = strpos( $text, '-->', $i + 4 );
@@ -375,9 +377,24 @@ class Preprocessor_DOM implements Preprocessor {
                                         } else {
                                                 // Search backwards for leading whitespace
                                                 $wsStart = $i ? ( $i - strspn( $revText, ' ', $lengthText - $i ) ) : 0;
+
                                                 // Search forwards for trailing whitespace
                                                 // $wsEnd will be the position of the last space (or the '>' if there's none)
                                                 $wsEnd = $endPos + 2 + strspn( $text, ' ', $endPos + 3 );
+
+                                               // Keep looking forward as long as we're finding more
+                                               // comments.
+                                               $comments = array( array( $wsStart, $wsEnd ) );
+                                               while ( substr( $text, $wsEnd + 1, 4 ) == '<!--' ) {
+                                                       $c = strpos( $text, '-->', $wsEnd + 4 );
+                                                       if ( $c === false ) {
+                                                               break;
+                                                       }
+                                                       $c = $c + 2 + strspn( $text, ' ', $c + 3 );
+                                                       $comments[] = array( $wsEnd + 1, $c );
+                                                       $wsEnd = $c;
+                                               }
+
                                                 // Eat the line if possible
                                                 // TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at
                                                 // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
@@ -385,14 +402,24 @@ class Preprocessor_DOM implements Preprocessor {
                                                 if ( $wsStart > 0 && substr( $text, $wsStart - 1, 1 ) == "\n"
                                                         && substr( $text, $wsEnd + 1, 1 ) == "\n" )
                                                 {
-                                                       $startPos = $wsStart;
-                                                       $endPos = $wsEnd + 1;
                                                         // Remove leading whitespace from the end of the accumulator
                                                         // Sanity check first though
                                                         $wsLength = $i - $wsStart;
                                                         if ( $wsLength > 0 && substr( $accum, -$wsLength ) === str_repeat( ' ', $wsLength ) ) {
                                                                 $accum = substr( $accum, 0, -$wsLength );
                                                         }
+
+                                                       // Dump all but the last comment to the accumulator
+                                                       foreach ( $comments as $j => $com ) {
+                                                               $startPos = $com[0];
+                                                               $endPos = $com[1] + 1;
+                                                               if ( $j == ( count( $comments ) - 1) ) {
+                                                                       break;
+                                                               }
+                                                               $inner = substr( $text, $startPos, $endPos - $startPos);
+                                                               $accum .= '<comment>' . htmlspecialchars( $inner ) . '</comment>';
+                                                       }
+
                                                         // Do a line-start run next time to look for headings after the comment
                                                         $fakeLineStart = true;
                                                 } else {
diff --git a/includes/parser/Preprocessor_Hash.php b/includes/parser/Preprocessor_Hash.php

index 654a66e..333b70d 100644 (file)
--- a/includes/parser/Preprocessor_Hash.php
+++ b/includes/parser/Preprocessor_Hash.php
@@ -287,9 +287,11 @@ class Preprocessor_Hash implements Preprocessor {
                                 }
                                 // Handle comments
                                 if ( isset( $matches[2] ) && $matches[2] == '!--' ) {
-                                       // To avoid leaving blank lines, when a comment is both preceded
-                                       // and followed by a newline (ignoring spaces), trim leading and
-                                       // trailing spaces and one of the newlines.
+
+                                       // To avoid leaving blank lines, when a sequence of
+                                       // space-separated comments is both preceded and followed by
+                                       // a newline (ignoring spaces), then
+                                       // trim leading and trailing spaces and the trailing newline.
  
                                         // Find the end
                                         $endPos = strpos( $text, '-->', $i + 4 );
@@ -301,9 +303,24 @@ class Preprocessor_Hash implements Preprocessor {
                                         } else {
                                                 // Search backwards for leading whitespace
                                                 $wsStart = $i ? ( $i - strspn( $revText, ' ', $lengthText - $i ) ) : 0;
+
                                                 // Search forwards for trailing whitespace
                                                 // $wsEnd will be the position of the last space (or the '>' if there's none)
                                                 $wsEnd = $endPos + 2 + strspn( $text, ' ', $endPos + 3 );
+
+                                               // Keep looking forward as long as we're finding more
+                                               // comments.
+                                               $comments = array( array( $wsStart, $wsEnd ) );
+                                               while ( substr( $text, $wsEnd + 1, 4 ) == '<!--' ) {
+                                                       $c = strpos( $text, '-->', $wsEnd + 4 );
+                                                       if ( $c === false ) {
+                                                               break;
+                                                       }
+                                                       $c = $c + 2 + strspn( $text, ' ', $c + 3 );
+                                                       $comments[] = array( $wsEnd + 1, $c );
+                                                       $wsEnd = $c;
+                                               }
+
                                                 // Eat the line if possible
                                                 // TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at
                                                 // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
@@ -311,8 +328,6 @@ class Preprocessor_Hash implements Preprocessor {
                                                 if ( $wsStart > 0 && substr( $text, $wsStart - 1, 1 ) == "\n"
                                                         && substr( $text, $wsEnd + 1, 1 ) == "\n" )
                                                 {
-                                                       $startPos = $wsStart;
-                                                       $endPos = $wsEnd + 1;
                                                         // Remove leading whitespace from the end of the accumulator
                                                         // Sanity check first though
                                                         $wsLength = $i - $wsStart;
@@ -322,6 +337,18 @@ class Preprocessor_Hash implements Preprocessor {
                                                         {
                                                                 $accum->lastNode->value = substr( $accum->lastNode->value, 0, -$wsLength );
                                                         }
+
+                                                       // Dump all but the last comment to the accumulator
+                                                       foreach ( $comments as $j => $com ) {
+                                                               $startPos = $com[0];
+                                                               $endPos = $com[1] + 1;
+                                                               if ( $j == ( count( $comments ) - 1) ) {
+                                                                       break;
+                                                               }
+                                                               $inner = substr( $text, $startPos, $endPos - $startPos);
+                                                               $accum->addNodeWithText( 'comment', $inner );
+                                                       }
+
                                                         // Do a line-start run next time to look for headings after the comment
                                                         $fakeLineStart = true;
                                                 } else {
diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt

index cdbd7e0..128e25d 100644 (file)
--- a/tests/parser/parserTests.txt
+++ b/tests/parser/parserTests.txt
@@ -190,7 +190,11 @@ a
  b
  ----
  a
-<!--foo--><!--More than 1 comment disables stripping of this line!-->
+<!--foo--><!--More than 1 comment, still stripped-->
+b
+----
+a
+ <!--foo--> <!----> <!-- bar --> 
  b
  ----
  a
@@ -222,7 +226,11 @@ b
  </p>
  <hr />
  <p>a
-</p><p>b
+b
+</p>
+<hr />
+<p>a
+b
  </p>
  <hr />
  <p>a
@@ -282,7 +290,7 @@ a
  b
  ----
  a
- <!--foo--><!--More than 1 comment disables stripping of this line!-->
+ <!--foo--><!--More than 1 comment doesn't disable stripping of this line!-->
  b
  ----
  a
@@ -305,7 +313,7 @@ b
  </p>
  <hr />
  <p>a
-</p><p>b
+b
  </p>
  <hr />
  <p>a
@@ -5424,21 +5432,20 @@ Multiple list tags generated by templates
  !!end
  
  !!test
-Single-comment whitespace lines dont break lists, but multi-comment whitespace lines do
+Single-comment whitespace lines dont break lists, and so do multi-comment whitespace lines
  !!input
  *a
  <!--This line will NOT split the list-->
  *b
   <!--This line will NOT split the list either-->
  *c
- <!--foo--> <!--This line with more than 1 comment will split the list-->
+ <!--foo--> <!----> <!--This line NOT split the list either--> 
  *d
  !!result
  <ul><li>a
  </li><li>b
  </li><li>c
-</li></ul>
-<ul><li>d
+</li><li>d
  </li></ul>
  
  !!end
author	C. Scott Ananian <cscott@cscott.net>
	Wed, 7 Aug 2013 00:21:00 +0000 (20:21 -0400)
committer	Tim Starling <tstarling@wikimedia.org>
	Fri, 9 Aug 2013 00:28:28 +0000 (00:28 +0000)
RELEASE-NOTES-1.22		patch \| blob \| history
includes/parser/Preprocessor_DOM.php		patch \| blob \| history
includes/parser/Preprocessor_Hash.php		patch \| blob \| history
tests/parser/parserTests.txt		patch \| blob \| history