Preprocessor: Don't treat a line containing multiple comments as a blank line.
authorC. Scott Ananian <cscott@cscott.net>
Wed, 7 Aug 2013 00:21:00 +0000 (20:21 -0400)
committerTim Starling <tstarling@wikimedia.org>
Fri, 9 Aug 2013 00:28:28 +0000 (00:28 +0000)
After this patch, 'a', 'b', and 'c' are all treated as members of the
same list in the following wikitext:

*a
 <!--x-->
*b
 <!--x--> <!--y-->
*c

The old comment-removal rule was "trim a comment which is both
preceded and followed by a newline (ignoring spaces)".  This only works
if there is a single comment on the line, and was often surprising
to users.  The new rule allows any number of whitespace-separated
comments on the line.

Bug: 41756
Change-Id: I6030086226e1eeece59643c29dbb4361668b4bd6

RELEASE-NOTES-1.22
includes/parser/Preprocessor_DOM.php
includes/parser/Preprocessor_Hash.php
tests/parser/parserTests.txt

index 5637eb0..af467a0 100644 (file)
@@ -252,6 +252,7 @@ production.
 * (bug 49694) $wgSpamRegex is now also applied on the new section headline text
   adding a new topic on a page
 * (bug 6200) line breaks in <blockquote> are handled like they are in <div>
+* (bug 41756) Improve treatment of multiple comments on a blank line.
 
 === API changes in 1.22 ===
 * (bug 25553) The JSON output formatter now leaves forward slashes unescaped
index 809e7f7..c9e16b3 100644 (file)
@@ -361,9 +361,11 @@ class Preprocessor_DOM implements Preprocessor {
                                }
                                // Handle comments
                                if ( isset( $matches[2] ) && $matches[2] == '!--' ) {
-                                       // To avoid leaving blank lines, when a comment is both preceded
-                                       // and followed by a newline (ignoring spaces), trim leading and
-                                       // trailing spaces and one of the newlines.
+
+                                       // To avoid leaving blank lines, when a sequence of
+                                       // space-separated comments is both preceded and followed by
+                                       // a newline (ignoring spaces), then
+                                       // trim leading and trailing spaces and the trailing newline.
 
                                        // Find the end
                                        $endPos = strpos( $text, '-->', $i + 4 );
@@ -375,9 +377,24 @@ class Preprocessor_DOM implements Preprocessor {
                                        } else {
                                                // Search backwards for leading whitespace
                                                $wsStart = $i ? ( $i - strspn( $revText, ' ', $lengthText - $i ) ) : 0;
+
                                                // Search forwards for trailing whitespace
                                                // $wsEnd will be the position of the last space (or the '>' if there's none)
                                                $wsEnd = $endPos + 2 + strspn( $text, ' ', $endPos + 3 );
+
+                                               // Keep looking forward as long as we're finding more
+                                               // comments.
+                                               $comments = array( array( $wsStart, $wsEnd ) );
+                                               while ( substr( $text, $wsEnd + 1, 4 ) == '<!--' ) {
+                                                       $c = strpos( $text, '-->', $wsEnd + 4 );
+                                                       if ( $c === false ) {
+                                                               break;
+                                                       }
+                                                       $c = $c + 2 + strspn( $text, ' ', $c + 3 );
+                                                       $comments[] = array( $wsEnd + 1, $c );
+                                                       $wsEnd = $c;
+                                               }
+
                                                // Eat the line if possible
                                                // TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at
                                                // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
@@ -385,14 +402,24 @@ class Preprocessor_DOM implements Preprocessor {
                                                if ( $wsStart > 0 && substr( $text, $wsStart - 1, 1 ) == "\n"
                                                        && substr( $text, $wsEnd + 1, 1 ) == "\n" )
                                                {
-                                                       $startPos = $wsStart;
-                                                       $endPos = $wsEnd + 1;
                                                        // Remove leading whitespace from the end of the accumulator
                                                        // Sanity check first though
                                                        $wsLength = $i - $wsStart;
                                                        if ( $wsLength > 0 && substr( $accum, -$wsLength ) === str_repeat( ' ', $wsLength ) ) {
                                                                $accum = substr( $accum, 0, -$wsLength );
                                                        }
+
+                                                       // Dump all but the last comment to the accumulator
+                                                       foreach ( $comments as $j => $com ) {
+                                                               $startPos = $com[0];
+                                                               $endPos = $com[1] + 1;
+                                                               if ( $j == ( count( $comments ) - 1) ) {
+                                                                       break;
+                                                               }
+                                                               $inner = substr( $text, $startPos, $endPos - $startPos);
+                                                               $accum .= '<comment>' . htmlspecialchars( $inner ) . '</comment>';
+                                                       }
+
                                                        // Do a line-start run next time to look for headings after the comment
                                                        $fakeLineStart = true;
                                                } else {
index 654a66e..333b70d 100644 (file)
@@ -287,9 +287,11 @@ class Preprocessor_Hash implements Preprocessor {
                                }
                                // Handle comments
                                if ( isset( $matches[2] ) && $matches[2] == '!--' ) {
-                                       // To avoid leaving blank lines, when a comment is both preceded
-                                       // and followed by a newline (ignoring spaces), trim leading and
-                                       // trailing spaces and one of the newlines.
+
+                                       // To avoid leaving blank lines, when a sequence of
+                                       // space-separated comments is both preceded and followed by
+                                       // a newline (ignoring spaces), then
+                                       // trim leading and trailing spaces and the trailing newline.
 
                                        // Find the end
                                        $endPos = strpos( $text, '-->', $i + 4 );
@@ -301,9 +303,24 @@ class Preprocessor_Hash implements Preprocessor {
                                        } else {
                                                // Search backwards for leading whitespace
                                                $wsStart = $i ? ( $i - strspn( $revText, ' ', $lengthText - $i ) ) : 0;
+
                                                // Search forwards for trailing whitespace
                                                // $wsEnd will be the position of the last space (or the '>' if there's none)
                                                $wsEnd = $endPos + 2 + strspn( $text, ' ', $endPos + 3 );
+
+                                               // Keep looking forward as long as we're finding more
+                                               // comments.
+                                               $comments = array( array( $wsStart, $wsEnd ) );
+                                               while ( substr( $text, $wsEnd + 1, 4 ) == '<!--' ) {
+                                                       $c = strpos( $text, '-->', $wsEnd + 4 );
+                                                       if ( $c === false ) {
+                                                               break;
+                                                       }
+                                                       $c = $c + 2 + strspn( $text, ' ', $c + 3 );
+                                                       $comments[] = array( $wsEnd + 1, $c );
+                                                       $wsEnd = $c;
+                                               }
+
                                                // Eat the line if possible
                                                // TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at
                                                // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
@@ -311,8 +328,6 @@ class Preprocessor_Hash implements Preprocessor {
                                                if ( $wsStart > 0 && substr( $text, $wsStart - 1, 1 ) == "\n"
                                                        && substr( $text, $wsEnd + 1, 1 ) == "\n" )
                                                {
-                                                       $startPos = $wsStart;
-                                                       $endPos = $wsEnd + 1;
                                                        // Remove leading whitespace from the end of the accumulator
                                                        // Sanity check first though
                                                        $wsLength = $i - $wsStart;
@@ -322,6 +337,18 @@ class Preprocessor_Hash implements Preprocessor {
                                                        {
                                                                $accum->lastNode->value = substr( $accum->lastNode->value, 0, -$wsLength );
                                                        }
+
+                                                       // Dump all but the last comment to the accumulator
+                                                       foreach ( $comments as $j => $com ) {
+                                                               $startPos = $com[0];
+                                                               $endPos = $com[1] + 1;
+                                                               if ( $j == ( count( $comments ) - 1) ) {
+                                                                       break;
+                                                               }
+                                                               $inner = substr( $text, $startPos, $endPos - $startPos);
+                                                               $accum->addNodeWithText( 'comment', $inner );
+                                                       }
+
                                                        // Do a line-start run next time to look for headings after the comment
                                                        $fakeLineStart = true;
                                                } else {
index cdbd7e0..128e25d 100644 (file)
@@ -190,7 +190,11 @@ a
 b
 ----
 a
-<!--foo--><!--More than 1 comment disables stripping of this line!-->
+<!--foo--><!--More than 1 comment, still stripped-->
+b
+----
+a
+ <!--foo--> <!----> <!-- bar --> 
 b
 ----
 a
@@ -222,7 +226,11 @@ b
 </p>
 <hr />
 <p>a
-</p><p>b
+b
+</p>
+<hr />
+<p>a
+b
 </p>
 <hr />
 <p>a
@@ -282,7 +290,7 @@ a
 b
 ----
 a
- <!--foo--><!--More than 1 comment disables stripping of this line!-->
+ <!--foo--><!--More than 1 comment doesn't disable stripping of this line!-->
 b
 ----
 a
@@ -305,7 +313,7 @@ b
 </p>
 <hr />
 <p>a
-</p><p>b
+b
 </p>
 <hr />
 <p>a
@@ -5424,21 +5432,20 @@ Multiple list tags generated by templates
 !!end
 
 !!test
-Single-comment whitespace lines dont break lists, but multi-comment whitespace lines do
+Single-comment whitespace lines dont break lists, and so do multi-comment whitespace lines
 !!input
 *a
 <!--This line will NOT split the list-->
 *b
  <!--This line will NOT split the list either-->
 *c
- <!--foo--> <!--This line with more than 1 comment will split the list-->
+ <!--foo--> <!----> <!--This line NOT split the list either--> 
 *d
 !!result
 <ul><li>a
 </li><li>b
 </li><li>c
-</li></ul>
-<ul><li>d
+</li><li>d
 </li></ul>
 
 !!end