Preserve whitespace in search index text content

author Erik Bernhardson <ebernhardson@wikimedia.org>

Mon, 10 Sep 2018 23:39:09 +0000 (16:39 -0700)

committer Erik Bernhardson <ebernhardson@wikimedia.org>

Fri, 14 Sep 2018 18:10:35 +0000 (11:10 -0700)
author Erik Bernhardson <ebernhardson@wikimedia.org>
Mon, 10 Sep 2018 23:39:09 +0000 (16:39 -0700)
committer Erik Bernhardson <ebernhardson@wikimedia.org>
Fri, 14 Sep 2018 18:10:35 +0000 (11:10 -0700)
diff --git a/includes/content/WikiTextStructure.php b/includes/content/WikiTextStructure.php

index 1128d7b..a82ffa1 100644 (file)
--- a/includes/content/WikiTextStructure.php
+++ b/includes/content/WikiTextStructure.php
@@ -161,10 +161,6 @@ class WikiTextStructure {
  
                 $this->openingText = $this->extractHeadingBeforeFirstHeading( $text );
  
-               // Add extra spacing around break tags so text crammed together like<br>this
-               // doesn't make one word.
-               $text = str_replace( '<br', "\n<br", $text );
-
                 $formatter = new HtmlFormatter( $text );
  
                 // Strip elements from the page that we never want in the search text.
diff --git a/includes/parser/RemexStripTagHandler.php b/includes/parser/RemexStripTagHandler.php

index 2839147..41c6bf4 100644 (file)
--- a/includes/parser/RemexStripTagHandler.php
+++ b/includes/parser/RemexStripTagHandler.php
@@ -26,10 +26,18 @@ class RemexStripTagHandler implements TokenHandler {
                 $this->text .= substr( $text, $start, $length );
         }
         function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) {
-               // Do nothing.
+               // Inject whitespace for typical block-level tags to
+               // prevent merging unrelated<br>words.
+               if ( $this->isBlockLevelTag( $name ) ) {
+                       $this->text .= ' ';
+               }
         }
         function endTag( $name, $sourceStart, $sourceLength ) {
-               // Do nothing.
+               // Inject whitespace for typical block-level tags to
+               // prevent merging unrelated<br>words.
+               if ( $this->isBlockLevelTag( $name ) ) {
+                       $this->text .= ' ';
+               }
         }
         function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
                 // Do nothing.
@@ -37,4 +45,63 @@ class RemexStripTagHandler implements TokenHandler {
         function comment( $text, $sourceStart, $sourceLength ) {
                 // Do nothing.
         }
+
+       // Per https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
+       // retrieved on sept 12, 2018. <br> is not block level but was added anyways.
+       // The following is a complete list of all HTML block level elements
+       // (although "block-level" is not technically defined for elements that are
+       // new in HTML5).
+       // Structured as tag => true to allow O(1) membership test.
+       static private $BLOCK_LEVEL_TAGS = [
+               'address' => true,
+               'article' => true,
+               'aside' => true,
+               'blockquote' => true,
+               'br' => true,
+               'canvas' => true,
+               'dd' => true,
+               'div' => true,
+               'dl' => true,
+               'dt' => true,
+               'fieldset' => true,
+               'figcaption' => true,
+               'figure' => true,
+               'figcaption' => true,
+               'footer' => true,
+               'form' => true,
+               'h1' => true,
+               'h2' => true,
+               'h3' => true,
+               'h4' => true,
+               'h5' => true,
+               'h6' => true,
+               'header' => true,
+               'hgroup' => true,
+               'hr' => true,
+               'li' => true,
+               'main' => true,
+               'nav' => true,
+               'noscript' => true,
+               'ol' => true,
+               'output' => true,
+               'p' => true,
+               'pre' => true,
+               'section' => true,
+               'table' => true,
+               'tfoot' => true,
+               'ul' => true,
+               'video' => true,
+       ];
+
+       /**
+        * Detect block level tags. Of course css can make anything a block
+        * level tag, but this is still better than nothing.
+        *
+        * @param string $tagName HTML tag name
+        * @return bool True when tag is an html block level element
+        */
+       private function isBlockLevelTag( $tagName ) {
+               $key = strtolower( trim( $tagName ) );
+               return isset( self::$BLOCK_LEVEL_TAGS[$key] );
+       }
  }
diff --git a/includes/parser/Sanitizer.php b/includes/parser/Sanitizer.php

index d885e24..85c71ee 100644 (file)
--- a/includes/parser/Sanitizer.php
+++ b/includes/parser/Sanitizer.php
@@ -1508,10 +1508,10 @@ class Sanitizer {
          * @return string
          */
         private static function normalizeWhitespace( $text ) {
-               return preg_replace(
-                       '/\r\n|[\x20\x0d\x0a\x09]/',
+               return trim( preg_replace(
+                       '/(?:\r\n|[\x20\x0d\x0a\x09])+/',
                         ' ',
-                       $text );
+                       $text ) );
         }
  
         /**
diff --git a/tests/phpunit/includes/content/WikitextStructureTest.php b/tests/phpunit/includes/content/WikitextStructureTest.php

index 88f4d8f..607549f 100644 (file)
--- a/tests/phpunit/includes/content/WikitextStructureTest.php
+++ b/tests/phpunit/includes/content/WikitextStructureTest.php
@@ -102,9 +102,15 @@ Then we got more<br>text
  END;
                 $struct = $this->getStructure( $text );
                 $this->assertEquals( "Opening text is opening.", $struct->getOpeningText() );
-               $this->assertEquals( "Opening text is opening.   Then we got more text",
+               $this->assertEquals( "Opening text is opening. Then we got more text",
                         $struct->getMainText() );
                 $this->assertEquals( [ "Header table row in table another row in table" ],
                         $struct->getAuxiliaryText() );
         }
+
+       public function testPreservesWordSpacing() {
+               $text = "<dd><dl>foo</dl><dl>bar</dl></dd><p>baz</p>";
+               $struct = $this->getStructure( $text );
+               $this->assertEquals( "foo bar baz", $struct->getMainText() );
+       }
  }
diff --git a/tests/phpunit/includes/parser/SanitizerTest.php b/tests/phpunit/includes/parser/SanitizerTest.php

index b5965c4..a8b0f90 100644 (file)
--- a/tests/phpunit/includes/parser/SanitizerTest.php
+++ b/tests/phpunit/includes/parser/SanitizerTest.php
@@ -514,7 +514,7 @@ class SanitizerTest extends MediaWikiTestCase {
         public function provideStripAllTags() {
                 return [
                         [ '<p>Foo</p>', 'Foo' ],
-                       [ '<p id="one">Foo</p><p id="two">Bar</p>', 'FooBar' ],
+                       [ '<p id="one">Foo</p><p id="two">Bar</p>', 'Foo Bar' ],
                         [ "<p>Foo</p>\n<p>Bar</p>", 'Foo Bar' ],
                         [ '<p>Hello &lt;strong&gt; wor&#x6c;&#100; caf&eacute;</p>', 'Hello <strong> world café' ],
                         [
author	Erik Bernhardson <ebernhardson@wikimedia.org>
	Mon, 10 Sep 2018 23:39:09 +0000 (16:39 -0700)
committer	Erik Bernhardson <ebernhardson@wikimedia.org>
	Fri, 14 Sep 2018 18:10:35 +0000 (11:10 -0700)
includes/content/WikiTextStructure.php		patch \| blob \| history
includes/parser/RemexStripTagHandler.php		patch \| blob \| history
includes/parser/Sanitizer.php		patch \| blob \| history
tests/phpunit/includes/content/WikitextStructureTest.php		patch \| blob \| history
tests/phpunit/includes/parser/SanitizerTest.php		patch \| blob \| history