Preserve whitespace in search index text content
authorErik Bernhardson <ebernhardson@wikimedia.org>
Mon, 10 Sep 2018 23:39:09 +0000 (16:39 -0700)
committerErik Bernhardson <ebernhardson@wikimedia.org>
Fri, 14 Sep 2018 18:10:35 +0000 (11:10 -0700)
Certain html tags imply a word break, but our html stripping doesn't
understand that at all. Adjust the html stripping to inject whitespace
for all block level tags (per MDN) along with the <br> element.

Bug: T195389
Change-Id: I9fbfac765ea88628e4f9b2794fb54e1cd0060203

includes/content/WikiTextStructure.php
includes/parser/RemexStripTagHandler.php
includes/parser/Sanitizer.php
tests/phpunit/includes/content/WikitextStructureTest.php
tests/phpunit/includes/parser/SanitizerTest.php

index 1128d7b..a82ffa1 100644 (file)
@@ -161,10 +161,6 @@ class WikiTextStructure {
 
                $this->openingText = $this->extractHeadingBeforeFirstHeading( $text );
 
-               // Add extra spacing around break tags so text crammed together like<br>this
-               // doesn't make one word.
-               $text = str_replace( '<br', "\n<br", $text );
-
                $formatter = new HtmlFormatter( $text );
 
                // Strip elements from the page that we never want in the search text.
index 2839147..41c6bf4 100644 (file)
@@ -26,10 +26,18 @@ class RemexStripTagHandler implements TokenHandler {
                $this->text .= substr( $text, $start, $length );
        }
        function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) {
-               // Do nothing.
+               // Inject whitespace for typical block-level tags to
+               // prevent merging unrelated<br>words.
+               if ( $this->isBlockLevelTag( $name ) ) {
+                       $this->text .= ' ';
+               }
        }
        function endTag( $name, $sourceStart, $sourceLength ) {
-               // Do nothing.
+               // Inject whitespace for typical block-level tags to
+               // prevent merging unrelated<br>words.
+               if ( $this->isBlockLevelTag( $name ) ) {
+                       $this->text .= ' ';
+               }
        }
        function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
                // Do nothing.
@@ -37,4 +45,63 @@ class RemexStripTagHandler implements TokenHandler {
        function comment( $text, $sourceStart, $sourceLength ) {
                // Do nothing.
        }
+
+       // Per https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
+       // retrieved on sept 12, 2018. <br> is not block level but was added anyways.
+       // The following is a complete list of all HTML block level elements
+       // (although "block-level" is not technically defined for elements that are
+       // new in HTML5).
+       // Structured as tag => true to allow O(1) membership test.
+       static private $BLOCK_LEVEL_TAGS = [
+               'address' => true,
+               'article' => true,
+               'aside' => true,
+               'blockquote' => true,
+               'br' => true,
+               'canvas' => true,
+               'dd' => true,
+               'div' => true,
+               'dl' => true,
+               'dt' => true,
+               'fieldset' => true,
+               'figcaption' => true,
+               'figure' => true,
+               'figcaption' => true,
+               'footer' => true,
+               'form' => true,
+               'h1' => true,
+               'h2' => true,
+               'h3' => true,
+               'h4' => true,
+               'h5' => true,
+               'h6' => true,
+               'header' => true,
+               'hgroup' => true,
+               'hr' => true,
+               'li' => true,
+               'main' => true,
+               'nav' => true,
+               'noscript' => true,
+               'ol' => true,
+               'output' => true,
+               'p' => true,
+               'pre' => true,
+               'section' => true,
+               'table' => true,
+               'tfoot' => true,
+               'ul' => true,
+               'video' => true,
+       ];
+
+       /**
+        * Detect block level tags. Of course css can make anything a block
+        * level tag, but this is still better than nothing.
+        *
+        * @param string $tagName HTML tag name
+        * @return bool True when tag is an html block level element
+        */
+       private function isBlockLevelTag( $tagName ) {
+               $key = strtolower( trim( $tagName ) );
+               return isset( self::$BLOCK_LEVEL_TAGS[$key] );
+       }
 }
index d885e24..85c71ee 100644 (file)
@@ -1508,10 +1508,10 @@ class Sanitizer {
         * @return string
         */
        private static function normalizeWhitespace( $text ) {
-               return preg_replace(
-                       '/\r\n|[\x20\x0d\x0a\x09]/',
+               return trim( preg_replace(
+                       '/(?:\r\n|[\x20\x0d\x0a\x09])+/',
                        ' ',
-                       $text );
+                       $text ) );
        }
 
        /**
index 88f4d8f..607549f 100644 (file)
@@ -102,9 +102,15 @@ Then we got more<br>text
 END;
                $struct = $this->getStructure( $text );
                $this->assertEquals( "Opening text is opening.", $struct->getOpeningText() );
-               $this->assertEquals( "Opening text is opening.   Then we got more text",
+               $this->assertEquals( "Opening text is opening. Then we got more text",
                        $struct->getMainText() );
                $this->assertEquals( [ "Header table row in table another row in table" ],
                        $struct->getAuxiliaryText() );
        }
+
+       public function testPreservesWordSpacing() {
+               $text = "<dd><dl>foo</dl><dl>bar</dl></dd><p>baz</p>";
+               $struct = $this->getStructure( $text );
+               $this->assertEquals( "foo bar baz", $struct->getMainText() );
+       }
 }
index b5965c4..a8b0f90 100644 (file)
@@ -514,7 +514,7 @@ class SanitizerTest extends MediaWikiTestCase {
        public function provideStripAllTags() {
                return [
                        [ '<p>Foo</p>', 'Foo' ],
-                       [ '<p id="one">Foo</p><p id="two">Bar</p>', 'FooBar' ],
+                       [ '<p id="one">Foo</p><p id="two">Bar</p>', 'Foo Bar' ],
                        [ "<p>Foo</p>\n<p>Bar</p>", 'Foo Bar' ],
                        [ '<p>Hello &lt;strong&gt; wor&#x6c;&#100; caf&eacute;</p>', 'Hello <strong> world cafĂ©' ],
                        [