From 0d779c1ac6ee790663e4a97027d0f044522dfaa2 Mon Sep 17 00:00:00 2001 From: Erik Bernhardson Date: Mon, 10 Sep 2018 16:39:09 -0700 Subject: [PATCH] Preserve whitespace in search index text content Certain html tags imply a word break, but our html stripping doesn't understand that at all. Adjust the html stripping to inject whitespace for all block level tags (per MDN) along with the
element. Bug: T195389 Change-Id: I9fbfac765ea88628e4f9b2794fb54e1cd0060203 --- includes/content/WikiTextStructure.php | 4 -- includes/parser/RemexStripTagHandler.php | 71 ++++++++++++++++++- includes/parser/Sanitizer.php | 6 +- .../content/WikitextStructureTest.php | 8 ++- .../phpunit/includes/parser/SanitizerTest.php | 2 +- 5 files changed, 80 insertions(+), 11 deletions(-) diff --git a/includes/content/WikiTextStructure.php b/includes/content/WikiTextStructure.php index 1128d7bd36..a82ffa1b5a 100644 --- a/includes/content/WikiTextStructure.php +++ b/includes/content/WikiTextStructure.php @@ -161,10 +161,6 @@ class WikiTextStructure { $this->openingText = $this->extractHeadingBeforeFirstHeading( $text ); - // Add extra spacing around break tags so text crammed together like
this - // doesn't make one word. - $text = str_replace( 'text .= substr( $text, $start, $length ); } function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) { - // Do nothing. + // Inject whitespace for typical block-level tags to + // prevent merging unrelated
words. + if ( $this->isBlockLevelTag( $name ) ) { + $this->text .= ' '; + } } function endTag( $name, $sourceStart, $sourceLength ) { - // Do nothing. + // Inject whitespace for typical block-level tags to + // prevent merging unrelated
words. + if ( $this->isBlockLevelTag( $name ) ) { + $this->text .= ' '; + } } function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) { // Do nothing. @@ -37,4 +45,63 @@ class RemexStripTagHandler implements TokenHandler { function comment( $text, $sourceStart, $sourceLength ) { // Do nothing. } + + // Per https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements + // retrieved on sept 12, 2018.
is not block level but was added anyways. + // The following is a complete list of all HTML block level elements + // (although "block-level" is not technically defined for elements that are + // new in HTML5). + // Structured as tag => true to allow O(1) membership test. + static private $BLOCK_LEVEL_TAGS = [ + 'address' => true, + 'article' => true, + 'aside' => true, + 'blockquote' => true, + 'br' => true, + 'canvas' => true, + 'dd' => true, + 'div' => true, + 'dl' => true, + 'dt' => true, + 'fieldset' => true, + 'figcaption' => true, + 'figure' => true, + 'figcaption' => true, + 'footer' => true, + 'form' => true, + 'h1' => true, + 'h2' => true, + 'h3' => true, + 'h4' => true, + 'h5' => true, + 'h6' => true, + 'header' => true, + 'hgroup' => true, + 'hr' => true, + 'li' => true, + 'main' => true, + 'nav' => true, + 'noscript' => true, + 'ol' => true, + 'output' => true, + 'p' => true, + 'pre' => true, + 'section' => true, + 'table' => true, + 'tfoot' => true, + 'ul' => true, + 'video' => true, + ]; + + /** + * Detect block level tags. Of course css can make anything a block + * level tag, but this is still better than nothing. + * + * @param string $tagName HTML tag name + * @return bool True when tag is an html block level element + */ + private function isBlockLevelTag( $tagName ) { + $key = strtolower( trim( $tagName ) ); + return isset( self::$BLOCK_LEVEL_TAGS[$key] ); + } } diff --git a/includes/parser/Sanitizer.php b/includes/parser/Sanitizer.php index d885e24c25..85c71eeb44 100644 --- a/includes/parser/Sanitizer.php +++ b/includes/parser/Sanitizer.php @@ -1508,10 +1508,10 @@ class Sanitizer { * @return string */ private static function normalizeWhitespace( $text ) { - return preg_replace( - '/\r\n|[\x20\x0d\x0a\x09]/', + return trim( preg_replace( + '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', - $text ); + $text ) ); } /** diff --git a/tests/phpunit/includes/content/WikitextStructureTest.php b/tests/phpunit/includes/content/WikitextStructureTest.php index 88f4d8f74c..607549f9da 100644 --- a/tests/phpunit/includes/content/WikitextStructureTest.php +++ b/tests/phpunit/includes/content/WikitextStructureTest.php @@ -102,9 +102,15 @@ Then we got more
text END; $struct = $this->getStructure( $text ); $this->assertEquals( "Opening text is opening.", $struct->getOpeningText() ); - $this->assertEquals( "Opening text is opening. Then we got more text", + $this->assertEquals( "Opening text is opening. Then we got more text", $struct->getMainText() ); $this->assertEquals( [ "Header table row in table another row in table" ], $struct->getAuxiliaryText() ); } + + public function testPreservesWordSpacing() { + $text = "
foo
bar

baz

"; + $struct = $this->getStructure( $text ); + $this->assertEquals( "foo bar baz", $struct->getMainText() ); + } } diff --git a/tests/phpunit/includes/parser/SanitizerTest.php b/tests/phpunit/includes/parser/SanitizerTest.php index b5965c480c..a8b0f90a3a 100644 --- a/tests/phpunit/includes/parser/SanitizerTest.php +++ b/tests/phpunit/includes/parser/SanitizerTest.php @@ -514,7 +514,7 @@ class SanitizerTest extends MediaWikiTestCase { public function provideStripAllTags() { return [ [ '

Foo

', 'Foo' ], - [ '

Foo

Bar

', 'FooBar' ], + [ '

Foo

Bar

', 'Foo Bar' ], [ "

Foo

\n

Bar

", 'Foo Bar' ], [ '

Hello <strong> world café

', 'Hello world café' ], [ -- 2.20.1