From 187ada1cf404f5943f8962c6241aa53eb3fa139c Mon Sep 17 00:00:00 2001 From: David Causse Date: Thu, 29 Jun 2017 10:29:13 +0200 Subject: [PATCH] Fix phrase search Partially revert I61dc536 that broke phrase search support. Fix phrase search by making explicit that there are two kind of legalSearchChars() usecases : - the chars allowed to be part of the search query (including special syntax chars such as " and *). Used by SearchDatabase::filter() to cleanup the whole query string (the default). - the chars allowed to be part of a search term (excluding special syntax chars) Used by search engine implementaions when parsing with a regex. For future reference: Originally this distinction was made "explicit" by calling directly SearchEngine::legalSearchChars() during the parsing stage. This was broken by Iaabc10c by enabling inheritance. This patch adds a new optional param to legalSearchChars to make this more explicit. Also remove the function I introduced in I61dc536 (I wrongly assumed that the disctinction made between legalSearchChars usecases was due to a difference in behavior between indexing and searching). Added more tests to prevent this from happening in the future. Bug: T167798 Change-Id: Ibdc796bb2881a2ed8194099d8c9f491980010f0f --- includes/deferred/SearchUpdate.php | 4 +- includes/search/SearchDatabase.php | 5 +- includes/search/SearchEngine.php | 22 ++++---- includes/search/SearchMssql.php | 2 +- includes/search/SearchMySQL.php | 11 ++-- includes/search/SearchOracle.php | 10 ++-- includes/search/SearchSqlite.php | 11 ++-- .../includes/search/SearchEngineTest.php | 52 ++++++++++++++++--- 8 files changed, 85 insertions(+), 32 deletions(-) diff --git a/includes/deferred/SearchUpdate.php b/includes/deferred/SearchUpdate.php index c94ae2a772..b9a259b1a4 100644 --- a/includes/deferred/SearchUpdate.php +++ b/includes/deferred/SearchUpdate.php @@ -124,7 +124,7 @@ class SearchUpdate implements DeferrableUpdate { # Language-specific strip/conversion $text = $wgContLang->normalizeForSearch( $text ); $se = $se ?: MediaWikiServices::getInstance()->newSearchEngine(); - $lc = $se->legalSearchCharsForUpdate() . '&#;'; + $lc = $se->legalSearchChars() . '&#;'; $text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/", ' ', $wgContLang->lc( " " . $text . " " ) ); # Strip HTML markup @@ -207,7 +207,7 @@ class SearchUpdate implements DeferrableUpdate { $ns = $this->title->getNamespace(); $title = $this->title->getText(); - $lc = $search->legalSearchCharsForUpdate() . '&#;'; + $lc = $search->legalSearchChars() . '&#;'; $t = $wgContLang->normalizeForSearch( $title ); $t = preg_replace( "/[^{$lc}]+/", ' ', $t ); $t = $wgContLang->lc( $t ); diff --git a/includes/search/SearchDatabase.php b/includes/search/SearchDatabase.php index d51e525b60..1d7a4a338e 100644 --- a/includes/search/SearchDatabase.php +++ b/includes/search/SearchDatabase.php @@ -53,7 +53,10 @@ class SearchDatabase extends SearchEngine { * @return string */ protected function filter( $text ) { - $lc = $this->legalSearchChars(); + // List of chars allowed in the search query. + // This must include chars used in the search syntax. + // Usually " (phrase) or * (wildcards) if supported by the engine + $lc = $this->legalSearchChars( self::CHARS_ALL ); return trim( preg_replace( "/[^{$lc}]/", " ", $text ) ); } } diff --git a/includes/search/SearchEngine.php b/includes/search/SearchEngine.php index 7d05265bad..70117db76c 100644 --- a/includes/search/SearchEngine.php +++ b/includes/search/SearchEngine.php @@ -60,6 +60,12 @@ abstract class SearchEngine { /** @const string profile type for query independent ranking features */ const FT_QUERY_INDEP_PROFILE_TYPE = 'fulltextQueryIndepProfile'; + /** @const int flag for legalSearchChars: includes all chars allowed in a search query */ + const CHARS_ALL = 1; + + /** @const int flag for legalSearchChars: includes all chars allowed in a search term */ + const CHARS_NO_SYNTAX = 2; + /** * Perform a full text search query and return a result set. * If full text searches are not supported or disabled, return null. @@ -206,24 +212,16 @@ abstract class SearchEngine { } /** - * Get chars legal for search (at query time). + * Get chars legal for search * NOTE: usage as static is deprecated and preserved only as BC measure + * @param int $type type of search chars (see self::CHARS_ALL + * and self::CHARS_NO_SYNTAX). Defaults to CHARS_ALL * @return string */ - public static function legalSearchChars() { + public static function legalSearchChars( $type = self::CHARS_ALL ) { return "A-Za-z_'.0-9\\x80-\\xFF\\-"; } - /** - * Get chars legal for search (at index time). - * - * @since 1.30 - * @return string - */ - public function legalSearchCharsForUpdate() { - return static::legalSearchChars(); - } - /** * Set the maximum number of results to return * and how many to skip before returning the first. diff --git a/includes/search/SearchMssql.php b/includes/search/SearchMssql.php index 5e8fb044b6..57ca06e39f 100644 --- a/includes/search/SearchMssql.php +++ b/includes/search/SearchMssql.php @@ -136,7 +136,7 @@ class SearchMssql extends SearchDatabase { */ function parseQuery( $filteredText, $fulltext ) { global $wgContLang; - $lc = $this->legalSearchChars(); + $lc = $this->legalSearchChars( self::CHARS_NO_SYNTAX ); $this->searchTerms = []; # @todo FIXME: This doesn't handle parenthetical expressions. diff --git a/includes/search/SearchMySQL.php b/includes/search/SearchMySQL.php index 2c7feeb752..2ea960555b 100644 --- a/includes/search/SearchMySQL.php +++ b/includes/search/SearchMySQL.php @@ -45,7 +45,7 @@ class SearchMySQL extends SearchDatabase { function parseQuery( $filteredText, $fulltext ) { global $wgContLang; - $lc = $this->legalSearchChars(); // Minus format chars + $lc = $this->legalSearchChars( self::CHARS_NO_SYNTAX ); // Minus syntax chars (" and *) $searchon = ''; $this->searchTerms = []; @@ -149,8 +149,13 @@ class SearchMySQL extends SearchDatabase { return $regex; } - public function legalSearchCharsForUpdate() { - return "\"*" . parent::legalSearchCharsForUpdate(); + public static function legalSearchChars( $type = self::CHARS_ALL ) { + $searchChars = parent::legalSearchChars( $type ); + if ( $type === self::CHARS_ALL ) { + // " for phrase, * for wildcard + $searchChars = "\"*" . $searchChars; + } + return $searchChars; } /** diff --git a/includes/search/SearchOracle.php b/includes/search/SearchOracle.php index 2e6cb84ca6..8bcd78fa66 100644 --- a/includes/search/SearchOracle.php +++ b/includes/search/SearchOracle.php @@ -174,7 +174,7 @@ class SearchOracle extends SearchDatabase { */ function parseQuery( $filteredText, $fulltext ) { global $wgContLang; - $lc = $this->legalSearchChars(); + $lc = $this->legalSearchChars( self::CHARS_NO_SYNTAX ); $this->searchTerms = []; # @todo FIXME: This doesn't handle parenthetical expressions. @@ -266,7 +266,11 @@ class SearchOracle extends SearchDatabase { [] ); } - public function legalSearchCharsForUpdate() { - return "\"" . parent::legalSearchCharsForUpdate(); + public static function legalSearchChars( $type = self::CHARS_ALL ) { + $searchChars = parent::legalSearchChars( $type ); + if ( $type === self::CHARS_ALL ) { + $searchChars = "\"" . $searchChars; + } + return $searchChars; } } diff --git a/includes/search/SearchSqlite.php b/includes/search/SearchSqlite.php index 5a8995d745..2c82c7d9a5 100644 --- a/includes/search/SearchSqlite.php +++ b/includes/search/SearchSqlite.php @@ -44,7 +44,7 @@ class SearchSqlite extends SearchDatabase { */ function parseQuery( $filteredText, $fulltext ) { global $wgContLang; - $lc = $this->legalSearchChars(); // Minus format chars + $lc = $this->legalSearchChars( self::CHARS_NO_SYNTAX ); // Minus syntax chars (" and *) $searchon = ''; $this->searchTerms = []; @@ -141,8 +141,13 @@ class SearchSqlite extends SearchDatabase { return $regex; } - public function legalSearchCharsForUpdate() { - return "\"*" . parent::legalSearchCharsForUpdate(); + public static function legalSearchChars( $type = self::CHARS_ALL ) { + $searchChars = parent::legalSearchChars( $type ); + if ( $type === self::CHARS_ALL ) { + // " for phrase, * for wildcard + $searchChars = "\"*" . $searchChars; + } + return $searchChars; } /** diff --git a/tests/phpunit/includes/search/SearchEngineTest.php b/tests/phpunit/includes/search/SearchEngineTest.php index c945d1e217..9711eabb31 100644 --- a/tests/phpunit/includes/search/SearchEngineTest.php +++ b/tests/phpunit/includes/search/SearchEngineTest.php @@ -124,20 +124,58 @@ class SearchEngineTest extends MediaWikiLangTestCase { "Plain search" ); } + public function testWildcardSearch() { + $res = $this->search->searchText( 'smith*' ); + $this->assertEquals( + [ 'Smithee' ], + $this->fetchIds( $res ), + "Search with wildcards" ); + + $res = $this->search->searchText( 'smithson*' ); + $this->assertEquals( + [], + $this->fetchIds( $res ), + "Search with wildcards must not find unrelated articles" ); + + $res = $this->search->searchText( 'smith* smithee' ); + $this->assertEquals( + [ 'Smithee' ], + $this->fetchIds( $res ), + "Search with wildcards can be combined with simple terms" ); + + $res = $this->search->searchText( 'smith* "one who smiths"' ); + $this->assertEquals( + [ 'Smithee' ], + $this->fetchIds( $res ), + "Search with wildcards can be combined with phrase search" ); + } + public function testPhraseSearch() { $res = $this->search->searchText( '"smithee is one who smiths"' ); $this->assertEquals( [ 'Smithee' ], $this->fetchIds( $res ), "Search a phrase" ); - $res = $this->search->searchText( '"smithee is one who smiths"' ); + + $res = $this->search->searchText( '"smithee is who smiths"' ); + $this->assertEquals( + [], + $this->fetchIds( $res ), + "Phrase search is not sloppy, search terms must be adjacent" ); + + $res = $this->search->searchText( '"is smithee one who smiths"' ); + $this->assertEquals( + [], + $this->fetchIds( $res ), + "Phrase search is ordered" ); + } + + public function testPhraseSearchHighlight() { + $phrase = "smithee is one who smiths"; + $res = $this->search->searchText( "\"$phrase\"" ); $match = $res->next(); - $terms = [ 'smithee', 'is', 'one', 'who', 'smiths' ]; - $snippet = ""; - foreach ( $terms as $term ) { - $snippet .= " " . $term . ""; - } - $this->assertRegexp( '/' . preg_quote( $snippet, '/' ) . '/', + $snippet = "A " . $phrase . ""; + $this->assertStringStartsWith( $snippet, $match->getTextSnippet( $res->termMatches() ), "Highlight a phrase search" ); } -- 2.20.1