From 64ee3d32692b975249e51a8402bd8382e60031ac Mon Sep 17 00:00:00 2001 From: aude Date: Tue, 9 Aug 2016 13:22:09 -0400 Subject: [PATCH] Extract ParserOutput search index data fields from WikiTextContentHandler Bug: T142491 Change-Id: I69b010b893135e53fac7f16f4b927b8fbcba06d2 --- autoload.php | 2 + includes/content/ContentHandler.php | 67 ++++++++++---- includes/content/WikiTextStructure.php | 44 --------- includes/content/WikitextContentHandler.php | 18 ---- .../DummySearchIndexFieldDefinition.php | 30 ++++++ .../ParserOutputSearchDataExtractor.php | 92 +++++++++++++++++++ .../search/SearchIndexFieldDefinition.php | 14 ++- .../includes/content/ContentHandlerTest.php | 28 +++++- .../content/WikitextStructureTest.php | 55 ----------- .../ParserOutputSearchDataExtractorTest.php | 70 ++++++++++++++ 10 files changed, 281 insertions(+), 139 deletions(-) create mode 100644 includes/search/DummySearchIndexFieldDefinition.php create mode 100644 includes/search/ParserOutputSearchDataExtractor.php create mode 100644 tests/phpunit/includes/search/ParserOutputSearchDataExtractorTest.php diff --git a/autoload.php b/autoload.php index f6edd94b7d..f75c5aff8b 100644 --- a/autoload.php +++ b/autoload.php @@ -372,6 +372,7 @@ $wgAutoloadLocalClasses = [ 'DoubleRedirectsPage' => __DIR__ . '/includes/specials/SpecialDoubleRedirects.php', 'DoubleReplacer' => __DIR__ . '/includes/libs/replacers/DoubleReplacer.php', 'DummyLinker' => __DIR__ . '/includes/DummyLinker.php', + 'DummySearchIndexFieldDefinition' => __DIR__ . '/includes/search/DummySearchIndexFieldDefinition.php', 'DummyTermColorer' => __DIR__ . '/maintenance/term/MWTerm.php', 'Dump7ZipOutput' => __DIR__ . '/includes/export/Dump7ZipOutput.php', 'DumpBZip2Output' => __DIR__ . '/includes/export/DumpBZip2Output.php', @@ -856,6 +857,7 @@ $wgAutoloadLocalClasses = [ 'MediaWiki\\Logger\\NullSpi' => __DIR__ . '/includes/debug/logger/NullSpi.php', 'MediaWiki\\Logger\\Spi' => __DIR__ . '/includes/debug/logger/Spi.php', 'MediaWiki\\MediaWikiServices' => __DIR__ . '/includes/MediaWikiServices.php', + 'MediaWiki\\Search\\ParserOutputSearchDataExtractor' => __DIR__ . '/includes/search/ParserOutputSearchDataExtractor.php', 'MediaWiki\\Services\\CannotReplaceActiveServiceException' => __DIR__ . '/includes/Services/CannotReplaceActiveServiceException.php', 'MediaWiki\\Services\\ContainerDisabledException' => __DIR__ . '/includes/Services/ContainerDisabledException.php', 'MediaWiki\\Services\\DestructibleService' => __DIR__ . '/includes/Services/DestructibleService.php', diff --git a/includes/content/ContentHandler.php b/includes/content/ContentHandler.php index 7184980ede..3a75f50623 100644 --- a/includes/content/ContentHandler.php +++ b/includes/content/ContentHandler.php @@ -1,4 +1,7 @@ makeSearchFieldMapping( + 'category', + SearchIndexField::INDEX_TYPE_TEXT + ); + + $fields['category']->setFlag( SearchIndexField::FLAG_CASEFOLD ); + + $fields['external_link'] = $engine->makeSearchFieldMapping( + 'external_link', + SearchIndexField::INDEX_TYPE_KEYWORD + ); + + $fields['outgoing_link'] = $engine->makeSearchFieldMapping( + 'outgoing_link', + SearchIndexField::INDEX_TYPE_KEYWORD + ); + + $fields['template'] = $engine->makeSearchFieldMapping( + 'template', + SearchIndexField::INDEX_TYPE_KEYWORD + ); + + $fields['template']->setFlag( SearchIndexField::FLAG_CASEFOLD ); + + return $fields; } /** @@ -1298,16 +1317,26 @@ abstract class ContentHandler { */ public function getDataForSearchIndex( WikiPage $page, ParserOutput $output, SearchEngine $engine ) { - $fields = []; + $fieldData = []; $content = $page->getContent(); + if ( $content ) { + $searchDataExtractor = new ParserOutputSearchDataExtractor(); + + $fieldData['category'] = $searchDataExtractor->getCategories( $output ); + $fieldData['external_link'] = $searchDataExtractor->getExternalLinks( $output ); + $fieldData['outgoing_link'] = $searchDataExtractor->getOutgoingLinks( $output ); + $fieldData['template'] = $searchDataExtractor->getTemplates( $output ); + $text = $content->getTextForSearchIndex(); - $fields['text'] = $text; - $fields['source_text'] = $text; - $fields['text_bytes'] = $content->getSize(); + + $fieldData['text'] = $text; + $fieldData['source_text'] = $text; + $fieldData['text_bytes'] = $content->getSize(); } - Hooks::run( 'SearchDataForIndex', [ &$fields, $this, $page, $output, $engine ] ); - return $fields; + + Hooks::run( 'SearchDataForIndex', [ &$fieldData, $this, $page, $output, $engine ] ); + return $fieldData; } /** diff --git a/includes/content/WikiTextStructure.php b/includes/content/WikiTextStructure.php index e83c213c0e..9768d3666e 100644 --- a/includes/content/WikiTextStructure.php +++ b/includes/content/WikiTextStructure.php @@ -58,50 +58,6 @@ class WikiTextStructure { $this->parserOutput = $parserOutput; } - /** - * Get categories in the text. - * @return string[] - */ - public function categories() { - $categories = []; - foreach ( array_keys( $this->parserOutput->getCategories() ) as $key ) { - $categories[] = Category::newFromName( $key )->getTitle()->getText(); - } - return $categories; - } - - /** - * Get outgoing links. - * @return string[] - */ - public function outgoingLinks() { - $outgoingLinks = []; - foreach ( $this->parserOutput->getLinks() as $linkedNamespace => $namespaceLinks ) { - foreach ( array_keys( $namespaceLinks ) as $linkedDbKey ) { - $outgoingLinks[] = - Title::makeTitle( $linkedNamespace, $linkedDbKey )->getPrefixedDBkey(); - } - } - return $outgoingLinks; - } - - /** - * Get templates in the text. - * @return string[] - */ - public function templates() { - $templates = []; - foreach ( $this->parserOutput->getTemplates() as $tNS => $templatesInNS ) { - foreach ( array_keys( $templatesInNS ) as $tDbKey ) { - $templateTitle = Title::makeTitleSafe( $tNS, $tDbKey ); - if ( $templateTitle && $templateTitle->exists() ) { - $templates[] = $templateTitle->getPrefixedText(); - } - } - } - return $templates; - } - /** * Get headings on the page. * @return string[] diff --git a/includes/content/WikitextContentHandler.php b/includes/content/WikitextContentHandler.php index 9baf64331d..e51d24625a 100644 --- a/includes/content/WikitextContentHandler.php +++ b/includes/content/WikitextContentHandler.php @@ -111,13 +111,6 @@ class WikitextContentHandler extends TextContentHandler { public function getFieldsForSearchIndex( SearchEngine $engine ) { $fields = parent::getFieldsForSearchIndex( $engine ); - $fields['category'] = - $engine->makeSearchFieldMapping( 'category', SearchIndexField::INDEX_TYPE_TEXT ); - $fields['category']->setFlag( SearchIndexField::FLAG_CASEFOLD ); - - $fields['external_link'] = - $engine->makeSearchFieldMapping( 'external_link', SearchIndexField::INDEX_TYPE_KEYWORD ); - $fields['heading'] = $engine->makeSearchFieldMapping( 'heading', SearchIndexField::INDEX_TYPE_TEXT ); $fields['heading']->setFlag( SearchIndexField::FLAG_SCORING ); @@ -130,13 +123,6 @@ class WikitextContentHandler extends TextContentHandler { $fields['opening_text']->setFlag( SearchIndexField::FLAG_SCORING | SearchIndexField::FLAG_NO_HIGHLIGHT ); - $fields['outgoing_link'] = - $engine->makeSearchFieldMapping( 'outgoing_link', SearchIndexField::INDEX_TYPE_KEYWORD ); - - $fields['template'] = - $engine->makeSearchFieldMapping( 'template', SearchIndexField::INDEX_TYPE_KEYWORD ); - $fields['template']->setFlag( SearchIndexField::FLAG_CASEFOLD ); - // FIXME: this really belongs in separate file handler but files // do not have separate handler. Sadness. $fields['file_text'] = @@ -165,11 +151,7 @@ class WikitextContentHandler extends TextContentHandler { $fields = parent::getDataForSearchIndex( $page, $parserOutput, $engine ); $structure = new WikiTextStructure( $parserOutput ); - $fields['external_link'] = array_keys( $parserOutput->getExternalLinks() ); - $fields['category'] = $structure->categories(); $fields['heading'] = $structure->headings(); - $fields['outgoing_link'] = $structure->outgoingLinks(); - $fields['template'] = $structure->templates(); // text fields $fields['opening_text'] = $structure->getOpeningText(); $fields['text'] = $structure->getMainText(); // overwrites one from ContentHandler diff --git a/includes/search/DummySearchIndexFieldDefinition.php b/includes/search/DummySearchIndexFieldDefinition.php new file mode 100644 index 0000000000..a2a6760192 --- /dev/null +++ b/includes/search/DummySearchIndexFieldDefinition.php @@ -0,0 +1,30 @@ + $this->name, + 'type' => $this->type, + 'flags' => $this->flags, + 'subfields' => [] + ]; + + foreach ( $this->subfields as $subfield ) { + $mapping['subfields'][] = $subfield->getMapping(); + } + + return $mapping; + } + +} diff --git a/includes/search/ParserOutputSearchDataExtractor.php b/includes/search/ParserOutputSearchDataExtractor.php new file mode 100644 index 0000000000..df653f1240 --- /dev/null +++ b/includes/search/ParserOutputSearchDataExtractor.php @@ -0,0 +1,92 @@ +getCategoryLinks() as $key ) { + $categories[] = Category::newFromName( $key )->getTitle()->getText(); + } + + return $categories; + } + + /** + * Get a list of external links from ParserOutput, as an array of strings. + * + * @return string[] + */ + public function getExternalLinks( ParserOutput $parserOutput ) { + return array_keys( $parserOutput->getExternalLinks() ); + } + + /** + * Get a list of outgoing wiki links (including interwiki links), as + * an array of prefixed title strings. + * + * @return string[] + */ + public function getOutgoingLinks( ParserOutput $parserOutput ) { + $outgoingLinks = []; + + foreach ( $parserOutput->getLinks() as $linkedNamespace => $namespaceLinks ) { + foreach ( array_keys( $namespaceLinks ) as $linkedDbKey ) { + $outgoingLinks[] = + Title::makeTitle( $linkedNamespace, $linkedDbKey )->getPrefixedDBkey(); + } + } + + return $outgoingLinks; + } + + /** + * Get a list of templates used in the ParserOutput content, as prefixed title strings + * + * @return string[] + */ + public function getTemplates( ParserOutput $parserOutput ) { + $templates = []; + + foreach ( $parserOutput->getTemplates() as $tNS => $templatesInNS ) { + foreach ( array_keys( $templatesInNS ) as $tDbKey ) { + $templateTitle = Title::makeTitle( $tNS, $tDbKey ); + $templates[] = $templateTitle->getPrefixedText(); + } + } + + return $templates; + } + +} diff --git a/includes/search/SearchIndexFieldDefinition.php b/includes/search/SearchIndexFieldDefinition.php index 3a86c82d0d..8a06b65ed7 100644 --- a/includes/search/SearchIndexFieldDefinition.php +++ b/includes/search/SearchIndexFieldDefinition.php @@ -2,8 +2,10 @@ /** * Basic infrastructure of the field definition. - * Specific engines will need to override it at least for getMapping, - * but can reuse other parts. + * + * Specific engines should extend this class and at at least, + * override the getMapping method, but can reuse other parts. + * * @since 1.28 */ abstract class SearchIndexFieldDefinition implements SearchIndexField { @@ -115,4 +117,12 @@ abstract class SearchIndexFieldDefinition implements SearchIndexField { $this->subfields = $subfields; return $this; } + + /** + * @param SearchEngine $engine + * + * @return array + */ + abstract public function getMapping( SearchEngine $engine ); + } diff --git a/tests/phpunit/includes/content/ContentHandlerTest.php b/tests/phpunit/includes/content/ContentHandlerTest.php index bb9050fbd3..561f211fc4 100644 --- a/tests/phpunit/includes/content/ContentHandlerTest.php +++ b/tests/phpunit/includes/content/ContentHandlerTest.php @@ -415,6 +415,32 @@ class ContentHandlerTest extends MediaWikiTestCase { $this->assertInstanceOf( $handlerClass, $handler ); } + public function testGetFieldsForSearchIndex() { + $searchEngine = $this->newSearchEngine(); + + $handler = ContentHandler::getForModelID( CONTENT_MODEL_WIKITEXT ); + + $fields = $handler->getFieldsForSearchIndex( $searchEngine ); + + $this->assertArrayHasKey( 'category', $fields ); + $this->assertArrayHasKey( 'external_link', $fields ); + $this->assertArrayHasKey( 'outgoing_link', $fields ); + $this->assertArrayHasKey( 'template', $fields ); + } + + private function newSearchEngine() { + $searchEngine = $this->getMockBuilder( 'SearchEngine' ) + ->getMock(); + + $searchEngine->expects( $this->any() ) + ->method( 'makeSearchFieldMapping' ) + ->will( $this->returnCallback( function( $name, $type ) { + return new DummySearchIndexFieldDefinition( $name, $type ); + } ) ); + + return $searchEngine; + } + /** * @covers ContentHandler::getDataForSearchIndex */ @@ -425,7 +451,7 @@ class ContentHandlerTest extends MediaWikiTestCase { $this->setTemporaryHook( 'SearchDataForIndex', function ( &$fields, ContentHandler $handler, WikiPage $page, ParserOutput $output, - SearchEngine $engine ) { + SearchEngine $engine ) { $fields['testDataField'] = 'test content'; } ); diff --git a/tests/phpunit/includes/content/WikitextStructureTest.php b/tests/phpunit/includes/content/WikitextStructureTest.php index 6d83057b7e..4301fb8c35 100644 --- a/tests/phpunit/includes/content/WikitextStructureTest.php +++ b/tests/phpunit/includes/content/WikitextStructureTest.php @@ -25,61 +25,6 @@ class WikitextStructureTest extends MediaWikiLangTestCase { return new WikiTextStructure( $this->getParserOutput( $text ) ); } - public function testCategories() { - $text = <<getStructure( $text ); - $cats = $struct->categories(); - $this->assertCount( 2, $cats ); - $this->assertContains( "Some Category", $cats ); - $this->assertContains( "Yet another category", $cats ); - } - - public function testOutgoingLinks() { - $text = <<getStructure( $text ); - $links = $struct->outgoingLinks(); - $this->assertContains( "Some_Page", $links ); - $this->assertContains( "Template:Template", $links ); - $this->assertContains( "Template:Another_template", $links ); - $this->assertContains( "Template:Lowercase", $links ); - $this->assertContains( "Talk:TestTitle", $links ); - $this->assertCount( 5, $links ); - } - - public function testTemplates() { - $text = <<setTemporaryHook( 'TitleExists', function ( Title $title, &$exists ) { - $txt = $title->getBaseText(); - if ( $txt[0] != 'X' ) { - $exists = true; - } - return true; - } ); - $struct = $this->getStructure( $text ); - $templates = $struct->templates(); - $this->assertCount( 3, $templates ); - $this->assertContains( "Template:Template", $templates ); - $this->assertContains( "Template:Another template", $templates ); - $this->assertContains( "Template:Lowercase", $templates ); - } - public function testHeadings() { $text = << 'Bar', + 'New_page' => '' + ]; + + $parserOutput = new ParserOutput( '', [], $categories ); + + $searchDataExtractor = new ParserOutputSearchDataExtractor(); + + $this->assertEquals( + [ 'Foo bar', 'New page' ], + $searchDataExtractor->getCategories( $parserOutput ) + ); + } + + public function testGetExternalLinks() { + $parserOutput = new ParserOutput(); + + $parserOutput->addExternalLink( 'https://foo' ); + $parserOutput->addExternalLink( 'https://bar' ); + + $searchDataExtractor = new ParserOutputSearchDataExtractor(); + + $this->assertEquals( + [ 'https://foo', 'https://bar' ], + $searchDataExtractor->getExternalLinks( $parserOutput ) + ); + } + + public function testGetOutgoingLinks() { + $parserOutput = new ParserOutput(); + + $parserOutput->addLink( Title::makeTitle( NS_MAIN, 'Foo_bar' ), 1 ); + $parserOutput->addLink( Title::makeTitle( NS_HELP, 'Contents' ), 2 ); + + $searchDataExtractor = new ParserOutputSearchDataExtractor(); + + // this indexes links with db key + $this->assertEquals( + [ 'Foo_bar', 'Help:Contents' ], + $searchDataExtractor->getOutgoingLinks( $parserOutput ) + ); + } + + public function testGetTemplates() { + $title = Title::makeTitle( NS_TEMPLATE, 'Cite_news' ); + + $parserOutput = new ParserOutput(); + $parserOutput->addTemplate( $title, 10, 100 ); + + $searchDataExtractor = new ParserOutputSearchDataExtractor(); + + $this->assertEquals( + [ 'Template:Cite news' ], + $searchDataExtractor->getTemplates( $parserOutput ) + ); + } + +} -- 2.20.1