'DoubleRedirectsPage' => __DIR__ . '/includes/specials/SpecialDoubleRedirects.php',
'DoubleReplacer' => __DIR__ . '/includes/libs/replacers/DoubleReplacer.php',
'DummyLinker' => __DIR__ . '/includes/DummyLinker.php',
+ 'DummySearchIndexFieldDefinition' => __DIR__ . '/includes/search/DummySearchIndexFieldDefinition.php',
'DummyTermColorer' => __DIR__ . '/maintenance/term/MWTerm.php',
'Dump7ZipOutput' => __DIR__ . '/includes/export/Dump7ZipOutput.php',
'DumpBZip2Output' => __DIR__ . '/includes/export/DumpBZip2Output.php',
'MediaWiki\\Logger\\NullSpi' => __DIR__ . '/includes/debug/logger/NullSpi.php',
'MediaWiki\\Logger\\Spi' => __DIR__ . '/includes/debug/logger/Spi.php',
'MediaWiki\\MediaWikiServices' => __DIR__ . '/includes/MediaWikiServices.php',
+ 'MediaWiki\\Search\\ParserOutputSearchDataExtractor' => __DIR__ . '/includes/search/ParserOutputSearchDataExtractor.php',
'MediaWiki\\Services\\CannotReplaceActiveServiceException' => __DIR__ . '/includes/Services/CannotReplaceActiveServiceException.php',
'MediaWiki\\Services\\ContainerDisabledException' => __DIR__ . '/includes/Services/ContainerDisabledException.php',
'MediaWiki\\Services\\DestructibleService' => __DIR__ . '/includes/Services/DestructibleService.php',
<?php
+
+use MediaWiki\Search\ParserOutputSearchDataExtractor;
+
/**
* Base class for content handling.
*
/**
* Get fields definition for search index
+ *
+ * @todo Expose title, redirect, namespace, text, source_text, text_bytes
+ * field mappings here. (see T142670 and T143409)
+ *
* @param SearchEngine $engine
* @return SearchIndexField[] List of fields this content handler can provide.
* @since 1.28
*/
public function getFieldsForSearchIndex( SearchEngine $engine ) {
- /* Default fields:
- /*
- * namespace
- * namespace_text
- * redirect
- * source_text
- * suggest
- * timestamp
- * title
- * text
- * text_bytes
- */
- return [];
+		// Categories: text field with the case-folding flag set.
+		$categoryField = $engine->makeSearchFieldMapping(
+			'category',
+			SearchIndexField::INDEX_TYPE_TEXT
+		);
+		$categoryField->setFlag( SearchIndexField::FLAG_CASEFOLD );
+
+		// External and outgoing links: keyword fields, no extra flags.
+		$externalLinkField = $engine->makeSearchFieldMapping(
+			'external_link',
+			SearchIndexField::INDEX_TYPE_KEYWORD
+		);
+		$outgoingLinkField = $engine->makeSearchFieldMapping(
+			'outgoing_link',
+			SearchIndexField::INDEX_TYPE_KEYWORD
+		);
+
+		// Templates: keyword field with the case-folding flag set.
+		$templateField = $engine->makeSearchFieldMapping(
+			'template',
+			SearchIndexField::INDEX_TYPE_KEYWORD
+		);
+		$templateField->setFlag( SearchIndexField::FLAG_CASEFOLD );
+
+		return [
+			'category' => $categoryField,
+			'external_link' => $externalLinkField,
+			'outgoing_link' => $outgoingLinkField,
+			'template' => $templateField,
+		];
}
/**
*/
public function getDataForSearchIndex( WikiPage $page, ParserOutput $output,
SearchEngine $engine ) {
- $fields = [];
+ $fieldData = [];
$content = $page->getContent();
+
if ( $content ) {
+ // These four fields are read from the parser output, and only
+ // when the page has content.
+ $searchDataExtractor = new ParserOutputSearchDataExtractor();
+
+ $fieldData['category'] = $searchDataExtractor->getCategories( $output );
+ $fieldData['external_link'] = $searchDataExtractor->getExternalLinks( $output );
+ $fieldData['outgoing_link'] = $searchDataExtractor->getOutgoingLinks( $output );
+ $fieldData['template'] = $searchDataExtractor->getTemplates( $output );
+
$text = $content->getTextForSearchIndex();
- $fields['text'] = $text;
- $fields['source_text'] = $text;
- $fields['text_bytes'] = $content->getSize();
+
+ // The same search text is stored under both 'text' and 'source_text'.
+ $fieldData['text'] = $text;
+ $fieldData['source_text'] = $text;
+ $fieldData['text_bytes'] = $content->getSize();
}
- Hooks::run( 'SearchDataForIndex', [ &$fields, $this, $page, $output, $engine ] );
- return $fields;
+
+ // $fieldData is handed over by reference, so SearchDataForIndex
+ // handlers can add to or change the collected data.
+ Hooks::run( 'SearchDataForIndex', [ &$fieldData, $this, $page, $output, $engine ] );
+ return $fieldData;
}
/**
$this->parserOutput = $parserOutput;
}
- /**
- * Get categories in the text.
- * @return string[]
- */
- public function categories() {
- $categories = [];
- foreach ( array_keys( $this->parserOutput->getCategories() ) as $key ) {
- $categories[] = Category::newFromName( $key )->getTitle()->getText();
- }
- return $categories;
- }
-
- /**
- * Get outgoing links.
- * @return string[]
- */
- public function outgoingLinks() {
- $outgoingLinks = [];
- foreach ( $this->parserOutput->getLinks() as $linkedNamespace => $namespaceLinks ) {
- foreach ( array_keys( $namespaceLinks ) as $linkedDbKey ) {
- $outgoingLinks[] =
- Title::makeTitle( $linkedNamespace, $linkedDbKey )->getPrefixedDBkey();
- }
- }
- return $outgoingLinks;
- }
-
- /**
- * Get templates in the text.
- * @return string[]
- */
- public function templates() {
- $templates = [];
- foreach ( $this->parserOutput->getTemplates() as $tNS => $templatesInNS ) {
- foreach ( array_keys( $templatesInNS ) as $tDbKey ) {
- $templateTitle = Title::makeTitleSafe( $tNS, $tDbKey );
- if ( $templateTitle && $templateTitle->exists() ) {
- $templates[] = $templateTitle->getPrefixedText();
- }
- }
- }
- return $templates;
- }
-
/**
* Get headings on the page.
* @return string[]
public function getFieldsForSearchIndex( SearchEngine $engine ) {
$fields = parent::getFieldsForSearchIndex( $engine );
- $fields['category'] =
- $engine->makeSearchFieldMapping( 'category', SearchIndexField::INDEX_TYPE_TEXT );
- $fields['category']->setFlag( SearchIndexField::FLAG_CASEFOLD );
-
- $fields['external_link'] =
- $engine->makeSearchFieldMapping( 'external_link', SearchIndexField::INDEX_TYPE_KEYWORD );
-
$fields['heading'] =
$engine->makeSearchFieldMapping( 'heading', SearchIndexField::INDEX_TYPE_TEXT );
$fields['heading']->setFlag( SearchIndexField::FLAG_SCORING );
$fields['opening_text']->setFlag( SearchIndexField::FLAG_SCORING |
SearchIndexField::FLAG_NO_HIGHLIGHT );
- $fields['outgoing_link'] =
- $engine->makeSearchFieldMapping( 'outgoing_link', SearchIndexField::INDEX_TYPE_KEYWORD );
-
- $fields['template'] =
- $engine->makeSearchFieldMapping( 'template', SearchIndexField::INDEX_TYPE_KEYWORD );
- $fields['template']->setFlag( SearchIndexField::FLAG_CASEFOLD );
-
// FIXME: this really belongs in separate file handler but files
// do not have separate handler. Sadness.
$fields['file_text'] =
$fields = parent::getDataForSearchIndex( $page, $parserOutput, $engine );
$structure = new WikiTextStructure( $parserOutput );
- $fields['external_link'] = array_keys( $parserOutput->getExternalLinks() );
- $fields['category'] = $structure->categories();
$fields['heading'] = $structure->headings();
- $fields['outgoing_link'] = $structure->outgoingLinks();
- $fields['template'] = $structure->templates();
// text fields
$fields['opening_text'] = $structure->getOpeningText();
$fields['text'] = $structure->getMainText(); // overwrites one from ContentHandler
--- /dev/null
+<?php
+
+/**
+ * Dummy implementation of SearchIndexFieldDefinition for testing purposes.
+ *
+ * @since 1.28
+ */
+class DummySearchIndexFieldDefinition extends SearchIndexFieldDefinition {
+
+	/**
+	 * Describe this field as a plain array holding its name, type, flags
+	 * and the mappings of its subfields.
+	 *
+	 * @param SearchEngine $engine Also handed on to every subfield.
+	 *
+	 * @return array
+	 */
+	public function getMapping( SearchEngine $engine ) {
+		$mapping = [
+			'name' => $this->name,
+			'type' => $this->type,
+			'flags' => $this->flags,
+			'subfields' => []
+		];
+
+		foreach ( $this->subfields as $subfield ) {
+			// getMapping() declares $engine as a required parameter, so
+			// it has to be passed along to the subfields as well.
+			$mapping['subfields'][] = $subfield->getMapping( $engine );
+		}
+
+		return $mapping;
+	}
+
+}
--- /dev/null
+<?php
+
+namespace MediaWiki\Search;
+
+use Category;
+use ParserOutput;
+use Title;
+
+/**
+ * Extracts data from ParserOutput for indexing in the search engine.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @since 1.28
+ */
+class ParserOutputSearchDataExtractor {
+
+	/**
+	 * Get a list of categories, as an array with title text strings.
+	 *
+	 * Names for which no Category object can be created are skipped.
+	 *
+	 * @param ParserOutput $parserOutput
+	 *
+	 * @return string[]
+	 */
+	public function getCategories( ParserOutput $parserOutput ) {
+		$categories = [];
+
+		foreach ( $parserOutput->getCategoryLinks() as $key ) {
+			// Category::newFromName() is documented as returning false for
+			// an invalid name; chaining getTitle() onto that would be fatal.
+			$category = Category::newFromName( $key );
+			if ( $category ) {
+				$categories[] = $category->getTitle()->getText();
+			}
+		}
+
+		return $categories;
+	}
+
+	/**
+	 * Get a list of external links from ParserOutput, as an array of strings.
+	 *
+	 * @param ParserOutput $parserOutput
+	 *
+	 * @return string[]
+	 */
+	public function getExternalLinks( ParserOutput $parserOutput ) {
+		return array_keys( $parserOutput->getExternalLinks() );
+	}
+
+	/**
+	 * Get a list of the outgoing wiki links recorded in
+	 * ParserOutput::getLinks(), as an array of prefixed DB key strings.
+	 *
+	 * @param ParserOutput $parserOutput
+	 *
+	 * @return string[]
+	 */
+	public function getOutgoingLinks( ParserOutput $parserOutput ) {
+		$outgoingLinks = [];
+
+		foreach ( $parserOutput->getLinks() as $linkedNamespace => $namespaceLinks ) {
+			foreach ( array_keys( $namespaceLinks ) as $linkedDbKey ) {
+				$outgoingLinks[] =
+					Title::makeTitle( $linkedNamespace, $linkedDbKey )->getPrefixedDBkey();
+			}
+		}
+
+		return $outgoingLinks;
+	}
+
+	/**
+	 * Get a list of templates used in the ParserOutput content, as prefixed title strings
+	 *
+	 * @param ParserOutput $parserOutput
+	 *
+	 * @return string[]
+	 */
+	public function getTemplates( ParserOutput $parserOutput ) {
+		$templates = [];
+
+		foreach ( $parserOutput->getTemplates() as $tNS => $templatesInNS ) {
+			foreach ( array_keys( $templatesInNS ) as $tDbKey ) {
+				$templateTitle = Title::makeTitle( $tNS, $tDbKey );
+				$templates[] = $templateTitle->getPrefixedText();
+			}
+		}
+
+		return $templates;
+	}
+
+}
/**
* Basic infrastructure of the field definition.
- * Specific engines will need to override it at least for getMapping,
- * but can reuse other parts.
+ *
+ * Specific engines should extend this class and, at the least,
+ * override the getMapping method, but can reuse other parts.
+ *
* @since 1.28
*/
abstract class SearchIndexFieldDefinition implements SearchIndexField {
$this->subfields = $subfields;
return $this;
}
+
+ /**
+ * Get the mapping of this field for the given search engine.
+ *
+ * Declared abstract, so every concrete field definition has to
+ * provide its own implementation.
+ *
+ * NOTE(review): the shape of the returned array is not fixed here;
+ * presumably it is specific to the engine - confirm.
+ *
+ * @param SearchEngine $engine
+ *
+ * @return array
+ */
+ abstract public function getMapping( SearchEngine $engine );
+
}
$this->assertInstanceOf( $handlerClass, $handler );
}
+	/**
+	 * The handler must offer mappings for the category, external_link,
+	 * outgoing_link and template fields.
+	 *
+	 * @covers ContentHandler::getFieldsForSearchIndex
+	 */
+	public function testGetFieldsForSearchIndex() {
+		$engine = $this->newSearchEngine();
+		$wikitextHandler = ContentHandler::getForModelID( CONTENT_MODEL_WIKITEXT );
+
+		$fields = $wikitextHandler->getFieldsForSearchIndex( $engine );
+
+		$expectedKeys = [ 'category', 'external_link', 'outgoing_link', 'template' ];
+		foreach ( $expectedKeys as $expectedKey ) {
+			$this->assertArrayHasKey( $expectedKey, $fields );
+		}
+	}
+
+	/**
+	 * Build a SearchEngine mock whose makeSearchFieldMapping() returns
+	 * DummySearchIndexFieldDefinition objects for the requested name and type.
+	 *
+	 * @return SearchEngine Mock object
+	 */
+	private function newSearchEngine() {
+		$makeDummyField = function( $name, $type ) {
+			return new DummySearchIndexFieldDefinition( $name, $type );
+		};
+
+		$engineMock = $this->getMockBuilder( 'SearchEngine' )->getMock();
+
+		$engineMock->expects( $this->any() )
+			->method( 'makeSearchFieldMapping' )
+			->will( $this->returnCallback( $makeDummyField ) );
+
+		return $engineMock;
+	}
+
/**
* @covers ContentHandler::getDataForSearchIndex
*/
$this->setTemporaryHook( 'SearchDataForIndex',
function ( &$fields, ContentHandler $handler, WikiPage $page, ParserOutput $output,
- SearchEngine $engine ) {
+ SearchEngine $engine ) {
$fields['testDataField'] = 'test content';
} );
return new WikiTextStructure( $this->getParserOutput( $text ) );
}
- public function testCategories() {
- $text = <<<END
-We also have a {{Template}} and an {{Another template}} in addition.
-This text also has [[Category:Some Category| ]] and then [[Category:Yet another category]].
-And [[Category:Some Category| this category]] is repeated.
-END;
- $struct = $this->getStructure( $text );
- $cats = $struct->categories();
- $this->assertCount( 2, $cats );
- $this->assertContains( "Some Category", $cats );
- $this->assertContains( "Yet another category", $cats );
- }
-
- public function testOutgoingLinks() {
- $text = <<<END
-Here I add link to [[Some Page]]. And [[Some Page|This same page]] gets linked twice.
-We also have [[File:Image.jpg|image]].
-We also have a {{Template}} and an {{Another template}} in addition.
-Some templates are {{lowercase}}.
-And [[Some_Page]] is linked again.
-It also has [[Category:Some Category| ]] and then [[Category:Yet another category]].
-Also link to a [[Talk:TestTitle|talk page]] is here.
-END;
- $struct = $this->getStructure( $text );
- $links = $struct->outgoingLinks();
- $this->assertContains( "Some_Page", $links );
- $this->assertContains( "Template:Template", $links );
- $this->assertContains( "Template:Another_template", $links );
- $this->assertContains( "Template:Lowercase", $links );
- $this->assertContains( "Talk:TestTitle", $links );
- $this->assertCount( 5, $links );
- }
-
- public function testTemplates() {
- $text = <<<END
-We have a {{Template}} and an {{Another template}} in addition.
-Some templates are {{lowercase}}. And this {{Template}} is repeated.
-Here is {{another_template|with=argument}}.
-This is a template that {{Xdoes not exist}}.
-END;
- $this->setTemporaryHook( 'TitleExists', function ( Title $title, &$exists ) {
- $txt = $title->getBaseText();
- if ( $txt[0] != 'X' ) {
- $exists = true;
- }
- return true;
- } );
- $struct = $this->getStructure( $text );
- $templates = $struct->templates();
- $this->assertCount( 3, $templates );
- $this->assertContains( "Template:Template", $templates );
- $this->assertContains( "Template:Another template", $templates );
- $this->assertContains( "Template:Lowercase", $templates );
- }
-
public function testHeadings() {
$text = <<<END
Some text here
--- /dev/null
+<?php
+
+use MediaWiki\Search\ParserOutputSearchDataExtractor;
+
+/**
+ * @group Search
+ * @covers MediaWiki\Search\ParserOutputSearchDataExtractor
+ */
+class ParserOutputSearchDataExtractorTest extends MediaWikiLangTestCase {
+
+	/**
+	 * Category DB keys come back as title text (underscores become spaces).
+	 */
+	public function testGetCategories() {
+		$parserOutput = new ParserOutput( '', [], [
+			'Foo_bar' => 'Bar',
+			'New_page' => ''
+		] );
+
+		$extractor = new ParserOutputSearchDataExtractor();
+		$actual = $extractor->getCategories( $parserOutput );
+
+		$this->assertEquals( [ 'Foo bar', 'New page' ], $actual );
+	}
+
+	/**
+	 * External links come back as the URLs that were added.
+	 */
+	public function testGetExternalLinks() {
+		$parserOutput = new ParserOutput();
+		$parserOutput->addExternalLink( 'https://foo' );
+		$parserOutput->addExternalLink( 'https://bar' );
+
+		$extractor = new ParserOutputSearchDataExtractor();
+		$actual = $extractor->getExternalLinks( $parserOutput );
+
+		$this->assertEquals( [ 'https://foo', 'https://bar' ], $actual );
+	}
+
+	/**
+	 * Outgoing links are indexed by their prefixed DB key.
+	 */
+	public function testGetOutgoingLinks() {
+		$parserOutput = new ParserOutput();
+		$parserOutput->addLink( Title::makeTitle( NS_MAIN, 'Foo_bar' ), 1 );
+		$parserOutput->addLink( Title::makeTitle( NS_HELP, 'Contents' ), 2 );
+
+		$extractor = new ParserOutputSearchDataExtractor();
+		$actual = $extractor->getOutgoingLinks( $parserOutput );
+
+		$this->assertEquals( [ 'Foo_bar', 'Help:Contents' ], $actual );
+	}
+
+	/**
+	 * Templates come back as prefixed title text.
+	 */
+	public function testGetTemplates() {
+		$templateTitle = Title::makeTitle( NS_TEMPLATE, 'Cite_news' );
+
+		$parserOutput = new ParserOutput();
+		$parserOutput->addTemplate( $templateTitle, 10, 100 );
+
+		$extractor = new ParserOutputSearchDataExtractor();
+		$actual = $extractor->getTemplates( $parserOutput );
+
+		$this->assertEquals( [ 'Template:Cite news' ], $actual );
+	}
+
+}