From 74c0e04c1aade5347c1482c7ea8da914430398e8 Mon Sep 17 00:00:00 2001 From: Chad Horohoe Date: Thu, 20 Jun 2013 18:50:31 -0400 Subject: [PATCH] Allow SearchUpdate hook to abort core update call Going to use this in CirrusSearch since the default text handling is insane for Solr. While we're at it, further move content handling to SearchEngine so children can override behavior here. Change-Id: I09d11b81c224d53609c57d75d54021e697b56629 --- docs/hooks.txt | 7 ++--- includes/search/SearchEngine.php | 21 +++++++++++---- includes/search/SearchUpdate.php | 45 ++++++++++++++------------------ 3 files changed, 39 insertions(+), 34 deletions(-) diff --git a/docs/hooks.txt b/docs/hooks.txt index e5444ce384..f2c10ce834 100644 --- a/docs/hooks.txt +++ b/docs/hooks.txt @@ -1949,11 +1949,12 @@ $data: the data stored in old_text. The meaning depends on $flags: if external $flags: a comma-delimited list of strings representing the options used. May include: utf8 (this will always be set for new revisions); gzip; external. -'SearchUpdate': Prior to search update completion. +'SearchUpdate': Prior to search update completion. Return false to stop any +further text/content processing $id : Page id -$namespace : Page namespace -$title : Page title +$title : Title object $text : Current text being indexed +$content : Content object for text being indexed. 'SearchGetNearMatchBefore': Perform exact-title-matches in "go" searches before the normal operations. diff --git a/includes/search/SearchEngine.php b/includes/search/SearchEngine.php index bbd888698e..4f4e31d277 100644 --- a/includes/search/SearchEngine.php +++ b/includes/search/SearchEngine.php @@ -523,6 +523,20 @@ class SearchEngine { return $wgCanonicalServer . wfScript( 'api' ) . '?action=opensearch&search={searchTerms}&namespace=' . $ns; } } + + /** + * Get the raw text for updating the index from a content object + * Nicer search backends could possibly do something cooler than + * just returning raw text + * + * @todo This isn't ideal, we'd really like to have content-specific handling here + * @param Title $t Title we're indexing + * @param Content $c Content of the page to index + * @return string + */ + public function getTextFromContent( Title $t, Content $c = null ) { + return $c ? $c->getTextForSearchIndex() : ''; + } } /** @@ -815,11 +829,8 @@ class SearchResult { protected function initText() { if ( !isset( $this->mText ) ) { if ( $this->mRevision != null ) { - //TODO: if we could plug in some code that knows about special content models *and* about - // special features of the search engine, the search could benefit. See similar - // comment in SearchUpdate's constructor - $content = $this->mRevision->getContent(); - $this->mText = $content ? $content->getTextForSearchIndex() : ''; + $this->mText = SearchEngine::create() + ->getTextFromContent( $this->mTitle, $this->mRevision->getContent() ); } else { // TODO: can we fetch raw wikitext for commons images? $this->mText = ''; } diff --git a/includes/search/SearchUpdate.php b/includes/search/SearchUpdate.php index 10dc0a09d3..714691706a 100644 --- a/includes/search/SearchUpdate.php +++ b/includes/search/SearchUpdate.php @@ -36,32 +36,27 @@ class SearchUpdate implements DeferrableUpdate { private $id = 0; /** - * Namespace of page being updated - * @var int - */ - private $namespace; - - /** - * Title we're updating (without namespace) - * @var string + * Title we're updating + * @var Title */ private $title; /** - * Raw text to put into the index + * Content of the page (not text) + * @var Content|false */ - private $text; + private $content; /** * Constructor * * @param int $id Page id to update * @param Title|string $title Title of page to update - * @param Content|string|false $content Content of the page to update. + * @param Content|string|false $c Content of the page to update. * If a Content object, text will be gotten from it. String is for back-compat. * Passing false tells the backend to just update the title, not the content */ - public function __construct( $id, $title, $content = false ) { + public function __construct( $id, $title, $c = false ) { if ( is_string( $title ) ) { $nt = Title::newFromText( $title ); } else { @@ -70,17 +65,13 @@ class SearchUpdate implements DeferrableUpdate { if ( $nt ) { $this->id = $id; - // @todo This isn't ideal, we'd really like to have content-specific - // handling here. See similar content in SearchEngine::initText(). - if( is_string( $content ) ) { - // b/c for ApprovedRevs - $this->text = $content; + // is_string() check is back-compat for ApprovedRevs + if( is_string( $c ) ) { + $this->content = new TextContent( $c ); } else { - $this->text = $content ? $content->getTextForSearchIndex() : false; + $this->content = $c ?: false; } - - $this->namespace = $nt->getNamespace(); - $this->title = $nt->getText(); # Discard namespace + $this->title = $nt; } else { wfDebug( "SearchUpdate object created with invalid title '$title'\n" ); } @@ -99,21 +90,23 @@ class SearchUpdate implements DeferrableUpdate { wfProfileIn( __METHOD__ ); $search = SearchEngine::create(); - $normalTitle = $search->normalizeText( Title::indexTitle( $this->namespace, $this->title ) ); + $normalTitle = $search->normalizeText( + Title::indexTitle( $this->title->getNamespace(), $this->title->getText() ) ); if ( WikiPage::newFromId( $this->id ) === null ) { $search->delete( $this->id, $normalTitle ); wfProfileOut( __METHOD__ ); return; - } elseif ( $this->text === false ) { + } elseif ( $this->content === false ) { $search->updateTitle( $this->id, $normalTitle ); wfProfileOut( __METHOD__ ); return; } - $text = self::updateText( $this->text ); - - wfRunHooks( 'SearchUpdate', array( $this->id, $this->namespace, $this->title, &$text ) ); + $text = $search->getTextFromContent( $this->title, $this->content ); + if( wfRunHooks( 'SearchUpdate', array( $this->id, $this->title, &$text, $this->content ) ) ) { + $text = self::updateText( $text ); + } # Perform the actual update $search->update( $this->id, $normalTitle, $search->normalizeText( $text ) ); -- 2.20.1