From 798cfc5ade4d7706173b3162adfca77bad560465 Mon Sep 17 00:00:00 2001 From: Max Semenik Date: Thu, 29 Oct 2009 16:19:35 +0000 Subject: [PATCH] Merge SQLite search from branches/sqlite/ to trunk --- config/Installer.php | 6 +- includes/AutoLoader.php | 2 + includes/db/DatabaseSqlite.php | 22 +- includes/search/SearchSqlite.php | 383 ++++++++++++++++++ maintenance/rebuildtextindex.php | 107 +++-- .../sqlite/archives/searchindex-fts3.sql | 18 + .../sqlite/archives/searchindex-no-fts.sql | 25 ++ maintenance/updaters.inc | 20 + 8 files changed, 534 insertions(+), 49 deletions(-) create mode 100644 includes/search/SearchSqlite.php create mode 100644 maintenance/sqlite/archives/searchindex-fts3.sql create mode 100644 maintenance/sqlite/archives/searchindex-no-fts.sql diff --git a/config/Installer.php b/config/Installer.php index 35cd63f781..90c293889b 100644 --- a/config/Installer.php +++ b/config/Installer.php @@ -1268,7 +1268,7 @@ if( $conf->posted && ( 0 == count( $errs ) ) ) { print " done.\n"; - if ($conf->DBtype == 'ibm_db2') { + if ( $conf->DBtype == 'ibm_db2' ) { // Now that table creation is done, make sure everything is committed // Do this before doing inserts through API if ($wgDatabase->lastError()) { @@ -1279,6 +1279,10 @@ if( $conf->posted && ( 0 == count( $errs ) ) ) { print "
  • MediaWiki tables successfully created
  • \n"; $wgDatabase->commit(); } + } elseif ( $conf->DBtype == 'sqlite' ) { + // Ensure proper searchindex format. We have to do that separately because + // if SQLite is compiled without the FTS3 module, table creation syntax will be invalid. + sqlite_setup_searchindex(); } print "
  • Initializing statistics...
  • \n"; diff --git a/includes/AutoLoader.php b/includes/AutoLoader.php index 6ca9ee4fb2..d79e87d23b 100644 --- a/includes/AutoLoader.php +++ b/includes/AutoLoader.php @@ -483,8 +483,10 @@ $wgAutoloadLocalClasses = array( 'SearchResult' => 'includes/search/SearchEngine.php', 'SearchResultSet' => 'includes/search/SearchEngine.php', 'SearchResultTooMany' => 'includes/search/SearchEngine.php', + 'SearchSqlite' => 'includes/search/SearchSqlite.php', 'SearchUpdate' => 'includes/search/SearchUpdate.php', 'SearchUpdateMyISAM' => 'includes/search/SearchUpdate.php', + 'SqliteSearchResultSet' => 'includes/search/SearchSqlite.php', # includes/specials 'SpecialAllmessages' => 'includes/specials/SpecialAllmessages.php', diff --git a/includes/db/DatabaseSqlite.php b/includes/db/DatabaseSqlite.php index 138e9ca25e..23a2473d98 100644 --- a/includes/db/DatabaseSqlite.php +++ b/includes/db/DatabaseSqlite.php @@ -96,6 +96,20 @@ class DatabaseSqlite extends DatabaseBase { return "$dir/$dbName.sqlite"; } + /** + * Returns version of currently supported SQLite fulltext search module or false if none present. + * @return String + */ + function getFulltextSearchModule() { + $table = 'dummy_search_test'; + $this->query( "DROP TABLE IF EXISTS $table", __METHOD__ ); + if ( $this->query( "CREATE VIRTUAL TABLE $table USING FTS3(dummy_field)", __METHOD__, true ) ) { + $this->query( "DROP TABLE IF EXISTS $table", __METHOD__ ); + return 'FTS3'; + } + return false; + } + /** * SQLite doesn't allow buffered results or data seeking etc, so we'll use fetchAll as the result */ @@ -319,15 +333,15 @@ class DatabaseSqlite extends DatabaseBase { } function wasDeadlock() { - return $this->lastErrno() == SQLITE_BUSY; + return $this->lastErrno() == 5; // SQLITE_BUSY } function wasErrorReissuable() { - return $this->lastErrno() == SQLITE_SCHEMA; + return $this->lastErrno() == 17; // SQLITE_SCHEMA; } function wasReadOnlyError() { - return $this->lastErrno() == SQLITE_READONLY; + return $this->lastErrno() == 8; // SQLITE_READONLY; } /** @@ -460,7 +474,7 @@ class DatabaseSqlite extends DatabaseBase { } public function getSearchEngine() { - return "SearchEngineDummy"; + return "SearchSqlite"; } /** diff --git a/includes/search/SearchSqlite.php b/includes/search/SearchSqlite.php new file mode 100644 index 0000000000..c49eb2628b --- /dev/null +++ b/includes/search/SearchSqlite.php @@ -0,0 +1,383 @@ +db = $db; + } + + /** + * Whether fulltext search is supported by current schema + * @return Boolean + */ + function fulltextSearchSupported() { + if ( self::$fulltextSupported === NULL ) { + $res = $this->db->selectField( 'updatelog', 'ul_key', array( 'ul_key' => 'fts3' ), __METHOD__ ); + self::$fulltextSupported = $res && $this->db->numRows( $res ) > 0; + } + wfDebug( "*************************************************************" . self::$fulltextSupported . "****************\n" ); + return self::$fulltextSupported; + } + + /** + * Parse the user's query and transform it into an SQL fragment which will + * become part of a WHERE clause + */ + function parseQuery( $filteredText, $fulltext ) { + global $wgContLang; + $lc = SearchEngine::legalSearchChars(); // Minus format chars + $searchon = ''; + $this->searchTerms = array(); + + # FIXME: This doesn't handle parenthetical expressions. + $m = array(); + if( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/', + $filteredText, $m, PREG_SET_ORDER ) ) { + foreach( $m as $bits ) { + @list( /* all */, $modifier, $term, $nonQuoted, $wildcard ) = $bits; + + if( $nonQuoted != '' ) { + $term = $nonQuoted; + $quote = ''; + } else { + $term = str_replace( '"', '', $term ); + $quote = '"'; + } + + if( $searchon !== '' ) $searchon .= ' '; + if( $this->strictMatching && ($modifier == '') ) { + // If we leave this out, boolean op defaults to OR which is rarely helpful. + $modifier = '+'; + } + + // Some languages such as Serbian store the input form in the search index, + // so we may need to search for matches in multiple writing system variants. + $convertedVariants = $wgContLang->autoConvertToAllVariants( $term ); + if( is_array( $convertedVariants ) ) { + $variants = array_unique( array_values( $convertedVariants ) ); + } else { + $variants = array( $term ); + } + + // The low-level search index does some processing on input to work + // around problems with minimum lengths and encoding in MySQL's + // fulltext engine. + // For Chinese this also inserts spaces between adjacent Han characters. + $strippedVariants = array_map( + array( $wgContLang, 'stripForSearch' ), + $variants ); + + // Some languages such as Chinese force all variants to a canonical + // form when stripping to the low-level search index, so to be sure + // let's check our variants list for unique items after stripping. + $strippedVariants = array_unique( $strippedVariants ); + + $searchon .= $modifier; + if( count( $strippedVariants) > 1 ) + $searchon .= '('; + foreach( $strippedVariants as $stripped ) { + if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) { + // Hack for Chinese: we need to toss in quotes for + // multiple-character phrases since stripForSearch() + // added spaces between them to make word breaks. + $stripped = '"' . trim( $stripped ) . '"'; + } + $searchon .= "$quote$stripped$quote$wildcard "; + } + if( count( $strippedVariants) > 1 ) + $searchon .= ')'; + + // Match individual terms or quoted phrase in result highlighting... + // Note that variants will be introduced in a later stage for highlighting! + $regexp = $this->regexTerm( $term, $wildcard ); + $this->searchTerms[] = $regexp; + } + wfDebug( __METHOD__ . ": Would search with '$searchon'\n" ); + wfDebug( __METHOD__ . ': Match with /' . implode( '|', $this->searchTerms ) . "/\n" ); + } else { + wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'\n" ); + } + + $searchon = $this->db->strencode( $searchon ); + $field = $this->getIndexField( $fulltext ); + return " $field MATCH '$searchon' "; + } + + function regexTerm( $string, $wildcard ) { + global $wgContLang; + + $regex = preg_quote( $string, '/' ); + if( $wgContLang->hasWordBreaks() ) { + if( $wildcard ) { + // Don't cut off the final bit! + $regex = "\b$regex"; + } else { + $regex = "\b$regex\b"; + } + } else { + // For Chinese, words may legitimately abut other words in the text literal. + // Don't add \b boundary checks... note this could cause false positives + // for latin chars. + } + return $regex; + } + + public static function legalSearchChars() { + return "\"*" . parent::legalSearchChars(); + } + + /** + * Perform a full text search query and return a result set. + * + * @param $term String: raw search term + * @return SqliteSearchResultSet + */ + function searchText( $term ) { + return $this->searchInternal( $term, true ); + } + + /** + * Perform a title-only search query and return a result set. + * + * @param $term String: raw search term + * @return SqliteSearchResultSet + */ + function searchTitle( $term ) { + return $this->searchInternal( $term, false ); + } + + protected function searchInternal( $term, $fulltext ) { + global $wgSearchMySQLTotalHits; + + if ( !$this->fulltextSearchSupported() ) { + return null; + } + + $filteredTerm = $this->filter( $term ); + $resultSet = $this->db->query( $this->getQuery( $filteredTerm, $fulltext ) ); + + $total = null; + if( $wgSearchMySQLTotalHits ) { + $totalResult = $this->db->query( $this->getCountQuery( $filteredTerm, $fulltext ) ); + $row = $totalResult->fetchObject(); + if( $row ) { + $total = intval( $row->c ); + } + $totalResult->free(); + } + + return new SqliteSearchResultSet( $resultSet, $this->searchTerms, $total ); + } + + + /** + * Return a partial WHERE clause to exclude redirects, if so set + * @return String + */ + function queryRedirect() { + if( $this->showRedirects ) { + return ''; + } else { + return 'AND page_is_redirect=0'; + } + } + + /** + * Return a partial WHERE clause to limit the search to the given namespaces + * @return String + */ + function queryNamespaces() { + if( is_null($this->namespaces) ) + return ''; # search all + if ( !count( $this->namespaces ) ) { + $namespaces = '0'; + } else { + $namespaces = $this->db->makeList( $this->namespaces ); + } + return 'AND page_namespace IN (' . $namespaces . ')'; + } + + /** + * Returns a query with limit for number of results set. + * @param $sql String: + * @return String + */ + function limitResult( $sql ) { + return $this->db->limitResult( $sql, $this->limit, $this->offset ); + } + + /** + * Does not do anything for generic search engine + * subclasses may define this though + * @return String + */ + function queryRanking( $filteredTerm, $fulltext ) { + return ''; + } + + /** + * Construct the full SQL query to do the search. + * The guts shoulds be constructed in queryMain() + * @param $filteredTerm String + * @param $fulltext Boolean + */ + function getQuery( $filteredTerm, $fulltext ) { + return $this->limitResult( + $this->queryMain( $filteredTerm, $fulltext ) . ' ' . + $this->queryRedirect() . ' ' . + $this->queryNamespaces() . ' ' . + $this->queryRanking( $filteredTerm, $fulltext ) + ); + } + + /** + * Picks which field to index on, depending on what type of query. + * @param $fulltext Boolean + * @return String + */ + function getIndexField( $fulltext ) { + return $fulltext ? 'si_text' : 'si_title'; + } + + /** + * Get the base part of the search query. + * + * @param $filteredTerm String + * @param $fulltext Boolean + * @return String + */ + function queryMain( $filteredTerm, $fulltext ) { + $match = $this->parseQuery( $filteredTerm, $fulltext ); + $page = $this->db->tableName( 'page' ); + $searchindex = $this->db->tableName( 'searchindex' ); + return "SELECT $searchindex.rowid, page_namespace, page_title " . + "FROM $page,$searchindex " . + "WHERE page_id=$searchindex.rowid AND $match"; + } + + function getCountQuery( $filteredTerm, $fulltext ) { + $match = $this->parseQuery( $filteredTerm, $fulltext ); + $page = $this->db->tableName( 'page' ); + $searchindex = $this->db->tableName( 'searchindex' ); + return "SELECT COUNT(*) AS c " . + "FROM $page,$searchindex " . + "WHERE page_id=$searchindex.rowid AND $match" . + $this->queryRedirect() . ' ' . + $this->queryNamespaces(); + } + + /** + * Create or update the search index record for the given page. + * Title and text should be pre-processed. + * + * @param $id Integer + * @param $title String + * @param $text String + */ + function update( $id, $title, $text ) { + if ( !$this->fulltextSearchSupported() ) { + return; + } + // @todo: find a method to do it in a single request, + // couldn't do it so far due to typelessness of FTS3 tables. + $dbw = wfGetDB( DB_MASTER ); + + $dbw->delete( 'searchindex', array( 'rowid' => $id ), __METHOD__ ); + + $dbw->insert( 'searchindex', + array( + 'rowid' => $id, + 'si_title' => $title, + 'si_text' => $text + ), __METHOD__ ); + } + + /** + * Update a search index record's title only. + * Title should be pre-processed. + * + * @param $id Integer + * @param $title String + */ + function updateTitle( $id, $title ) { + if ( !$this->fulltextSearchSupported() ) { + return; + } + $dbw = wfGetDB( DB_MASTER ); + + $dbw->update( 'searchindex', + array( 'rowid' => $id ), + array( 'si_title' => $title ), + __METHOD__ ); + } +} + +/** + * @ingroup Search + */ +class SqliteSearchResultSet extends SearchResultSet { + function SqliteSearchResultSet( $resultSet, $terms, $totalHits=null ) { + $this->mResultSet = $resultSet; + $this->mTerms = $terms; + $this->mTotalHits = $totalHits; + } + + function termMatches() { + return $this->mTerms; + } + + function numRows() { + return $this->mResultSet->numRows(); + } + + function next() { + $row = $this->mResultSet->fetchObject(); + if( $row === false ) { + return false; + } else { + return new SearchResult( $row ); + } + } + + function free() { + $this->mResultSet->free(); + } + + + function getTotalHits() { + return $this->mTotalHits; + } +} \ No newline at end of file diff --git a/maintenance/rebuildtextindex.php b/maintenance/rebuildtextindex.php index 4815e7a258..27ee24d566 100644 --- a/maintenance/rebuildtextindex.php +++ b/maintenance/rebuildtextindex.php @@ -1,9 +1,8 @@ error( "This script is only for MySQL.", true ); - } + global $wgTitle, $wgDBtype; + // Shouldn't be needed for Postgres + if ( $wgDBtype == 'postgres' ) { + $this->error( "This script does not work with PostgreSQL.\n", true ); + } + + $this->db = wfGetDB( DB_MASTER ); $wgTitle = Title::newFromText( "Rebuild text index script" ); - $this->dropTextIndex( $database ); - $this->doRebuildTextIndex( $database ); - $this->createTextIndex( $database ); + if ( $wgDBtype == 'mysql' ) { + $this->dropMysqlTextIndex(); + $this->populateSearchIndex(); + $this->createMysqlTextIndex(); + } else { + $this->clearSearchIndex(); + $this->populateSearchIndex(); + } $this->output( "Done.\n" ); } - - private function dropTextIndex( &$database ) { - $searchindex = $database->tableName( 'searchindex' ); - if ( $database->indexExists( "searchindex", "si_title" ) ) { - $this->output( "Dropping index...\n" ); - $sql = "ALTER TABLE $searchindex DROP INDEX si_title, DROP INDEX si_text"; - $database->query($sql, "dropTextIndex" ); - } - } - private function createTextIndex( &$database ) { - $searchindex = $database->tableName( 'searchindex' ); - $this->output( "\nRebuild the index...\n" ); - $sql = "ALTER TABLE $searchindex ADD FULLTEXT si_title (si_title), " . - "ADD FULLTEXT si_text (si_text)"; - $database->query($sql, "createTextIndex" ); - } - - private function doRebuildTextIndex( &$database ) { - list ($page, $revision, $text, $searchindex) = $database->tableNamesN( 'page', 'revision', 'text', 'searchindex' ); - - $sql = "SELECT MAX(page_id) AS count FROM $page"; - $res = $database->query($sql, "rebuildTextIndex" ); - $s = $database->fetchObject($res); + /** + * Populates the search index with content from all pages + */ + protected function populateSearchIndex() { + $res = $this->db->select( 'page', 'MAX(page_id) AS count' ); + $s = $this->db->fetchObject($res); $count = $s->count; $this->output( "Rebuilding index fields for {$count} pages...\n" ); $n = 0; @@ -88,22 +75,54 @@ class RebuildTextIndex extends Maintenance { while ( $n < $count ) { $this->output( $n . "\n" ); $end = $n + self::RTI_CHUNK_SIZE - 1; - $sql = "SELECT page_id, page_namespace, page_title, old_flags, old_text - FROM $page, $revision, $text - WHERE page_id BETWEEN $n AND $end - AND page_latest=rev_id - AND rev_text_id=old_id"; - $res = $database->query($sql, "rebuildTextIndex" ); + + $res = $this->db->select( array( 'page', 'revision', 'text' ), + array( 'page_id', 'page_namespace', 'page_title', 'old_flags', 'old_text' ), + array( "page_id BETWEEN $n AND $end", 'page_latest = rev_id', 'rev_text_id = old_id' ), + __METHOD__ + ); foreach( $res as $s ) { $revtext = Revision::getRevisionText( $s ); $u = new SearchUpdate( $s->page_id, $s->page_title, $revtext ); $u->doUpdate(); } - $database->freeResult( $res ); + $this->db->freeResult( $res ); $n += self::RTI_CHUNK_SIZE; } } + + /** + * (MySQL only) Drops fulltext index before populating the table. + */ + private function dropMysqlTextIndex() { + $searchindex = $this->db->tableName( 'searchindex' ); + if ( $this->db->indexExists( 'searchindex', 'si_title' ) ) { + $this->output( "Dropping index...\n" ); + $sql = "ALTER TABLE $searchindex DROP INDEX si_title, DROP INDEX si_text"; + $this->db->query($sql, __METHOD__ ); + } + } + + /** + * (MySQL only) Adds back fulltext index after populating the table. + */ + private function createMysqlTextIndex() { + $searchindex = $this->db->tableName( 'searchindex' ); + $this->output( "\nRebuild the index...\n" ); + $sql = "ALTER TABLE $searchindex ADD FULLTEXT si_title (si_title), " . + "ADD FULLTEXT si_text (si_text)"; + $this->db->query( $sql, __METHOD__ ); + } + + /** + * Deletes everything from search index. + */ + private function clearSearchIndex() { + $this->output( 'Clearing searchindex table...' ); + $this->db->delete( 'searchindex', '*', __METHOD__ ); + $this->output( "Done\n" ); + } } $maintClass = "RebuildTextIndex"; diff --git a/maintenance/sqlite/archives/searchindex-fts3.sql b/maintenance/sqlite/archives/searchindex-fts3.sql new file mode 100644 index 0000000000..3b1b279259 --- /dev/null +++ b/maintenance/sqlite/archives/searchindex-fts3.sql @@ -0,0 +1,18 @@ +-- Patch that introduces fulltext search capabilities to SQLite schema +-- Requires that SQLite must be compiled with FTS3 module (comes with core amalgamation). +-- See http://www.sqlite.org/cvstrac/wiki?p=FtsUsage for details of syntax. +-- Will fail if FTS3 is not present, +DROP TABLE IF EXISTS /*_*/searchindex; +CREATE VIRTUAL TABLE /*_*/searchindex USING FTS3( + -- Key to page_id + -- Disabled, instead we use the built-in rowid column + -- si_page INTEGER NOT NULL, + + -- Munged version of title + si_title, + + -- Munged version of body text + si_text +); + +INSERT INTO /*_*/updatelog VALUES ('fts3'); \ No newline at end of file diff --git a/maintenance/sqlite/archives/searchindex-no-fts.sql b/maintenance/sqlite/archives/searchindex-no-fts.sql new file mode 100644 index 0000000000..bc014b3d09 --- /dev/null +++ b/maintenance/sqlite/archives/searchindex-no-fts.sql @@ -0,0 +1,25 @@ +-- Searchindex table definition for cases when no full-text search SQLite module is present +-- (currently, only FTS3 is supported). +-- Use it if you are moving your database from environment with FTS support +-- to environment without it. + +DROP TABLE IF EXISTS /*_*/searchindex; + +-- These are pieces of FTS3-enabled searchindex +DROP TABLE IF EXISTS /*_*/searchindex_content; +DROP TABLE IF EXISTS /*_*/searchindex_segdir; +DROP TABLE IF EXISTS /*_*/searchindex_segments; + +CREATE TABLE /*_*/searchindex ( + -- Key to page_id + -- Disabled, instead we use the built-in rowid column + -- si_page INTEGER NOT NULL, + + -- Munged version of title + si_title TEXT, + + -- Munged version of body text + si_text TEXT +); + +DELETE FROM /*_*/updatelog WHERE ul_key='fts3'; \ No newline at end of file diff --git a/maintenance/updaters.inc b/maintenance/updaters.inc index 2fe3a53d60..40d9376b16 100644 --- a/maintenance/updaters.inc +++ b/maintenance/updaters.inc @@ -189,6 +189,9 @@ $wgUpdates = array( array( 'add_index', 'log_search', 'ls_field_val', 'patch-log_search-rename-index.sql' ), array( 'add_index', 'change_tag', 'change_tag_rc_tag', 'patch-change_tag-indexes.sql' ), array( 'add_field', 'redirect', 'rd_interwiki', 'patch-rd_interwiki.sql' ), + + // version-independent searchindex setup, added in 1.16 + array( 'sqlite_setup_searchindex' ), ), ); @@ -1281,6 +1284,23 @@ function sqlite_initial_indexes() { wfOut( "done\n" ); } +function sqlite_setup_searchindex() { + global $wgDatabase; + $module = $wgDatabase->getFulltextSearchModule(); + $fts3tTable = update_row_exists( 'fts3' ); + if ( $fts3tTable && !$module ) { + wfOut( '...PHP is missing FTS3 support, downgrading tables...' ); + $wgDatabase->sourceFile( archive( 'searchindex-no-fts.sql' ) ); + wfOut( "done\n" ); + } elseif ( !$fts3tTable && $module == 'FTS3' ) { + wfOut( '...adding FTS3 search capabilities...' ); + $wgDatabase->sourceFile( archive( 'searchindex-fts3.sql' ) ); + wfOut( "done\n" ); + } else { + wfOut( "...fulltext search table appears to be in order.\n" ); + } +} + function do_unique_pl_tl_il() { global $wgDatabase; $info = $wgDatabase->indexInfo( 'pagelinks', 'pl_namespace' ); -- 2.20.1