From: Chad Horohoe Date: Thu, 30 Jul 2009 22:45:38 +0000 (+0000) Subject: Move search related includes to includes/search/ X-Git-Tag: 1.31.0-rc.0~40624 X-Git-Url: http://git.cyclocoop.org/?a=commitdiff_plain;h=719644292735f1a4568ce9a3319fe082cf1c7477;hp=197d8f265ed96d3931f4e81ddba38b5e34dccd1e;p=lhc%2Fweb%2Fwiklou.git Move search related includes to includes/search/ --- diff --git a/includes/AutoLoader.php b/includes/AutoLoader.php index a5f067fca3..141125dbc5 100644 --- a/includes/AutoLoader.php +++ b/includes/AutoLoader.php @@ -151,10 +151,8 @@ $wgAutoloadLocalClasses = array( 'MimeMagic' => 'includes/MimeMagic.php', 'MWException' => 'includes/Exception.php', 'MWNamespace' => 'includes/Namespace.php', - 'MySQLSearchResultSet' => 'includes/SearchMySQL.php', 'Namespace' => 'includes/NamespaceCompat.php', // Compat 'OldChangesList' => 'includes/ChangesList.php', - 'OracleSearchResultSet' => 'includes/SearchOracle.php', 'OutputPage' => 'includes/OutputPage.php', 'PageHistory' => 'includes/PageHistory.php', 'PageHistoryPager' => 'includes/PageHistory.php', @@ -164,8 +162,6 @@ $wgAutoloadLocalClasses = array( 'PatrolLog' => 'includes/PatrolLog.php', 'PoolCounter' => 'includes/PoolCounter.php', 'PoolCounter_Stub' => 'includes/PoolCounter.php', - 'PostgresSearchResult' => 'includes/SearchPostgres.php', - 'PostgresSearchResultSet' => 'includes/SearchPostgres.php', 'Preferences' => 'includes/Preferences.php', 'PrefixSearch' => 'includes/PrefixSearch.php', 'Profiler' => 'includes/Profiler.php', @@ -188,18 +184,6 @@ $wgAutoloadLocalClasses = array( 'Revision' => 'includes/Revision.php', 'RSSFeed' => 'includes/Feed.php', 'Sanitizer' => 'includes/Sanitizer.php', - 'SearchEngineDummy' => 'includes/SearchEngine.php', - 'SearchEngine' => 'includes/SearchEngine.php', - 'SearchHighlighter' => 'includes/SearchEngine.php', - 'SearchMySQL4' => 'includes/SearchMySQL4.php', - 'SearchMySQL' => 'includes/SearchMySQL.php', - 'SearchOracle' => 'includes/SearchOracle.php', - 'SearchPostgres' 
=> 'includes/SearchPostgres.php', - 'SearchResult' => 'includes/SearchEngine.php', - 'SearchResultSet' => 'includes/SearchEngine.php', - 'SearchResultTooMany' => 'includes/SearchEngine.php', - 'SearchUpdate' => 'includes/SearchUpdate.php', - 'SearchUpdateMyISAM' => 'includes/SearchUpdate.php', 'SiteConfiguration' => 'includes/SiteConfiguration.php', 'SiteStats' => 'includes/SiteStats.php', 'SiteStatsUpdate' => 'includes/SiteStats.php', @@ -364,11 +348,8 @@ $wgAutoloadLocalClasses = array( 'PostgresField' => 'includes/db/DatabasePostgres.php', 'ResultWrapper' => 'includes/db/Database.php', 'SQLiteField' => 'includes/db/DatabaseSqlite.php', - 'DatabaseIbm_db2' => 'includes/db/DatabaseIbm_db2.php', 'IBM_DB2Field' => 'includes/db/DatabaseIbm_db2.php', - 'IBM_DB2SearchResultSet' => 'includes/SearchIBM_DB2.php', - 'SearchIBM_DB2' => 'includes/SearchIBM_DB2.php', # includes/diff 'AncestorComparator' => 'includes/diff/HTMLDiff.php', @@ -482,6 +463,26 @@ $wgAutoloadLocalClasses = array( 'StripState' => 'includes/parser/Parser.php', 'MWTidy' => 'includes/parser/Tidy.php', + # includes/search + 'IBM_DB2SearchResultSet' => 'includes/search/SearchIBM_DB2.php', + 'MySQLSearchResultSet' => 'includes/search/SearchMySQL.php', + 'OracleSearchResultSet' => 'includes/search/SearchOracle.php', + 'PostgresSearchResult' => 'includes/search/SearchPostgres.php', + 'PostgresSearchResultSet' => 'includes/search/SearchPostgres.php', + 'SearchEngineDummy' => 'includes/search/SearchEngine.php', + 'SearchEngine' => 'includes/search/SearchEngine.php', + 'SearchHighlighter' => 'includes/search/SearchEngine.php', + 'SearchIBM_DB2' => 'includes/search/SearchIBM_DB2.php', + 'SearchMySQL4' => 'includes/search/SearchMySQL4.php', + 'SearchMySQL' => 'includes/search/SearchMySQL.php', + 'SearchOracle' => 'includes/search/SearchOracle.php', + 'SearchPostgres' => 'includes/search/SearchPostgres.php', + 'SearchResult' => 'includes/search/SearchEngine.php', + 'SearchResultSet' => 
'includes/search/SearchEngine.php', + 'SearchResultTooMany' => 'includes/search/SearchEngine.php', + 'SearchUpdate' => 'includes/search/SearchUpdate.php', + 'SearchUpdateMyISAM' => 'includes/search/SearchUpdate.php', + # includes/specials 'SpecialAllmessages' => 'includes/specials/SpecialAllmessages.php', 'AncientPagesPage' => 'includes/specials/SpecialAncientpages.php', diff --git a/includes/SearchEngine.php b/includes/SearchEngine.php deleted file mode 100644 index aab14cbba1..0000000000 --- a/includes/SearchEngine.php +++ /dev/null @@ -1,1193 +0,0 @@ - test prefix:Main Page/Archive - */ - function transformSearchTerm( $term ) { - return $term; - } - - /** - * If an exact title match can be find, or a very slightly close match, - * return the title. If no match, returns NULL. - * - * @param $searchterm String - * @return Title - */ - public static function getNearMatch( $searchterm ) { - global $wgContLang, $wgSecondaryGoNamespaces; - - $allSearchTerms = array($searchterm); - - if($wgContLang->hasVariants()){ - $allSearchTerms = array_merge($allSearchTerms,$wgContLang->convertLinkToAllVariants($searchterm)); - } - - foreach($allSearchTerms as $term){ - - # Exact match? No need to look further. - $title = Title::newFromText( $term ); - if (is_null($title)) - return NULL; - - if ( $title->getNamespace() == NS_SPECIAL || $title->isExternal() || $title->exists() ) { - return $title; - } - - # See if it still otherwise has content is some sane sense - $article = MediaWiki::articleFromTitle( $title ); - if( $article->hasViewableContent() ) { - return $title; - } - - # If a match is not found in the main namespace look in secondary go namespaces. - if( $wgSecondaryGoNamespaces && $title->getNamespace() == NS_MAIN ) { - foreach( $wgSecondaryGoNamespaces as $ns ) { - $title = Title::newFromText( $term, $ns ); - if( $title && $title->exists() ) return $title; - } - } - - # Now try all lower case (i.e. 
first letter capitalized) - # - $title = Title::newFromText( $wgContLang->lc( $term ) ); - if ( $title && $title->exists() ) { - return $title; - } - - # Now try capitalized string - # - $title = Title::newFromText( $wgContLang->ucwords( $term ) ); - if ( $title && $title->exists() ) { - return $title; - } - - # Now try all upper case - # - $title = Title::newFromText( $wgContLang->uc( $term ) ); - if ( $title && $title->exists() ) { - return $title; - } - - # Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc - $title = Title::newFromText( $wgContLang->ucwordbreaks($term) ); - if ( $title && $title->exists() ) { - return $title; - } - - // Give hooks a chance at better match variants - $title = null; - if( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) { - return $title; - } - } - - $title = Title::newFromText( $searchterm ); - - # Entering an IP address goes to the contributions page - if ( ( $title->getNamespace() == NS_USER && User::isIP($title->getText() ) ) - || User::isIP( trim( $searchterm ) ) ) { - return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() ); - } - - - # Entering a user goes to the user page whether it's there or not - if ( $title->getNamespace() == NS_USER ) { - return $title; - } - - # Go to images that exist even if there's no local page. - # There may have been a funny upload, or it may be on a shared - # file repository such as Wikimedia Commons. - if( $title->getNamespace() == NS_FILE ) { - $image = wfFindFile( $title ); - if( $image ) { - return $title; - } - } - - # MediaWiki namespace? Page may be "implied" if not customized. - # Just return it, with caps forced as the message system likes it. - if( $title->getNamespace() == NS_MEDIAWIKI ) { - return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) ); - } - - # Quoted term? Try without the quotes... 
- $matches = array(); - if( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) { - return SearchEngine::getNearMatch( $matches[1] ); - } - - return NULL; - } - - public static function legalSearchChars() { - return "A-Za-z_'.0-9\\x80-\\xFF\\-"; - } - - /** - * Set the maximum number of results to return - * and how many to skip before returning the first. - * - * @param $limit Integer - * @param $offset Integer - */ - function setLimitOffset( $limit, $offset = 0 ) { - $this->limit = intval( $limit ); - $this->offset = intval( $offset ); - } - - /** - * Set which namespaces the search should include. - * Give an array of namespace index numbers. - * - * @param $namespaces Array - */ - function setNamespaces( $namespaces ) { - $this->namespaces = $namespaces; - } - - /** - * Parse some common prefixes: all (search everything) - * or namespace names - * - * @param $query String - */ - function replacePrefixes( $query ){ - global $wgContLang; - - if( strpos($query,':') === false ) - return $query; // nothing to do - - $parsed = $query; - $allkeyword = wfMsgForContent('searchall').":"; - if( strncmp($query, $allkeyword, strlen($allkeyword)) == 0 ){ - $this->namespaces = null; - $parsed = substr($query,strlen($allkeyword)); - } else if( strpos($query,':') !== false ) { - $prefix = substr($query,0,strpos($query,':')); - $index = $wgContLang->getNsIndex($prefix); - if($index !== false){ - $this->namespaces = array($index); - $parsed = substr($query,strlen($prefix)+1); - } - } - if(trim($parsed) == '') - return $query; // prefix was the whole query - - return $parsed; - } - - /** - * Make a list of searchable namespaces and their canonical names. 
- * @return Array - */ - public static function searchableNamespaces() { - global $wgContLang; - $arr = array(); - foreach( $wgContLang->getNamespaces() as $ns => $name ) { - if( $ns >= NS_MAIN ) { - $arr[$ns] = $name; - } - } - return $arr; - } - - /** - * Extract default namespaces to search from the given user's - * settings, returning a list of index numbers. - * - * @param $user User - * @return Array - */ - public static function userNamespaces( $user ) { - global $wgSearchEverythingOnlyLoggedIn; - - // get search everything preference, that can be set to be read for logged-in users - $searcheverything = false; - if( ( $wgSearchEverythingOnlyLoggedIn && $user->isLoggedIn() ) - || !$wgSearchEverythingOnlyLoggedIn ) - $searcheverything = $user->getOption('searcheverything'); - - // searcheverything overrides other options - if( $searcheverything ) - return array_keys(SearchEngine::searchableNamespaces()); - - $arr = Preferences::loadOldSearchNs( $user ); - $searchableNamespaces = SearchEngine::searchableNamespaces(); - - $arr = array_intersect( $arr, array_keys($searchableNamespaces) ); // Filter - - return $arr; - } - - /** - * Find snippet highlight settings for a given user - * - * @param $user User - * @return Array contextlines, contextchars - */ - public static function userHighlightPrefs( &$user ){ - //$contextlines = $user->getOption( 'contextlines', 5 ); - //$contextchars = $user->getOption( 'contextchars', 50 ); - $contextlines = 2; // Hardcode this. Old defaults sucked. :) - $contextchars = 75; // same as above.... 
:P - return array($contextlines, $contextchars); - } - - /** - * An array of namespaces indexes to be searched by default - * - * @return Array - */ - public static function defaultNamespaces(){ - global $wgNamespacesToBeSearchedDefault; - - return array_keys($wgNamespacesToBeSearchedDefault, true); - } - - /** - * Get a list of namespace names useful for showing in tooltips - * and preferences - * - * @param $namespaces Array - */ - public static function namespacesAsText( $namespaces ){ - global $wgContLang; - - $formatted = array_map( array($wgContLang,'getFormattedNsText'), $namespaces ); - foreach( $formatted as $key => $ns ){ - if ( empty($ns) ) - $formatted[$key] = wfMsg( 'blanknamespace' ); - } - return $formatted; - } - - /** - * Return the help namespaces to be shown on Special:Search - * - * @return Array - */ - public static function helpNamespaces() { - global $wgNamespacesToBeSearchedHelp; - - return array_keys( $wgNamespacesToBeSearchedHelp, true ); - } - - /** - * Return a 'cleaned up' search string - * - * @param $text String - * @return String - */ - function filter( $text ) { - $lc = $this->legalSearchChars(); - return trim( preg_replace( "/[^{$lc}]/", " ", $text ) ); - } - /** - * Load up the appropriate search engine class for the currently - * active database backend, and return a configured instance. - * - * @return SearchEngine - */ - public static function create() { - global $wgSearchType; - $dbr = wfGetDB( DB_SLAVE ); - if( $wgSearchType ) { - $class = $wgSearchType; - } else { - $class = $dbr->getSearchEngine(); - } - $search = new $class( $dbr ); - $search->setLimitOffset(0,0); - return $search; - } - - /** - * Create or update the search index record for the given page. - * Title and text should be pre-processed. - * STUB - * - * @param $id Integer - * @param $title String - * @param $text String - */ - function update( $id, $title, $text ) { - // no-op - } - - /** - * Update a search index record's title only. 
- * Title should be pre-processed. - * STUB - * - * @param $id Integer - * @param $title String - */ - function updateTitle( $id, $title ) { - // no-op - } - - /** - * Get OpenSearch suggestion template - * - * @return String - */ - public static function getOpenSearchTemplate() { - global $wgOpenSearchTemplate, $wgServer, $wgScriptPath; - if( $wgOpenSearchTemplate ) { - return $wgOpenSearchTemplate; - } else { - $ns = implode( '|', SearchEngine::defaultNamespaces() ); - if( !$ns ) $ns = "0"; - return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace='.$ns; - } - } - - /** - * Get internal MediaWiki Suggest template - * - * @return String - */ - public static function getMWSuggestTemplate() { - global $wgMWSuggestTemplate, $wgServer, $wgScriptPath; - if($wgMWSuggestTemplate) - return $wgMWSuggestTemplate; - else - return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}&suggest'; - } -} - -/** - * @ingroup Search - */ -class SearchResultSet { - /** - * Fetch an array of regular expression fragments for matching - * the search terms as parsed by this engine in a text extract. - * STUB - * - * @return Array - */ - function termMatches() { - return array(); - } - - function numRows() { - return 0; - } - - /** - * Return true if results are included in this result set. - * STUB - * - * @return Boolean - */ - function hasResults() { - return false; - } - - /** - * Some search modes return a total hit count for the query - * in the entire article database. This may include pages - * in namespaces that would not be matched on the given - * settings. - * - * Return null if no total hits number is supported. - * - * @return Integer - */ - function getTotalHits() { - return null; - } - - /** - * Some search modes return a suggested alternate term if there are - * no exact hits. Returns true if there is one on this set. 
- * - * @return Boolean - */ - function hasSuggestion() { - return false; - } - - /** - * @return String: suggested query, null if none - */ - function getSuggestionQuery(){ - return null; - } - - /** - * @return String: HTML highlighted suggested query, '' if none - */ - function getSuggestionSnippet(){ - return ''; - } - - /** - * Return information about how and from where the results were fetched, - * should be useful for diagnostics and debugging - * - * @return String - */ - function getInfo() { - return null; - } - - /** - * Return a result set of hits on other (multiple) wikis associated with this one - * - * @return SearchResultSet - */ - function getInterwikiResults() { - return null; - } - - /** - * Check if there are results on other wikis - * - * @return Boolean - */ - function hasInterwikiResults() { - return $this->getInterwikiResults() != null; - } - - - /** - * Fetches next search result, or false. - * STUB - * - * @return SearchResult - */ - function next() { - return false; - } - - /** - * Frees the result set, if applicable. - */ - function free() { - // ... - } -} - - -/** - * @ingroup Search - */ -class SearchResultTooMany { - ## Some search engines may bail out if too many matches are found -} - - -/** - * @todo Fixme: This class is horribly factored. It would probably be better to - * have a useful base class to which you pass some standard information, then - * let the fancy self-highlighters extend that. 
- * @ingroup Search - */ -class SearchResult { - var $mRevision = null; - var $mImage = null; - - function __construct( $row ) { - $this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title ); - if( !is_null($this->mTitle) ){ - $this->mRevision = Revision::newFromTitle( $this->mTitle ); - if( $this->mTitle->getNamespace() === NS_FILE ) - $this->mImage = wfFindFile( $this->mTitle ); - } - } - - /** - * Check if this is result points to an invalid title - * - * @return Boolean - */ - function isBrokenTitle(){ - if( is_null($this->mTitle) ) - return true; - return false; - } - - /** - * Check if target page is missing, happens when index is out of date - * - * @return Boolean - */ - function isMissingRevision(){ - return !$this->mRevision && !$this->mImage; - } - - /** - * @return Title - */ - function getTitle() { - return $this->mTitle; - } - - /** - * @return Double or null if not supported - */ - function getScore() { - return null; - } - - /** - * Lazy initialization of article text from DB - */ - protected function initText(){ - if( !isset($this->mText) ){ - if($this->mRevision != null) - $this->mText = $this->mRevision->getText(); - else // TODO: can we fetch raw wikitext for commons images? 
- $this->mText = ''; - - } - } - - /** - * @param $terms Array: terms to highlight - * @return String: highlighted text snippet, null (and not '') if not supported - */ - function getTextSnippet($terms){ - global $wgUser, $wgAdvancedSearchHighlighting; - $this->initText(); - list($contextlines,$contextchars) = SearchEngine::userHighlightPrefs($wgUser); - $h = new SearchHighlighter(); - if( $wgAdvancedSearchHighlighting ) - return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars ); - else - return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars ); - } - - /** - * @param $terms Array: terms to highlight - * @return String: highlighted title, '' if not supported - */ - function getTitleSnippet($terms){ - return ''; - } - - /** - * @param $terms Array: terms to highlight - * @return String: highlighted redirect name (redirect to this page), '' if none or not supported - */ - function getRedirectSnippet($terms){ - return ''; - } - - /** - * @return Title object for the redirect to this page, null if none or not supported - */ - function getRedirectTitle(){ - return null; - } - - /** - * @return string highlighted relevant section name, null if none or not supported - */ - function getSectionSnippet(){ - return ''; - } - - /** - * @return Title object (pagename+fragment) for the section, null if none or not supported - */ - function getSectionTitle(){ - return null; - } - - /** - * @return String: timestamp - */ - function getTimestamp(){ - if( $this->mRevision ) - return $this->mRevision->getTimestamp(); - else if( $this->mImage ) - return $this->mImage->getTimestamp(); - return ''; - } - - /** - * @return Integer: number of words - */ - function getWordCount(){ - $this->initText(); - return str_word_count( $this->mText ); - } - - /** - * @return Integer: size in bytes - */ - function getByteSize(){ - $this->initText(); - return strlen( $this->mText ); - } - - /** - * @return Boolean if hit has related articles - */ - 
function hasRelated(){ - return false; - } - - /** - * @return String: interwiki prefix of the title (return iw even if title is broken) - */ - function getInterwikiPrefix(){ - return ''; - } -} - -/** - * Highlight bits of wikitext - * - * @ingroup Search - */ -class SearchHighlighter { - var $mCleanWikitext = true; - - function SearchHighlighter($cleanupWikitext = true){ - $this->mCleanWikitext = $cleanupWikitext; - } - - /** - * Default implementation of wikitext highlighting - * - * @param $text String - * @param $terms Array: terms to highlight (unescaped) - * @param $contextlines Integer - * @param $contextchars Integer - * @return String - */ - public function highlightText( $text, $terms, $contextlines, $contextchars ) { - global $wgLang, $wgContLang; - global $wgSearchHighlightBoundaries; - $fname = __METHOD__; - - if($text == '') - return ''; - - // spli text into text + templates/links/tables - $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)"; - // first capture group is for detecting nested templates/links/tables/references - $endPatterns = array( - 1 => '/(\{\{)|(\}\})/', // template - 2 => '/(\[\[)|(\]\])/', // image - 3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table - - // FIXME: this should prolly be a hook or something - if(function_exists('wfCite')){ - $spat .= '|()'; // references via cite extension - $endPatterns[4] = '/()|(<\/ref>)/'; - } - $spat .= '/'; - $textExt = array(); // text extracts - $otherExt = array(); // other extracts - wfProfileIn( "$fname-split" ); - $start = 0; - $textLen = strlen($text); - $count = 0; // sequence number to maintain ordering - while( $start < $textLen ){ - // find start of template/image/table - if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){ - $epat = ''; - foreach($matches as $key => $val){ - if($key > 0 && $val[1] != -1){ - if($key == 2){ - // see if this is an image link - $ns = substr($val[0],2,-1); - if( $wgContLang->getNsIndex($ns) != NS_FILE ) - break; - - } - $epat = 
$endPatterns[$key]; - $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) ); - $start = $val[1]; - break; - } - } - if( $epat ){ - // find end (and detect any nested elements) - $level = 0; - $offset = $start + 1; - $found = false; - while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){ - if( array_key_exists(2,$endMatches) ){ - // found end - if($level == 0){ - $len = strlen($endMatches[2][0]); - $off = $endMatches[2][1]; - $this->splitAndAdd( $otherExt, $count, - substr( $text, $start, $off + $len - $start ) ); - $start = $off + $len; - $found = true; - break; - } else{ - // end of nested element - $level -= 1; - } - } else{ - // nested - $level += 1; - } - $offset = $endMatches[0][1] + strlen($endMatches[0][0]); - } - if( ! $found ){ - // couldn't find appropriate closing tag, skip - $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) ); - $start += strlen($matches[0][0]); - } - continue; - } - } - // else: add as text extract - $this->splitAndAdd( $textExt, $count, substr($text,$start) ); - break; - } - - $all = $textExt + $otherExt; // these have disjunct key sets - - wfProfileOut( "$fname-split" ); - - // prepare regexps - foreach( $terms as $index => $term ) { - // manually do upper/lowercase stuff for utf-8 since PHP won't do it - if(preg_match('/[\x80-\xff]/', $term) ){ - $terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]); - } else { - $terms[$index] = $term; - } - } - $anyterm = implode( '|', $terms ); - $phrase = implode("$wgSearchHighlightBoundaries+", $terms ); - - // FIXME: a hack to scale contextchars, a correct solution - // would be to have contextchars actually be char and not byte - // length, and do proper utf-8 substrings and lengths everywhere, - // but PHP is making that very hard and unclean to implement :( - $scale = strlen($anyterm) / mb_strlen($anyterm); - $contextchars = intval( $contextchars * $scale ); - - 
$patPre = "(^|$wgSearchHighlightBoundaries)"; - $patPost = "($wgSearchHighlightBoundaries|$)"; - - $pat1 = "/(".$phrase.")/ui"; - $pat2 = "/$patPre(".$anyterm.")$patPost/ui"; - - wfProfileIn( "$fname-extract" ); - - $left = $contextlines; - - $snippets = array(); - $offsets = array(); - - // show beginning only if it contains all words - $first = 0; - $firstText = ''; - foreach($textExt as $index => $line){ - if(strlen($line)>0 && $line[0] != ';' && $line[0] != ':'){ - $firstText = $this->extract( $line, 0, $contextchars * $contextlines ); - $first = $index; - break; - } - } - if( $firstText ){ - $succ = true; - // check if first text contains all terms - foreach($terms as $term){ - if( ! preg_match("/$patPre".$term."$patPost/ui", $firstText) ){ - $succ = false; - break; - } - } - if( $succ ){ - $snippets[$first] = $firstText; - $offsets[$first] = 0; - } - } - if( ! $snippets ) { - // match whole query on text - $this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets); - // match whole query on templates/tables/images - $this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets); - // match any words on text - $this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets); - // match any words on templates/tables/images - $this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets); - - ksort($snippets); - } - - // add extra chars to each snippet to make snippets constant size - $extended = array(); - if( count( $snippets ) == 0){ - // couldn't find the target words, just show beginning of article - $targetchars = $contextchars * $contextlines; - $snippets[$first] = ''; - $offsets[$first] = 0; - } else{ - // if begin of the article contains the whole phrase, show only that !! 
- if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first]) - && $offsets[$first] < $contextchars * 2 ){ - $snippets = array ($first => $snippets[$first]); - } - - // calc by how much to extend existing snippets - $targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) ); - } - - foreach($snippets as $index => $line){ - $extended[$index] = $line; - $len = strlen($line); - if( $len < $targetchars - 20 ){ - // complete this line - if($len < strlen( $all[$index] )){ - $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]); - $len = strlen( $extended[$index] ); - } - - // add more lines - $add = $index + 1; - while( $len < $targetchars - 20 - && array_key_exists($add,$all) - && !array_key_exists($add,$snippets) ){ - $offsets[$add] = 0; - $tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] ); - $extended[$add] = $tt; - $len += strlen( $tt ); - $add++; - } - } - } - - //$snippets = array_map('htmlspecialchars', $extended); - $snippets = $extended; - $last = -1; - $extract = ''; - foreach($snippets as $index => $line){ - if($last == -1) - $extract .= $line; // first line - elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last])) - $extract .= " ".$line; // continous lines - else - $extract .= ' ... ' . $line; - - $last = $index; - } - if( $extract ) - $extract .= ' ... '; - - $processed = array(); - foreach($terms as $term){ - if( ! 
isset($processed[$term]) ){ - $pat3 = "/$patPre(".$term.")$patPost/ui"; // highlight word - $extract = preg_replace( $pat3, - "\\1\\2\\3", $extract ); - $processed[$term] = true; - } - } - - wfProfileOut( "$fname-extract" ); - - return $extract; - } - - /** - * Split text into lines and add it to extracts array - * - * @param $extracts Array: index -> $line - * @param $count Integer - * @param $text String - */ - function splitAndAdd(&$extracts, &$count, $text){ - $split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text ); - foreach($split as $line){ - $tt = trim($line); - if( $tt ) - $extracts[$count++] = $tt; - } - } - - /** - * Do manual case conversion for non-ascii chars - * - * @param $matches Array - */ - function caseCallback($matches){ - global $wgContLang; - if( strlen($matches[0]) > 1 ){ - return '['.$wgContLang->lc($matches[0]).$wgContLang->uc($matches[0]).']'; - } else - return $matches[0]; - } - - /** - * Extract part of the text from start to end, but by - * not chopping up words - * @param $text String - * @param $start Integer - * @param $end Integer - * @param $posStart Integer: (out) actual start position - * @param $posEnd Integer: (out) actual end position - * @return String - */ - function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){ - global $wgContLang; - - if( $start != 0) - $start = $this->position( $text, $start, 1 ); - if( $end >= strlen($text) ) - $end = strlen($text); - else - $end = $this->position( $text, $end ); - - if(!is_null($posStart)) - $posStart = $start; - if(!is_null($posEnd)) - $posEnd = $end; - - if($end > $start) - return substr($text, $start, $end-$start); - else - return ''; - } - - /** - * Find a nonletter near a point (index) in the text - * - * @param $text String - * @param $point Integer - * @param $offset Integer: offset to found index - * @return Integer: nearest nonletter index, or beginning of utf8 char if none - */ - function position($text, $point, $offset=0 ){ - 
$tolerance = 10; - $s = max( 0, $point - $tolerance ); - $l = min( strlen($text), $point + $tolerance ) - $s; - $m = array(); - if( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){ - return $m[0][1] + $s + $offset; - } else{ - // check if point is on a valid first UTF8 char - $char = ord( $text[$point] ); - while( $char >= 0x80 && $char < 0xc0 ) { - // skip trailing bytes - $point++; - if($point >= strlen($text)) - return strlen($text); - $char = ord( $text[$point] ); - } - return $point; - - } - } - - /** - * Search extracts for a pattern, and return snippets - * - * @param $pattern String: regexp for matching lines - * @param $extracts Array: extracts to search - * @param $linesleft Integer: number of extracts to make - * @param $contextchars Integer: length of snippet - * @param $out Array: map for highlighted snippets - * @param $offsets Array: map of starting points of snippets - * @protected - */ - function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){ - if($linesleft == 0) - return; // nothing to do - foreach($extracts as $index => $line){ - if( array_key_exists($index,$out) ) - continue; // this line already highlighted - - $m = array(); - if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) - continue; - - $offset = $m[0][1]; - $len = strlen($m[0][0]); - if($offset + $len < $contextchars) - $begin = 0; - elseif( $len > $contextchars) - $begin = $offset; - else - $begin = $offset + intval( ($len - $contextchars) / 2 ); - - $end = $begin + $contextchars; - - $posBegin = $begin; - // basic snippet from this line - $out[$index] = $this->extract($line,$begin,$end,$posBegin); - $offsets[$index] = $posBegin; - $linesleft--; - if($linesleft == 0) - return; - } - } - - /** - * Basic wikitext removal - * @protected - */ - function removeWiki($text) { - $fname = __METHOD__; - wfProfileIn( $fname ); - - //$text = preg_replace("/'{2,5}/", "", $text); - //$text = 
preg_replace("/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text); - //$text = preg_replace("/\[\[([^]|]+)\]\]/", "\\1", $text); - //$text = preg_replace("/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text); - //$text = preg_replace("/\\{\\|(.*?)\\|\\}/", "", $text); - //$text = preg_replace("/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text); - $text = preg_replace("/\\{\\{([^|]+?)\\}\\}/", "", $text); - $text = preg_replace("/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text); - $text = preg_replace("/\\[\\[([^|]+?)\\]\\]/", "\\1", $text); - $text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text); - //$text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text); - $text = preg_replace("/<\/?[^>]+>/", "", $text); - $text = preg_replace("/'''''/", "", $text); - $text = preg_replace("/('''|<\/?[iIuUbB]>)/", "", $text); - $text = preg_replace("/''/", "", $text); - - wfProfileOut( $fname ); - return $text; - } - - /** - * callback to replace [[target|caption]] kind of links, if - * the target is category or image, leave it - * - * @param $matches Array - */ - function linkReplace($matches){ - $colon = strpos( $matches[1], ':' ); - if( $colon === false ) - return $matches[2]; // replace with caption - global $wgContLang; - $ns = substr( $matches[1], 0, $colon ); - $index = $wgContLang->getNsIndex($ns); - if( $index !== false && ($index == NS_FILE || $index == NS_CATEGORY) ) - return $matches[0]; // return the whole thing - else - return $matches[2]; - - } - - /** - * Simple & fast snippet extraction, but gives completely unrelevant - * snippets - * - * @param $text String - * @param $terms Array - * @param $contextlines Integer - * @param $contextchars Integer - * @return String - */ - public function highlightSimple( $text, $terms, $contextlines, $contextchars ) { - global $wgLang, $wgContLang; - $fname = __METHOD__; - - $lines = explode( "\n", $text ); - - $terms = implode( '|', $terms ); - $max = intval( $contextchars ) + 1; - $pat1 = 
"/(.*)($terms)(.{0,$max})/i"; - - $lineno = 0; - - $extract = ""; - wfProfileIn( "$fname-extract" ); - foreach ( $lines as $line ) { - if ( 0 == $contextlines ) { - break; - } - ++$lineno; - $m = array(); - if ( ! preg_match( $pat1, $line, $m ) ) { - continue; - } - --$contextlines; - $pre = $wgContLang->truncate( $m[1], -$contextchars ); - - if ( count( $m ) < 3 ) { - $post = ''; - } else { - $post = $wgContLang->truncate( $m[3], $contextchars ); - } - - $found = $m[2]; - - $line = htmlspecialchars( $pre . $found . $post ); - $pat2 = '/(' . $terms . ")/i"; - $line = preg_replace( $pat2, - "\\1", $line ); - - $extract .= "${line}\n"; - } - wfProfileOut( "$fname-extract" ); - - return $extract; - } - -} - -/** - * Dummy class to be used when non-supported Database engine is present. - * @todo Fixme: dummy class should probably try something at least mildly useful, - * such as a LIKE search through titles. - * @ingroup Search - */ -class SearchEngineDummy extends SearchEngine { - // no-op -} diff --git a/includes/SearchIBM_DB2.php b/includes/SearchIBM_DB2.php deleted file mode 100644 index b94a478236..0000000000 --- a/includes/SearchIBM_DB2.php +++ /dev/null @@ -1,249 +0,0 @@ - -# http://www.mediawiki.org/ -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
-# http://www.gnu.org/copyleft/gpl.html - -/** - * @file - * @ingroup Search - */ - -/** - * Search engine hook base class for IBM DB2 - * @ingroup Search - */ -class SearchIBM_DB2 extends SearchEngine { - function __construct($db) { - $this->db = $db; - } - - /** - * Perform a full text search query and return a result set. - * - * @param $term String: raw search term - * @return IBM_DB2SearchResultSet - */ - function searchText( $term ) { - $resultSet = $this->db->resultObject($this->db->query($this->getQuery($this->filter($term), true))); - return new IBM_DB2SearchResultSet($resultSet, $this->searchTerms); - } - - /** - * Perform a title-only search query and return a result set. - * - * @param $term String: taw search term - * @return IBM_DB2SearchResultSet - */ - function searchTitle($term) { - $resultSet = $this->db->resultObject($this->db->query($this->getQuery($this->filter($term), false))); - return new MySQLSearchResultSet($resultSet, $this->searchTerms); - } - - - /** - * Return a partial WHERE clause to exclude redirects, if so set - * @return String - */ - function queryRedirect() { - if ($this->showRedirects) { - return ''; - } else { - return 'AND page_is_redirect=0'; - } - } - - /** - * Return a partial WHERE clause to limit the search to the given namespaces - * @return String - */ - function queryNamespaces() { - if( is_null($this->namespaces) ) - return ''; - $namespaces = implode(',', $this->namespaces); - if ($namespaces == '') { - $namespaces = '0'; - } - return 'AND page_namespace IN (' . $namespaces . ')'; - } - - /** - * Return a LIMIT clause to limit results on the query. 
- * @return String - */ - function queryLimit($sql) { - return $this->db->limitResult($sql, $this->limit, $this->offset); - } - - /** - * Does not do anything for generic search engine - * subclasses may define this though - * @return String - */ - function queryRanking($filteredTerm, $fulltext) { - // requires Net Search Extender or equivalent - // return ' ORDER BY score(1)'; - return ''; - } - - /** - * Construct the full SQL query to do the search. - * The guts shoulds be constructed in queryMain() - * @param string $filteredTerm String - * @param bool $fulltext Boolean - */ - function getQuery( $filteredTerm, $fulltext ) { - return $this->queryLimit($this->queryMain($filteredTerm, $fulltext) . ' ' . - $this->queryRedirect() . ' ' . - $this->queryNamespaces() . ' ' . - $this->queryRanking( $filteredTerm, $fulltext ) . ' '); - } - - - /** - * Picks which field to index on, depending on what type of query. - * @param $fulltext Boolean - * @return String - */ - function getIndexField($fulltext) { - return $fulltext ? 'si_text' : 'si_title'; - } - - /** - * Get the base part of the search query. - * - * @param string $filteredTerm String - * @param bool $fulltext Boolean - * @return String - */ - function queryMain( $filteredTerm, $fulltext ) { - $match = $this->parseQuery($filteredTerm, $fulltext); - $page = $this->db->tableName('page'); - $searchindex = $this->db->tableName('searchindex'); - return 'SELECT page_id, page_namespace, page_title ' . - "FROM $page,$searchindex " . - 'WHERE page_id=si_page AND ' . $match; - } - - /** @todo document */ - function parseQuery($filteredText, $fulltext) { - global $wgContLang; - $lc = SearchEngine::legalSearchChars(); - $this->searchTerms = array(); - - # FIXME: This doesn't handle parenthetical expressions. - $m = array(); - $q = array(); - - if (preg_match_all('/([-+<>~]?)(([' . $lc . 
']+)(\*?)|"[^"]*")/', - $filteredText, $m, PREG_SET_ORDER)) { - foreach($m as $terms) { - - // Search terms in all variant forms, only - // apply on wiki with LanguageConverter - $temp_terms = $wgContLang->autoConvertToAllVariants( $terms[2] ); - if( is_array( $temp_terms )) { - $temp_terms = array_unique( array_values( $temp_terms )); - foreach( $temp_terms as $t ) - $q[] = $terms[1] . $wgContLang->stripForSearch( $t ); - } - else - $q[] = $terms[1] . $wgContLang->stripForSearch( $terms[2] ); - - if (!empty($terms[3])) { - $regexp = preg_quote( $terms[3], '/' ); - if ($terms[4]) - $regexp .= "[0-9A-Za-z_]+"; - } else { - $regexp = preg_quote(str_replace('"', '', $terms[2]), '/'); - } - $this->searchTerms[] = $regexp; - } - } - - $searchon = $this->db->strencode(join(',', $q)); - $field = $this->getIndexField($fulltext); - - // requires Net Search Extender or equivalent - //return " CONTAINS($field, '$searchon') > 0 "; - - return " lcase($field) LIKE lcase('%$searchon%')"; - } - - /** - * Create or update the search index record for the given page. - * Title and text should be pre-processed. - * - * @param $id Integer - * @param $title String - * @param $text String - */ - function update($id, $title, $text) { - $dbw = wfGetDB(DB_MASTER); - $dbw->replace('searchindex', - array('si_page'), - array( - 'si_page' => $id, - 'si_title' => $title, - 'si_text' => $text - ), 'SearchIBM_DB2::update' ); - // ? - //$dbw->query("CALL ctx_ddl.sync_index('si_text_idx')"); - //$dbw->query("CALL ctx_ddl.sync_index('si_title_idx')"); - } - - /** - * Update a search index record's title only. - * Title should be pre-processed. 
- * - * @param $id Integer - * @param $title String - */ - function updateTitle($id, $title) { - $dbw = wfGetDB(DB_MASTER); - - $dbw->update('searchindex', - array('si_title' => $title), - array('si_page' => $id), - 'SearchIBM_DB2::updateTitle', - array()); - } -} - -/** - * @ingroup Search - */ -class IBM_DB2SearchResultSet extends SearchResultSet { - function __construct($resultSet, $terms) { - $this->mResultSet = $resultSet; - $this->mTerms = $terms; - } - - function termMatches() { - return $this->mTerms; - } - - function numRows() { - return $this->mResultSet->numRows(); - } - - function next() { - $row = $this->mResultSet->fetchObject(); - if ($row === false) - return false; - return new SearchResult($row); - } -} diff --git a/includes/SearchMySQL.php b/includes/SearchMySQL.php deleted file mode 100644 index a0ec92711a..0000000000 --- a/includes/SearchMySQL.php +++ /dev/null @@ -1,353 +0,0 @@ - -# http://www.mediawiki.org/ -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
-# http://www.gnu.org/copyleft/gpl.html - -/** - * @file - * @ingroup Search - */ - -/** - * Search engine hook for MySQL 4+ - * @ingroup Search - */ -class SearchMySQL extends SearchEngine { - var $strictMatching = true; - - /** @todo document */ - function __construct( $db ) { - $this->db = $db; - } - - /** - * Parse the user's query and transform it into an SQL fragment which will - * become part of a WHERE clause - */ - function parseQuery( $filteredText, $fulltext ) { - global $wgContLang; - $lc = SearchEngine::legalSearchChars(); // Minus format chars - $searchon = ''; - $this->searchTerms = array(); - - # FIXME: This doesn't handle parenthetical expressions. - $m = array(); - if( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/', - $filteredText, $m, PREG_SET_ORDER ) ) { - foreach( $m as $bits ) { - @list( /* all */, $modifier, $term, $nonQuoted, $wildcard ) = $bits; - - if( $nonQuoted != '' ) { - $term = $nonQuoted; - $quote = ''; - } else { - $term = str_replace( '"', '', $term ); - $quote = '"'; - } - - if( $searchon !== '' ) $searchon .= ' '; - if( $this->strictMatching && ($modifier == '') ) { - // If we leave this out, boolean op defaults to OR which is rarely helpful. - $modifier = '+'; - } - - // Some languages such as Serbian store the input form in the search index, - // so we may need to search for matches in multiple writing system variants. - $convertedVariants = $wgContLang->autoConvertToAllVariants( $term ); - if( is_array( $convertedVariants ) ) { - $variants = array_unique( array_values( $convertedVariants ) ); - } else { - $variants = array( $term ); - } - - // The low-level search index does some processing on input to work - // around problems with minimum lengths and encoding in MySQL's - // fulltext engine. - // For Chinese this also inserts spaces between adjacent Han characters. 
- $strippedVariants = array_map( - array( $wgContLang, 'stripForSearch' ), - $variants ); - - // Some languages such as Chinese force all variants to a canonical - // form when stripping to the low-level search index, so to be sure - // let's check our variants list for unique items after stripping. - $strippedVariants = array_unique( $strippedVariants ); - - $searchon .= $modifier; - if( count( $strippedVariants) > 1 ) - $searchon .= '('; - foreach( $strippedVariants as $stripped ) { - if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) { - // Hack for Chinese: we need to toss in quotes for - // multiple-character phrases since stripForSearch() - // added spaces between them to make word breaks. - $stripped = '"' . trim( $stripped ) . '"'; - } - $searchon .= "$quote$stripped$quote$wildcard "; - } - if( count( $strippedVariants) > 1 ) - $searchon .= ')'; - - // Match individual terms or quoted phrase in result highlighting... - // Note that variants will be introduced in a later stage for highlighting! - $regexp = $this->regexTerm( $term, $wildcard ); - $this->searchTerms[] = $regexp; - } - wfDebug( __METHOD__ . ": Would search with '$searchon'\n" ); - wfDebug( __METHOD__ . ': Match with /' . implode( '|', $this->searchTerms ) . "/\n" ); - } else { - wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'\n" ); - } - - $searchon = $this->db->strencode( $searchon ); - $field = $this->getIndexField( $fulltext ); - return " MATCH($field) AGAINST('$searchon' IN BOOLEAN MODE) "; - } - - function regexTerm( $string, $wildcard ) { - global $wgContLang; - - $regex = preg_quote( $string, '/' ); - if( $wgContLang->hasWordBreaks() ) { - if( $wildcard ) { - // Don't cut off the final bit! - $regex = "\b$regex"; - } else { - $regex = "\b$regex\b"; - } - } else { - // For Chinese, words may legitimately abut other words in the text literal. - // Don't add \b boundary checks... note this could cause false positives - // for latin chars. 
- } - return $regex; - } - - public static function legalSearchChars() { - return "\"*" . parent::legalSearchChars(); - } - - /** - * Perform a full text search query and return a result set. - * - * @param $term String: raw search term - * @return MySQLSearchResultSet - */ - function searchText( $term ) { - return $this->searchInternal( $term, true ); - } - - /** - * Perform a title-only search query and return a result set. - * - * @param $term String: raw search term - * @return MySQLSearchResultSet - */ - function searchTitle( $term ) { - return $this->searchInternal( $term, false ); - } - - protected function searchInternal( $term, $fulltext ) { - global $wgSearchMySQLTotalHits; - - $filteredTerm = $this->filter( $term ); - $resultSet = $this->db->query( $this->getQuery( $filteredTerm, $fulltext ) ); - - $total = null; - if( $wgSearchMySQLTotalHits ) { - $totalResult = $this->db->query( $this->getCountQuery( $filteredTerm, $fulltext ) ); - $row = $totalResult->fetchObject(); - if( $row ) { - $total = intval( $row->c ); - } - $totalResult->free(); - } - - return new MySQLSearchResultSet( $resultSet, $this->searchTerms, $total ); - } - - - /** - * Return a partial WHERE clause to exclude redirects, if so set - * @return String - */ - function queryRedirect() { - if( $this->showRedirects ) { - return ''; - } else { - return 'AND page_is_redirect=0'; - } - } - - /** - * Return a partial WHERE clause to limit the search to the given namespaces - * @return String - */ - function queryNamespaces() { - if( is_null($this->namespaces) ) - return ''; # search all - if ( !count( $this->namespaces ) ) { - $namespaces = '0'; - } else { - $namespaces = $this->db->makeList( $this->namespaces ); - } - return 'AND page_namespace IN (' . $namespaces . ')'; - } - - /** - * Return a LIMIT clause to limit results on the query. 
- * @return String - */ - function queryLimit() { - return $this->db->limitResult( '', $this->limit, $this->offset ); - } - - /** - * Does not do anything for generic search engine - * subclasses may define this though - * @return String - */ - function queryRanking( $filteredTerm, $fulltext ) { - return ''; - } - - /** - * Construct the full SQL query to do the search. - * The guts shoulds be constructed in queryMain() - * @param $filteredTerm String - * @param $fulltext Boolean - */ - function getQuery( $filteredTerm, $fulltext ) { - return $this->queryMain( $filteredTerm, $fulltext ) . ' ' . - $this->queryRedirect() . ' ' . - $this->queryNamespaces() . ' ' . - $this->queryRanking( $filteredTerm, $fulltext ) . ' ' . - $this->queryLimit(); - } - - /** - * Picks which field to index on, depending on what type of query. - * @param $fulltext Boolean - * @return String - */ - function getIndexField( $fulltext ) { - return $fulltext ? 'si_text' : 'si_title'; - } - - /** - * Get the base part of the search query. - * The actual match syntax will depend on the server - * version; MySQL 3 and MySQL 4 have different capabilities - * in their fulltext search indexes. - * - * @param $filteredTerm String - * @param $fulltext Boolean - * @return String - */ - function queryMain( $filteredTerm, $fulltext ) { - $match = $this->parseQuery( $filteredTerm, $fulltext ); - $page = $this->db->tableName( 'page' ); - $searchindex = $this->db->tableName( 'searchindex' ); - return 'SELECT page_id, page_namespace, page_title ' . - "FROM $page,$searchindex " . - 'WHERE page_id=si_page AND ' . $match; - } - - function getCountQuery( $filteredTerm, $fulltext ) { - $match = $this->parseQuery( $filteredTerm, $fulltext ); - $page = $this->db->tableName( 'page' ); - $searchindex = $this->db->tableName( 'searchindex' ); - return "SELECT COUNT(*) AS c " . - "FROM $page,$searchindex " . - 'WHERE page_id=si_page AND ' . $match . - $this->queryRedirect() . ' ' . 
- $this->queryNamespaces(); - } - - /** - * Create or update the search index record for the given page. - * Title and text should be pre-processed. - * - * @param $id Integer - * @param $title String - * @param $text String - */ - function update( $id, $title, $text ) { - $dbw = wfGetDB( DB_MASTER ); - $dbw->replace( 'searchindex', - array( 'si_page' ), - array( - 'si_page' => $id, - 'si_title' => $title, - 'si_text' => $text - ), __METHOD__ ); - } - - /** - * Update a search index record's title only. - * Title should be pre-processed. - * - * @param $id Integer - * @param $title String - */ - function updateTitle( $id, $title ) { - $dbw = wfGetDB( DB_MASTER ); - - $dbw->update( 'searchindex', - array( 'si_title' => $title ), - array( 'si_page' => $id ), - __METHOD__, - array( $dbw->lowPriorityOption() ) ); - } -} - -/** - * @ingroup Search - */ -class MySQLSearchResultSet extends SearchResultSet { - function MySQLSearchResultSet( $resultSet, $terms, $totalHits=null ) { - $this->mResultSet = $resultSet; - $this->mTerms = $terms; - $this->mTotalHits = $totalHits; - } - - function termMatches() { - return $this->mTerms; - } - - function numRows() { - return $this->mResultSet->numRows(); - } - - function next() { - $row = $this->mResultSet->fetchObject(); - if( $row === false ) { - return false; - } else { - return new SearchResult( $row ); - } - } - - function free() { - $this->mResultSet->free(); - } - - - function getTotalHits() { - return $this->mTotalHits; - } -} \ No newline at end of file diff --git a/includes/SearchMySQL4.php b/includes/SearchMySQL4.php deleted file mode 100644 index 3e2bb2d1dd..0000000000 --- a/includes/SearchMySQL4.php +++ /dev/null @@ -1,34 +0,0 @@ - -# http://www.mediawiki.org/ -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -# http://www.gnu.org/copyleft/gpl.html - -/** - * @file - * @ingroup Search - */ - -/** - * Search engine hook for MySQL 4+ - * This class retained for backwards compatibility... - * The meat's been moved to SearchMySQL, since the 3.x variety is gone. - * @ingroup Search - * @deprecated - */ -class SearchMySQL4 extends SearchMySQL { - /* whee */ -} diff --git a/includes/SearchOracle.php b/includes/SearchOracle.php deleted file mode 100644 index 3cd91faab8..0000000000 --- a/includes/SearchOracle.php +++ /dev/null @@ -1,259 +0,0 @@ - -# http://www.mediawiki.org/ -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -# http://www.gnu.org/copyleft/gpl.html - -/** - * @file - * @ingroup Search - */ - -/** - * Search engine hook base class for Oracle (ConText). 
- * @ingroup Search - */ -class SearchOracle extends SearchEngine { - function __construct($db) { - $this->db = $db; - } - - /** - * Perform a full text search query and return a result set. - * - * @param $term String: raw search term - * @return OracleSearchResultSet - */ - function searchText( $term ) { - if ($term == '') - return new OracleSearchResultSet(false, ''); - - $resultSet = $this->db->resultObject($this->db->query($this->getQuery($this->filter($term), true))); - return new OracleSearchResultSet($resultSet, $this->searchTerms); - } - - /** - * Perform a title-only search query and return a result set. - * - * @param $term String: raw search term - * @return ORacleSearchResultSet - */ - function searchTitle($term) { - if ($term == '') - return new OracleSearchResultSet(false, ''); - - $resultSet = $this->db->resultObject($this->db->query($this->getQuery($this->filter($term), false))); - return new MySQLSearchResultSet($resultSet, $this->searchTerms); - } - - - /** - * Return a partial WHERE clause to exclude redirects, if so set - * @return String - */ - function queryRedirect() { - if ($this->showRedirects) { - return ''; - } else { - return 'AND page_is_redirect=0'; - } - } - - /** - * Return a partial WHERE clause to limit the search to the given namespaces - * @return String - */ - function queryNamespaces() { - if( is_null($this->namespaces) ) - return ''; - if ( !count( $this->namespaces ) ) { - $namespaces = '0'; - } else { - $namespaces = $this->db->makeList( $this->namespaces ); - } - return 'AND page_namespace IN (' . $namespaces . ')'; - } - - /** - * Return a LIMIT clause to limit results on the query. 
- * @return String - */ - function queryLimit($sql) { - return $this->db->limitResult($sql, $this->limit, $this->offset); - } - - /** - * Does not do anything for generic search engine - * subclasses may define this though - * @return String - */ - function queryRanking($filteredTerm, $fulltext) { - return ' ORDER BY score(1)'; - } - - /** - * Construct the full SQL query to do the search. - * The guts shoulds be constructed in queryMain() - * @param $filteredTerm String - * @param $fulltext Boolean - */ - function getQuery( $filteredTerm, $fulltext ) { - return $this->queryLimit($this->queryMain($filteredTerm, $fulltext) . ' ' . - $this->queryRedirect() . ' ' . - $this->queryNamespaces() . ' ' . - $this->queryRanking( $filteredTerm, $fulltext ) . ' '); - } - - - /** - * Picks which field to index on, depending on what type of query. - * @param $fulltext Boolean - * @return String - */ - function getIndexField($fulltext) { - return $fulltext ? 'si_text' : 'si_title'; - } - - /** - * Get the base part of the search query. - * - * @param $filteredTerm String - * @param $fulltext Boolean - * @return String - */ - function queryMain( $filteredTerm, $fulltext ) { - $match = $this->parseQuery($filteredTerm, $fulltext); - $page = $this->db->tableName('page'); - $searchindex = $this->db->tableName('searchindex'); - return 'SELECT page_id, page_namespace, page_title ' . - "FROM $page,$searchindex " . - 'WHERE page_id=si_page AND ' . $match; - } - - /** - * Parse a user input search string, and return an SQL fragment to be used - * as part of a WHERE clause - */ - function parseQuery($filteredText, $fulltext) { - global $wgContLang; - $lc = SearchEngine::legalSearchChars(); - $this->searchTerms = array(); - - # FIXME: This doesn't handle parenthetical expressions. - $m = array(); - $q = array(); - - if (preg_match_all('/([-+<>~]?)(([' . $lc . 
']+)(\*?)|"[^"]*")/', - $filteredText, $m, PREG_SET_ORDER)) { - foreach($m as $terms) { - - // Search terms in all variant forms, only - // apply on wiki with LanguageConverter - $temp_terms = $wgContLang->autoConvertToAllVariants( $terms[2] ); - if( is_array( $temp_terms )) { - $temp_terms = array_unique( array_values( $temp_terms )); - foreach( $temp_terms as $t ) - $q[] = $terms[1] . $wgContLang->stripForSearch( $t ); - } - else - $q[] = $terms[1] . $wgContLang->stripForSearch( $terms[2] ); - - if (!empty($terms[3])) { - $regexp = preg_quote( $terms[3], '/' ); - if ($terms[4]) - $regexp .= "[0-9A-Za-z_]+"; - } else { - $regexp = preg_quote(str_replace('"', '', $terms[2]), '/'); - } - $this->searchTerms[] = $regexp; - } - } - - $searchon = $this->db->addQuotes(join(',', $q)); - $field = $this->getIndexField($fulltext); - return " CONTAINS($field, $searchon, 1) > 0 "; - } - - /** - * Create or update the search index record for the given page. - * Title and text should be pre-processed. - * - * @param $id Integer - * @param $title String - * @param $text String - */ - function update($id, $title, $text) { - $dbw = wfGetDB(DB_MASTER); - $dbw->replace('searchindex', - array('si_page'), - array( - 'si_page' => $id, - 'si_title' => $title, - 'si_text' => $text - ), 'SearchOracle::update' ); - $dbw->query("CALL ctx_ddl.sync_index('si_text_idx')"); - $dbw->query("CALL ctx_ddl.sync_index('si_title_idx')"); - } - - /** - * Update a search index record's title only. - * Title should be pre-processed. 
- * - * @param int $id - * @param string $title - */ - function updateTitle($id, $title) { - $dbw = wfGetDB(DB_MASTER); - - $dbw->update('searchindex', - array('si_title' => $title), - array('si_page' => $id), - 'SearchOracle::updateTitle', - array()); - } -} - -/** - * @ingroup Search - */ -class OracleSearchResultSet extends SearchResultSet { - - function __construct($resultSet, $terms) { - $this->mResultSet = $resultSet; - $this->mTerms = $terms; - } - - function termMatches() { - return $this->mTerms; - } - - function numRows() { - if ($this->mResultSet === false ) - return 0; - else - return $this->mResultSet->numRows(); - } - - function next() { - if ($this->mResultSet === false ) - return false; - - $row = $this->mResultSet->fetchObject(); - if ($row === false) - return false; - return new SearchResult($row); - } -} diff --git a/includes/SearchPostgres.php b/includes/SearchPostgres.php deleted file mode 100644 index 81e9e65ca5..0000000000 --- a/includes/SearchPostgres.php +++ /dev/null @@ -1,255 +0,0 @@ - -# http://www.mediawiki.org/ -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
-# http://www.gnu.org/copyleft/gpl.html - -/** - * @file - * @ingroup Search - */ - -/** - * Search engine hook base class for Postgres - * @ingroup Search - */ -class SearchPostgres extends SearchEngine { - - function __construct( $db ) { - $this->db = $db; - } - - /** - * Perform a full text search query via tsearch2 and return a result set. - * Currently searches a page's current title (page.page_title) and - * latest revision article text (pagecontent.old_text) - * - * @param $term String: raw search term - * @return PostgresSearchResultSet - */ - function searchTitle( $term ) { - $q = $this->searchQuery( $term , 'titlevector', 'page_title' ); - $olderror = error_reporting(E_ERROR); - $resultSet = $this->db->resultObject( $this->db->query( $q, 'SearchPostgres', true ) ); - error_reporting($olderror); - if (!$resultSet) { - // Needed for "Query requires full scan, GIN doesn't support it" - return new SearchResultTooMany(); - } - return new PostgresSearchResultSet( $resultSet, $this->searchTerms ); - } - function searchText( $term ) { - $q = $this->searchQuery( $term, 'textvector', 'old_text' ); - $olderror = error_reporting(E_ERROR); - $resultSet = $this->db->resultObject( $this->db->query( $q, 'SearchPostgres', true ) ); - error_reporting($olderror); - if (!$resultSet) { - return new SearchResultTooMany(); - } - return new PostgresSearchResultSet( $resultSet, $this->searchTerms ); - } - - - /* - * Transform the user's search string into a better form for tsearch2 - * Returns an SQL fragment consisting of quoted text to search for. 
- */ - function parseQuery( $term ) { - - wfDebug( "parseQuery received: $term \n" ); - - ## No backslashes allowed - $term = preg_replace('/\\\/', '', $term); - - ## Collapse parens into nearby words: - $term = preg_replace('/\s*\(\s*/', ' (', $term); - $term = preg_replace('/\s*\)\s*/', ') ', $term); - - ## Treat colons as word separators: - $term = preg_replace('/:/', ' ', $term); - - $searchstring = ''; - $m = array(); - if( preg_match_all('/([-!]?)(\S+)\s*/', $term, $m, PREG_SET_ORDER ) ) { - foreach( $m as $terms ) { - if (strlen($terms[1])) { - $searchstring .= ' & !'; - } - if (strtolower($terms[2]) === 'and') { - $searchstring .= ' & '; - } - else if (strtolower($terms[2]) === 'or' or $terms[2] === '|') { - $searchstring .= ' | '; - } - else if (strtolower($terms[2]) === 'not') { - $searchstring .= ' & !'; - } - else { - $searchstring .= " & $terms[2]"; - } - } - } - - ## Strip out leading junk - $searchstring = preg_replace('/^[\s\&\|]+/', '', $searchstring); - - ## Remove any doubled-up operators - $searchstring = preg_replace('/([\!\&\|]) +(?:[\&\|] +)+/', "$1 ", $searchstring); - - ## Remove any non-spaced operators (e.g. "Zounds!") - $searchstring = preg_replace('/([^ ])[\!\&\|]/', "$1", $searchstring); - - ## Remove any trailing whitespace or operators - $searchstring = preg_replace('/[\s\!\&\|]+$/', '', $searchstring); - - ## Remove unnecessary quotes around everything - $searchstring = preg_replace('/^[\'"](.*)[\'"]$/', "$1", $searchstring); - - ## Quote the whole thing - $searchstring = $this->db->addQuotes($searchstring); - - wfDebug( "parseQuery returned: $searchstring \n" ); - - return $searchstring; - - } - - /** - * Construct the full SQL query to do the search. 
- * @param $filteredTerm String - * @param $fulltext String - */ - function searchQuery( $term, $fulltext, $colname ) { - global $wgDBversion; - - if ( !isset( $wgDBversion ) ) { - $this->db->getServerVersion(); - $wgDBversion = $this->db->numeric_version; - } - $prefix = $wgDBversion < 8.3 ? "'default'," : ''; - - # Get the SQL fragment for the given term - $searchstring = $this->parseQuery( $term ); - - ## We need a separate query here so gin does not complain about empty searches - $SQL = "SELECT to_tsquery($prefix $searchstring)"; - $res = $this->db->doQuery($SQL); - if (!$res) { - ## TODO: Better output (example to catch: one 'two) - die ("Sorry, that was not a valid search string. Please go back and try again"); - } - $top = pg_fetch_result($res,0,0); - - if ($top === "") { ## e.g. if only stopwords are used XXX return something better - $query = "SELECT page_id, page_namespace, page_title, 0 AS score ". - "FROM page p, revision r, pagecontent c WHERE p.page_latest = r.rev_id " . - "AND r.rev_text_id = c.old_id AND 1=0"; - } - else { - $m = array(); - if( preg_match_all("/'([^']+)'/", $top, $m, PREG_SET_ORDER ) ) { - foreach( $m as $terms ) { - $this->searchTerms[$terms[1]] = $terms[1]; - } - } - - $rankscore = $wgDBversion > 8.2 ? 5 : 1; - $rank = $wgDBversion < 8.3 ? 'rank' : 'ts_rank'; - $query = "SELECT page_id, page_namespace, page_title, ". - "$rank($fulltext, to_tsquery($prefix $searchstring), $rankscore) AS score ". - "FROM page p, revision r, pagecontent c WHERE p.page_latest = r.rev_id " . - "AND r.rev_text_id = c.old_id AND $fulltext @@ to_tsquery($prefix $searchstring)"; - } - - ## Redirects - if (! 
$this->showRedirects) - $query .= ' AND page_is_redirect = 0'; - - ## Namespaces - defaults to 0 - if( !is_null($this->namespaces) ){ // null -> search all - if ( count($this->namespaces) < 1) - $query .= ' AND page_namespace = 0'; - else { - $namespaces = $this->db->makeList( $this->namespaces ); - $query .= " AND page_namespace IN ($namespaces)"; - } - } - - $query .= " ORDER BY score DESC, page_id DESC"; - - $query .= $this->db->limitResult( '', $this->limit, $this->offset ); - - wfDebug( "searchQuery returned: $query \n" ); - - return $query; - } - - ## Most of the work of these two functions are done automatically via triggers - - function update( $pageid, $title, $text ) { - ## We don't want to index older revisions - $SQL = "UPDATE pagecontent SET textvector = NULL WHERE old_id IN ". - "(SELECT rev_text_id FROM revision WHERE rev_page = " . intval( $pageid ) . - " ORDER BY rev_text_id DESC OFFSET 1)"; - $this->db->doQuery($SQL); - return true; - } - - function updateTitle( $id, $title ) { - return true; - } - -} ## end of the SearchPostgres class - -/** - * @ingroup Search - */ -class PostgresSearchResult extends SearchResult { - function __construct( $row ) { - parent::__construct($row); - $this->score = $row->score; - } - function getScore() { - return $this->score; - } -} - -/** - * @ingroup Search - */ -class PostgresSearchResultSet extends SearchResultSet { - function __construct( $resultSet, $terms ) { - $this->mResultSet = $resultSet; - $this->mTerms = $terms; - } - - function termMatches() { - return $this->mTerms; - } - - function numRows() { - return $this->mResultSet->numRows(); - } - - function next() { - $row = $this->mResultSet->fetchObject(); - if( $row === false ) { - return false; - } else { - return new PostgresSearchResult( $row ); - } - } -} diff --git a/includes/SearchUpdate.php b/includes/SearchUpdate.php deleted file mode 100644 index 087a8ba5dc..0000000000 --- a/includes/SearchUpdate.php +++ /dev/null @@ -1,113 +0,0 @@ -mId = $id; - 
$this->mText = $text; - - $this->mNamespace = $nt->getNamespace(); - $this->mTitle = $nt->getText(); # Discard namespace - - $this->mTitleWords = $this->mTextWords = array(); - } else { - wfDebug( "SearchUpdate object created with invalid title '$title'\n" ); - } - } - - function doUpdate() { - global $wgContLang, $wgDisableSearchUpdate; - - if( $wgDisableSearchUpdate || !$this->mId ) { - return false; - } - $fname = 'SearchUpdate::doUpdate'; - wfProfileIn( $fname ); - - $search = SearchEngine::create(); - $lc = SearchEngine::legalSearchChars() . '&#;'; - - if( $this->mText === false ) { - $search->updateTitle($this->mId, - Title::indexTitle( $this->mNamespace, $this->mTitle )); - wfProfileOut( $fname ); - return; - } - - # Language-specific strip/conversion - $text = $wgContLang->stripForSearch( $this->mText ); - - wfProfileIn( $fname.'-regexps' ); - $text = preg_replace( "/<\\/?\\s*[A-Za-z][A-Za-z0-9]*\\s*([^>]*?)>/", - ' ', strtolower( " " . $text /*$this->mText*/ . " " ) ); # Strip HTML markup - $text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/sD", - "\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings - - # Strip external URLs - $uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\xA0-\\xFF"; - $protos = "http|https|ftp|mailto|news|gopher"; - $pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/"; - $text = preg_replace( $pat, "\\1 \\3", $text ); - - $p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/"; - $p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/"; - $text = preg_replace( $p1, "\\1 ", $text ); - $text = preg_replace( $p2, "\\1 \\3 ", $text ); - - # Internal image links - $pat2 = "/\\[\\[image:([{$uc}]+)\\.(gif|png|jpg|jpeg)([^{$uc}])/i"; - $text = preg_replace( $pat2, " \\1 \\3", $text ); - - $text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/", - "\\1\\2 \\2\\3", $text ); # Handle [[game]]s - - # Strip all remaining non-search characters - $text = preg_replace( "/[^{$lc}]+/", " ", $text ); - - # Handle 's, s' - # - # $text = preg_replace( "/([{$lc}]+)'s /", 
"\\1 \\1's ", $text ); - # $text = preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text ); - # - # These tail-anchored regexps are insanely slow. The worst case comes - # when Japanese or Chinese text (ie, no word spacing) is written on - # a wiki configured for Western UTF-8 mode. The Unicode characters are - # expanded to hex codes and the "words" are very long paragraph-length - # monstrosities. On a large page the above regexps may take over 20 - # seconds *each* on a 1GHz-level processor. - # - # Following are reversed versions which are consistently fast - # (about 3 milliseconds on 1GHz-level processor). - # - $text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) ); - $text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) ); - - # Strip wiki '' and ''' - $text = preg_replace( "/''[']*/", " ", $text ); - wfProfileOut( "$fname-regexps" ); - - wfRunHooks( 'SearchUpdate', array( $this->mId, $this->mNamespace, $this->mTitle, &$text ) ); - - # Perform the actual update - $search->update($this->mId, Title::indexTitle( $this->mNamespace, $this->mTitle ), - $text); - - wfProfileOut( $fname ); - } -} - -/** - * Placeholder class - * @ingroup Search - */ -class SearchUpdateMyISAM extends SearchUpdate { - # Inherits everything -} diff --git a/includes/search/SearchEngine.php b/includes/search/SearchEngine.php new file mode 100644 index 0000000000..aab14cbba1 --- /dev/null +++ b/includes/search/SearchEngine.php @@ -0,0 +1,1193 @@ + test prefix:Main Page/Archive + */ + function transformSearchTerm( $term ) { + return $term; + } + + /** + * If an exact title match can be find, or a very slightly close match, + * return the title. If no match, returns NULL. 
+ * + * @param $searchterm String + * @return Title + */ + public static function getNearMatch( $searchterm ) { + global $wgContLang, $wgSecondaryGoNamespaces; + + $allSearchTerms = array($searchterm); + + if($wgContLang->hasVariants()){ + $allSearchTerms = array_merge($allSearchTerms,$wgContLang->convertLinkToAllVariants($searchterm)); + } + + foreach($allSearchTerms as $term){ + + # Exact match? No need to look further. + $title = Title::newFromText( $term ); + if (is_null($title)) + return NULL; + + if ( $title->getNamespace() == NS_SPECIAL || $title->isExternal() || $title->exists() ) { + return $title; + } + + # See if it still otherwise has content is some sane sense + $article = MediaWiki::articleFromTitle( $title ); + if( $article->hasViewableContent() ) { + return $title; + } + + # If a match is not found in the main namespace look in secondary go namespaces. + if( $wgSecondaryGoNamespaces && $title->getNamespace() == NS_MAIN ) { + foreach( $wgSecondaryGoNamespaces as $ns ) { + $title = Title::newFromText( $term, $ns ); + if( $title && $title->exists() ) return $title; + } + } + + # Now try all lower case (i.e. 
first letter capitalized) + # + $title = Title::newFromText( $wgContLang->lc( $term ) ); + if ( $title && $title->exists() ) { + return $title; + } + + # Now try capitalized string + # + $title = Title::newFromText( $wgContLang->ucwords( $term ) ); + if ( $title && $title->exists() ) { + return $title; + } + + # Now try all upper case + # + $title = Title::newFromText( $wgContLang->uc( $term ) ); + if ( $title && $title->exists() ) { + return $title; + } + + # Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc + $title = Title::newFromText( $wgContLang->ucwordbreaks($term) ); + if ( $title && $title->exists() ) { + return $title; + } + + // Give hooks a chance at better match variants + $title = null; + if( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) { + return $title; + } + } + + $title = Title::newFromText( $searchterm ); + + # Entering an IP address goes to the contributions page + if ( ( $title->getNamespace() == NS_USER && User::isIP($title->getText() ) ) + || User::isIP( trim( $searchterm ) ) ) { + return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() ); + } + + + # Entering a user goes to the user page whether it's there or not + if ( $title->getNamespace() == NS_USER ) { + return $title; + } + + # Go to images that exist even if there's no local page. + # There may have been a funny upload, or it may be on a shared + # file repository such as Wikimedia Commons. + if( $title->getNamespace() == NS_FILE ) { + $image = wfFindFile( $title ); + if( $image ) { + return $title; + } + } + + # MediaWiki namespace? Page may be "implied" if not customized. + # Just return it, with caps forced as the message system likes it. + if( $title->getNamespace() == NS_MEDIAWIKI ) { + return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) ); + } + + # Quoted term? Try without the quotes... 
+ $matches = array(); + if( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) { + return SearchEngine::getNearMatch( $matches[1] ); + } + + return NULL; + } + + public static function legalSearchChars() { + return "A-Za-z_'.0-9\\x80-\\xFF\\-"; + } + + /** + * Set the maximum number of results to return + * and how many to skip before returning the first. + * + * @param $limit Integer + * @param $offset Integer + */ + function setLimitOffset( $limit, $offset = 0 ) { + $this->limit = intval( $limit ); + $this->offset = intval( $offset ); + } + + /** + * Set which namespaces the search should include. + * Give an array of namespace index numbers. + * + * @param $namespaces Array + */ + function setNamespaces( $namespaces ) { + $this->namespaces = $namespaces; + } + + /** + * Parse some common prefixes: all (search everything) + * or namespace names + * + * @param $query String + */ + function replacePrefixes( $query ){ + global $wgContLang; + + if( strpos($query,':') === false ) + return $query; // nothing to do + + $parsed = $query; + $allkeyword = wfMsgForContent('searchall').":"; + if( strncmp($query, $allkeyword, strlen($allkeyword)) == 0 ){ + $this->namespaces = null; + $parsed = substr($query,strlen($allkeyword)); + } else if( strpos($query,':') !== false ) { + $prefix = substr($query,0,strpos($query,':')); + $index = $wgContLang->getNsIndex($prefix); + if($index !== false){ + $this->namespaces = array($index); + $parsed = substr($query,strlen($prefix)+1); + } + } + if(trim($parsed) == '') + return $query; // prefix was the whole query + + return $parsed; + } + + /** + * Make a list of searchable namespaces and their canonical names. 
+ * @return Array + */ + public static function searchableNamespaces() { + global $wgContLang; + $arr = array(); + foreach( $wgContLang->getNamespaces() as $ns => $name ) { + if( $ns >= NS_MAIN ) { + $arr[$ns] = $name; + } + } + return $arr; + } + + /** + * Extract default namespaces to search from the given user's + * settings, returning a list of index numbers. + * + * @param $user User + * @return Array + */ + public static function userNamespaces( $user ) { + global $wgSearchEverythingOnlyLoggedIn; + + // get search everything preference, that can be set to be read for logged-in users + $searcheverything = false; + if( ( $wgSearchEverythingOnlyLoggedIn && $user->isLoggedIn() ) + || !$wgSearchEverythingOnlyLoggedIn ) + $searcheverything = $user->getOption('searcheverything'); + + // searcheverything overrides other options + if( $searcheverything ) + return array_keys(SearchEngine::searchableNamespaces()); + + $arr = Preferences::loadOldSearchNs( $user ); + $searchableNamespaces = SearchEngine::searchableNamespaces(); + + $arr = array_intersect( $arr, array_keys($searchableNamespaces) ); // Filter + + return $arr; + } + + /** + * Find snippet highlight settings for a given user + * + * @param $user User + * @return Array contextlines, contextchars + */ + public static function userHighlightPrefs( &$user ){ + //$contextlines = $user->getOption( 'contextlines', 5 ); + //$contextchars = $user->getOption( 'contextchars', 50 ); + $contextlines = 2; // Hardcode this. Old defaults sucked. :) + $contextchars = 75; // same as above.... 
:P + return array($contextlines, $contextchars); + } + + /** + * An array of namespaces indexes to be searched by default + * + * @return Array + */ + public static function defaultNamespaces(){ + global $wgNamespacesToBeSearchedDefault; + + return array_keys($wgNamespacesToBeSearchedDefault, true); + } + + /** + * Get a list of namespace names useful for showing in tooltips + * and preferences + * + * @param $namespaces Array + */ + public static function namespacesAsText( $namespaces ){ + global $wgContLang; + + $formatted = array_map( array($wgContLang,'getFormattedNsText'), $namespaces ); + foreach( $formatted as $key => $ns ){ + if ( empty($ns) ) + $formatted[$key] = wfMsg( 'blanknamespace' ); + } + return $formatted; + } + + /** + * Return the help namespaces to be shown on Special:Search + * + * @return Array + */ + public static function helpNamespaces() { + global $wgNamespacesToBeSearchedHelp; + + return array_keys( $wgNamespacesToBeSearchedHelp, true ); + } + + /** + * Return a 'cleaned up' search string + * + * @param $text String + * @return String + */ + function filter( $text ) { + $lc = $this->legalSearchChars(); + return trim( preg_replace( "/[^{$lc}]/", " ", $text ) ); + } + /** + * Load up the appropriate search engine class for the currently + * active database backend, and return a configured instance. + * + * @return SearchEngine + */ + public static function create() { + global $wgSearchType; + $dbr = wfGetDB( DB_SLAVE ); + if( $wgSearchType ) { + $class = $wgSearchType; + } else { + $class = $dbr->getSearchEngine(); + } + $search = new $class( $dbr ); + $search->setLimitOffset(0,0); + return $search; + } + + /** + * Create or update the search index record for the given page. + * Title and text should be pre-processed. + * STUB + * + * @param $id Integer + * @param $title String + * @param $text String + */ + function update( $id, $title, $text ) { + // no-op + } + + /** + * Update a search index record's title only. 
+ * Title should be pre-processed. + * STUB + * + * @param $id Integer + * @param $title String + */ + function updateTitle( $id, $title ) { + // no-op + } + + /** + * Get OpenSearch suggestion template + * + * @return String + */ + public static function getOpenSearchTemplate() { + global $wgOpenSearchTemplate, $wgServer, $wgScriptPath; + if( $wgOpenSearchTemplate ) { + return $wgOpenSearchTemplate; + } else { + $ns = implode( '|', SearchEngine::defaultNamespaces() ); + if( !$ns ) $ns = "0"; + return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace='.$ns; + } + } + + /** + * Get internal MediaWiki Suggest template + * + * @return String + */ + public static function getMWSuggestTemplate() { + global $wgMWSuggestTemplate, $wgServer, $wgScriptPath; + if($wgMWSuggestTemplate) + return $wgMWSuggestTemplate; + else + return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}&suggest'; + } +} + +/** + * @ingroup Search + */ +class SearchResultSet { + /** + * Fetch an array of regular expression fragments for matching + * the search terms as parsed by this engine in a text extract. + * STUB + * + * @return Array + */ + function termMatches() { + return array(); + } + + function numRows() { + return 0; + } + + /** + * Return true if results are included in this result set. + * STUB + * + * @return Boolean + */ + function hasResults() { + return false; + } + + /** + * Some search modes return a total hit count for the query + * in the entire article database. This may include pages + * in namespaces that would not be matched on the given + * settings. + * + * Return null if no total hits number is supported. + * + * @return Integer + */ + function getTotalHits() { + return null; + } + + /** + * Some search modes return a suggested alternate term if there are + * no exact hits. Returns true if there is one on this set. 
+ * + * @return Boolean + */ + function hasSuggestion() { + return false; + } + + /** + * @return String: suggested query, null if none + */ + function getSuggestionQuery(){ + return null; + } + + /** + * @return String: HTML highlighted suggested query, '' if none + */ + function getSuggestionSnippet(){ + return ''; + } + + /** + * Return information about how and from where the results were fetched, + * should be useful for diagnostics and debugging + * + * @return String + */ + function getInfo() { + return null; + } + + /** + * Return a result set of hits on other (multiple) wikis associated with this one + * + * @return SearchResultSet + */ + function getInterwikiResults() { + return null; + } + + /** + * Check if there are results on other wikis + * + * @return Boolean + */ + function hasInterwikiResults() { + return $this->getInterwikiResults() != null; + } + + + /** + * Fetches next search result, or false. + * STUB + * + * @return SearchResult + */ + function next() { + return false; + } + + /** + * Frees the result set, if applicable. + */ + function free() { + // ... + } +} + + +/** + * @ingroup Search + */ +class SearchResultTooMany { + ## Some search engines may bail out if too many matches are found +} + + +/** + * @todo Fixme: This class is horribly factored. It would probably be better to + * have a useful base class to which you pass some standard information, then + * let the fancy self-highlighters extend that. 
+ * @ingroup Search + */ +class SearchResult { + var $mRevision = null; + var $mImage = null; + + function __construct( $row ) { + $this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title ); + if( !is_null($this->mTitle) ){ + $this->mRevision = Revision::newFromTitle( $this->mTitle ); + if( $this->mTitle->getNamespace() === NS_FILE ) + $this->mImage = wfFindFile( $this->mTitle ); + } + } + + /** + * Check if this is result points to an invalid title + * + * @return Boolean + */ + function isBrokenTitle(){ + if( is_null($this->mTitle) ) + return true; + return false; + } + + /** + * Check if target page is missing, happens when index is out of date + * + * @return Boolean + */ + function isMissingRevision(){ + return !$this->mRevision && !$this->mImage; + } + + /** + * @return Title + */ + function getTitle() { + return $this->mTitle; + } + + /** + * @return Double or null if not supported + */ + function getScore() { + return null; + } + + /** + * Lazy initialization of article text from DB + */ + protected function initText(){ + if( !isset($this->mText) ){ + if($this->mRevision != null) + $this->mText = $this->mRevision->getText(); + else // TODO: can we fetch raw wikitext for commons images? 
+ $this->mText = ''; + + } + } + + /** + * @param $terms Array: terms to highlight + * @return String: highlighted text snippet, null (and not '') if not supported + */ + function getTextSnippet($terms){ + global $wgUser, $wgAdvancedSearchHighlighting; + $this->initText(); + list($contextlines,$contextchars) = SearchEngine::userHighlightPrefs($wgUser); + $h = new SearchHighlighter(); + if( $wgAdvancedSearchHighlighting ) + return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars ); + else + return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars ); + } + + /** + * @param $terms Array: terms to highlight + * @return String: highlighted title, '' if not supported + */ + function getTitleSnippet($terms){ + return ''; + } + + /** + * @param $terms Array: terms to highlight + * @return String: highlighted redirect name (redirect to this page), '' if none or not supported + */ + function getRedirectSnippet($terms){ + return ''; + } + + /** + * @return Title object for the redirect to this page, null if none or not supported + */ + function getRedirectTitle(){ + return null; + } + + /** + * @return string highlighted relevant section name, null if none or not supported + */ + function getSectionSnippet(){ + return ''; + } + + /** + * @return Title object (pagename+fragment) for the section, null if none or not supported + */ + function getSectionTitle(){ + return null; + } + + /** + * @return String: timestamp + */ + function getTimestamp(){ + if( $this->mRevision ) + return $this->mRevision->getTimestamp(); + else if( $this->mImage ) + return $this->mImage->getTimestamp(); + return ''; + } + + /** + * @return Integer: number of words + */ + function getWordCount(){ + $this->initText(); + return str_word_count( $this->mText ); + } + + /** + * @return Integer: size in bytes + */ + function getByteSize(){ + $this->initText(); + return strlen( $this->mText ); + } + + /** + * @return Boolean if hit has related articles + */ + 
function hasRelated(){ + return false; + } + + /** + * @return String: interwiki prefix of the title (return iw even if title is broken) + */ + function getInterwikiPrefix(){ + return ''; + } +} + +/** + * Highlight bits of wikitext + * + * @ingroup Search + */ +class SearchHighlighter { + var $mCleanWikitext = true; + + function SearchHighlighter($cleanupWikitext = true){ + $this->mCleanWikitext = $cleanupWikitext; + } + + /** + * Default implementation of wikitext highlighting + * + * @param $text String + * @param $terms Array: terms to highlight (unescaped) + * @param $contextlines Integer + * @param $contextchars Integer + * @return String + */ + public function highlightText( $text, $terms, $contextlines, $contextchars ) { + global $wgLang, $wgContLang; + global $wgSearchHighlightBoundaries; + $fname = __METHOD__; + + if($text == '') + return ''; + + // spli text into text + templates/links/tables + $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)"; + // first capture group is for detecting nested templates/links/tables/references + $endPatterns = array( + 1 => '/(\{\{)|(\}\})/', // template + 2 => '/(\[\[)|(\]\])/', // image + 3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table + + // FIXME: this should prolly be a hook or something + if(function_exists('wfCite')){ + $spat .= '|()'; // references via cite extension + $endPatterns[4] = '/()|(<\/ref>)/'; + } + $spat .= '/'; + $textExt = array(); // text extracts + $otherExt = array(); // other extracts + wfProfileIn( "$fname-split" ); + $start = 0; + $textLen = strlen($text); + $count = 0; // sequence number to maintain ordering + while( $start < $textLen ){ + // find start of template/image/table + if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){ + $epat = ''; + foreach($matches as $key => $val){ + if($key > 0 && $val[1] != -1){ + if($key == 2){ + // see if this is an image link + $ns = substr($val[0],2,-1); + if( $wgContLang->getNsIndex($ns) != NS_FILE ) + break; + + } + $epat = 
$endPatterns[$key]; + $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) ); + $start = $val[1]; + break; + } + } + if( $epat ){ + // find end (and detect any nested elements) + $level = 0; + $offset = $start + 1; + $found = false; + while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){ + if( array_key_exists(2,$endMatches) ){ + // found end + if($level == 0){ + $len = strlen($endMatches[2][0]); + $off = $endMatches[2][1]; + $this->splitAndAdd( $otherExt, $count, + substr( $text, $start, $off + $len - $start ) ); + $start = $off + $len; + $found = true; + break; + } else{ + // end of nested element + $level -= 1; + } + } else{ + // nested + $level += 1; + } + $offset = $endMatches[0][1] + strlen($endMatches[0][0]); + } + if( ! $found ){ + // couldn't find appropriate closing tag, skip + $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) ); + $start += strlen($matches[0][0]); + } + continue; + } + } + // else: add as text extract + $this->splitAndAdd( $textExt, $count, substr($text,$start) ); + break; + } + + $all = $textExt + $otherExt; // these have disjunct key sets + + wfProfileOut( "$fname-split" ); + + // prepare regexps + foreach( $terms as $index => $term ) { + // manually do upper/lowercase stuff for utf-8 since PHP won't do it + if(preg_match('/[\x80-\xff]/', $term) ){ + $terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]); + } else { + $terms[$index] = $term; + } + } + $anyterm = implode( '|', $terms ); + $phrase = implode("$wgSearchHighlightBoundaries+", $terms ); + + // FIXME: a hack to scale contextchars, a correct solution + // would be to have contextchars actually be char and not byte + // length, and do proper utf-8 substrings and lengths everywhere, + // but PHP is making that very hard and unclean to implement :( + $scale = strlen($anyterm) / mb_strlen($anyterm); + $contextchars = intval( $contextchars * $scale ); + + 
$patPre = "(^|$wgSearchHighlightBoundaries)"; + $patPost = "($wgSearchHighlightBoundaries|$)"; + + $pat1 = "/(".$phrase.")/ui"; + $pat2 = "/$patPre(".$anyterm.")$patPost/ui"; + + wfProfileIn( "$fname-extract" ); + + $left = $contextlines; + + $snippets = array(); + $offsets = array(); + + // show beginning only if it contains all words + $first = 0; + $firstText = ''; + foreach($textExt as $index => $line){ + if(strlen($line)>0 && $line[0] != ';' && $line[0] != ':'){ + $firstText = $this->extract( $line, 0, $contextchars * $contextlines ); + $first = $index; + break; + } + } + if( $firstText ){ + $succ = true; + // check if first text contains all terms + foreach($terms as $term){ + if( ! preg_match("/$patPre".$term."$patPost/ui", $firstText) ){ + $succ = false; + break; + } + } + if( $succ ){ + $snippets[$first] = $firstText; + $offsets[$first] = 0; + } + } + if( ! $snippets ) { + // match whole query on text + $this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets); + // match whole query on templates/tables/images + $this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets); + // match any words on text + $this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets); + // match any words on templates/tables/images + $this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets); + + ksort($snippets); + } + + // add extra chars to each snippet to make snippets constant size + $extended = array(); + if( count( $snippets ) == 0){ + // couldn't find the target words, just show beginning of article + $targetchars = $contextchars * $contextlines; + $snippets[$first] = ''; + $offsets[$first] = 0; + } else{ + // if begin of the article contains the whole phrase, show only that !! 
+ if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first]) + && $offsets[$first] < $contextchars * 2 ){ + $snippets = array ($first => $snippets[$first]); + } + + // calc by how much to extend existing snippets + $targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) ); + } + + foreach($snippets as $index => $line){ + $extended[$index] = $line; + $len = strlen($line); + if( $len < $targetchars - 20 ){ + // complete this line + if($len < strlen( $all[$index] )){ + $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]); + $len = strlen( $extended[$index] ); + } + + // add more lines + $add = $index + 1; + while( $len < $targetchars - 20 + && array_key_exists($add,$all) + && !array_key_exists($add,$snippets) ){ + $offsets[$add] = 0; + $tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] ); + $extended[$add] = $tt; + $len += strlen( $tt ); + $add++; + } + } + } + + //$snippets = array_map('htmlspecialchars', $extended); + $snippets = $extended; + $last = -1; + $extract = ''; + foreach($snippets as $index => $line){ + if($last == -1) + $extract .= $line; // first line + elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last])) + $extract .= " ".$line; // continous lines + else + $extract .= ' ... ' . $line; + + $last = $index; + } + if( $extract ) + $extract .= ' ... '; + + $processed = array(); + foreach($terms as $term){ + if( ! 
isset($processed[$term]) ){ + $pat3 = "/$patPre(".$term.")$patPost/ui"; // highlight word + $extract = preg_replace( $pat3, + "\\1\\2\\3", $extract ); + $processed[$term] = true; + } + } + + wfProfileOut( "$fname-extract" ); + + return $extract; + } + + /** + * Split text into lines and add it to extracts array + * + * @param $extracts Array: index -> $line + * @param $count Integer + * @param $text String + */ + function splitAndAdd(&$extracts, &$count, $text){ + $split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text ); + foreach($split as $line){ + $tt = trim($line); + if( $tt ) + $extracts[$count++] = $tt; + } + } + + /** + * Do manual case conversion for non-ascii chars + * + * @param $matches Array + */ + function caseCallback($matches){ + global $wgContLang; + if( strlen($matches[0]) > 1 ){ + return '['.$wgContLang->lc($matches[0]).$wgContLang->uc($matches[0]).']'; + } else + return $matches[0]; + } + + /** + * Extract part of the text from start to end, but by + * not chopping up words + * @param $text String + * @param $start Integer + * @param $end Integer + * @param $posStart Integer: (out) actual start position + * @param $posEnd Integer: (out) actual end position + * @return String + */ + function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){ + global $wgContLang; + + if( $start != 0) + $start = $this->position( $text, $start, 1 ); + if( $end >= strlen($text) ) + $end = strlen($text); + else + $end = $this->position( $text, $end ); + + if(!is_null($posStart)) + $posStart = $start; + if(!is_null($posEnd)) + $posEnd = $end; + + if($end > $start) + return substr($text, $start, $end-$start); + else + return ''; + } + + /** + * Find a nonletter near a point (index) in the text + * + * @param $text String + * @param $point Integer + * @param $offset Integer: offset to found index + * @return Integer: nearest nonletter index, or beginning of utf8 char if none + */ + function position($text, $point, $offset=0 ){ + 
$tolerance = 10; + $s = max( 0, $point - $tolerance ); + $l = min( strlen($text), $point + $tolerance ) - $s; + $m = array(); + if( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){ + return $m[0][1] + $s + $offset; + } else{ + // check if point is on a valid first UTF8 char + $char = ord( $text[$point] ); + while( $char >= 0x80 && $char < 0xc0 ) { + // skip trailing bytes + $point++; + if($point >= strlen($text)) + return strlen($text); + $char = ord( $text[$point] ); + } + return $point; + + } + } + + /** + * Search extracts for a pattern, and return snippets + * + * @param $pattern String: regexp for matching lines + * @param $extracts Array: extracts to search + * @param $linesleft Integer: number of extracts to make + * @param $contextchars Integer: length of snippet + * @param $out Array: map for highlighted snippets + * @param $offsets Array: map of starting points of snippets + * @protected + */ + function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){ + if($linesleft == 0) + return; // nothing to do + foreach($extracts as $index => $line){ + if( array_key_exists($index,$out) ) + continue; // this line already highlighted + + $m = array(); + if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) + continue; + + $offset = $m[0][1]; + $len = strlen($m[0][0]); + if($offset + $len < $contextchars) + $begin = 0; + elseif( $len > $contextchars) + $begin = $offset; + else + $begin = $offset + intval( ($len - $contextchars) / 2 ); + + $end = $begin + $contextchars; + + $posBegin = $begin; + // basic snippet from this line + $out[$index] = $this->extract($line,$begin,$end,$posBegin); + $offsets[$index] = $posBegin; + $linesleft--; + if($linesleft == 0) + return; + } + } + + /** + * Basic wikitext removal + * @protected + */ + function removeWiki($text) { + $fname = __METHOD__; + wfProfileIn( $fname ); + + //$text = preg_replace("/'{2,5}/", "", $text); + //$text = 
preg_replace("/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text); + //$text = preg_replace("/\[\[([^]|]+)\]\]/", "\\1", $text); + //$text = preg_replace("/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text); + //$text = preg_replace("/\\{\\|(.*?)\\|\\}/", "", $text); + //$text = preg_replace("/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text); + $text = preg_replace("/\\{\\{([^|]+?)\\}\\}/", "", $text); + $text = preg_replace("/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text); + $text = preg_replace("/\\[\\[([^|]+?)\\]\\]/", "\\1", $text); + $text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text); + //$text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text); + $text = preg_replace("/<\/?[^>]+>/", "", $text); + $text = preg_replace("/'''''/", "", $text); + $text = preg_replace("/('''|<\/?[iIuUbB]>)/", "", $text); + $text = preg_replace("/''/", "", $text); + + wfProfileOut( $fname ); + return $text; + } + + /** + * callback to replace [[target|caption]] kind of links, if + * the target is category or image, leave it + * + * @param $matches Array + */ + function linkReplace($matches){ + $colon = strpos( $matches[1], ':' ); + if( $colon === false ) + return $matches[2]; // replace with caption + global $wgContLang; + $ns = substr( $matches[1], 0, $colon ); + $index = $wgContLang->getNsIndex($ns); + if( $index !== false && ($index == NS_FILE || $index == NS_CATEGORY) ) + return $matches[0]; // return the whole thing + else + return $matches[2]; + + } + + /** + * Simple & fast snippet extraction, but gives completely unrelevant + * snippets + * + * @param $text String + * @param $terms Array + * @param $contextlines Integer + * @param $contextchars Integer + * @return String + */ + public function highlightSimple( $text, $terms, $contextlines, $contextchars ) { + global $wgLang, $wgContLang; + $fname = __METHOD__; + + $lines = explode( "\n", $text ); + + $terms = implode( '|', $terms ); + $max = intval( $contextchars ) + 1; + $pat1 = 
"/(.*)($terms)(.{0,$max})/i"; + + $lineno = 0; + + $extract = ""; + wfProfileIn( "$fname-extract" ); + foreach ( $lines as $line ) { + if ( 0 == $contextlines ) { + break; + } + ++$lineno; + $m = array(); + if ( ! preg_match( $pat1, $line, $m ) ) { + continue; + } + --$contextlines; + $pre = $wgContLang->truncate( $m[1], -$contextchars ); + + if ( count( $m ) < 3 ) { + $post = ''; + } else { + $post = $wgContLang->truncate( $m[3], $contextchars ); + } + + $found = $m[2]; + + $line = htmlspecialchars( $pre . $found . $post ); + $pat2 = '/(' . $terms . ")/i"; + $line = preg_replace( $pat2, + "\\1", $line ); + + $extract .= "${line}\n"; + } + wfProfileOut( "$fname-extract" ); + + return $extract; + } + +} + +/** + * Dummy class to be used when non-supported Database engine is present. + * @todo Fixme: dummy class should probably try something at least mildly useful, + * such as a LIKE search through titles. + * @ingroup Search + */ +class SearchEngineDummy extends SearchEngine { + // no-op +} diff --git a/includes/search/SearchIBM_DB2.php b/includes/search/SearchIBM_DB2.php new file mode 100644 index 0000000000..b94a478236 --- /dev/null +++ b/includes/search/SearchIBM_DB2.php @@ -0,0 +1,249 @@ + +# http://www.mediawiki.org/ +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
# http://www.gnu.org/copyleft/gpl.html

/**
 * @file
 * @ingroup Search
 */

/**
 * Search engine hook base class for IBM DB2
 * @ingroup Search
 */
class SearchIBM_DB2 extends SearchEngine {
	function __construct($db) {
		$this->db = $db;
	}

	/**
	 * Perform a full text search query and return a result set.
	 *
	 * @param $term String: raw search term
	 * @return IBM_DB2SearchResultSet
	 */
	function searchText( $term ) {
		$resultSet = $this->db->resultObject($this->db->query($this->getQuery($this->filter($term), true)));
		return new IBM_DB2SearchResultSet($resultSet, $this->searchTerms);
	}

	/**
	 * Perform a title-only search query and return a result set.
	 *
	 * @param $term String: raw search term
	 * @return IBM_DB2SearchResultSet
	 */
	function searchTitle($term) {
		$resultSet = $this->db->resultObject($this->db->query($this->getQuery($this->filter($term), false)));
		// Return the DB2 result set, as documented and as searchText() does,
		// rather than the MySQL-specific class.
		return new IBM_DB2SearchResultSet($resultSet, $this->searchTerms);
	}


	/**
	 * Return a partial WHERE clause to exclude redirects, if so set
	 * @return String
	 */
	function queryRedirect() {
		if ($this->showRedirects) {
			return '';
		} else {
			return 'AND page_is_redirect=0';
		}
	}

	/**
	 * Return a partial WHERE clause to limit the search to the given namespaces
	 * @return String: empty when all namespaces are searched
	 */
	function queryNamespaces() {
		if( is_null($this->namespaces) )
			return '';
		// Namespace ids are integers; force them so nothing else can reach the SQL
		$namespaces = implode(',', array_map('intval', $this->namespaces));
		if ($namespaces == '') {
			$namespaces = '0';
		}
		return 'AND page_namespace IN (' . $namespaces . ')';
	}

	/**
	 * Apply the configured limit and offset to the given query.
	 * @param $sql String: query to be limited
	 * @return String
	 */
	function queryLimit($sql) {
		return $this->db->limitResult($sql, $this->limit, $this->offset);
	}

	/**
	 * Does not do anything for generic search engine
	 * subclasses may define this though
	 * @return String
	 */
	function queryRanking($filteredTerm, $fulltext) {
		// requires Net Search Extender or equivalent
		// return ' ORDER BY score(1)';
		return '';
	}

	/**
	 * Construct the full SQL query to do the search.
	 * The guts should be constructed in queryMain()
	 * @param $filteredTerm String
	 * @param $fulltext Boolean
	 * @return String
	 */
	function getQuery( $filteredTerm, $fulltext ) {
		return $this->queryLimit($this->queryMain($filteredTerm, $fulltext) . ' ' .
			$this->queryRedirect() . ' ' .
			$this->queryNamespaces() . ' ' .
			$this->queryRanking( $filteredTerm, $fulltext ) . ' ');
	}


	/**
	 * Picks which field to index on, depending on what type of query.
	 * @param $fulltext Boolean
	 * @return String
	 */
	function getIndexField($fulltext) {
		return $fulltext ? 'si_text' : 'si_title';
	}

	/**
	 * Get the base part of the search query.
	 *
	 * @param $filteredTerm String
	 * @param $fulltext Boolean
	 * @return String
	 */
	function queryMain( $filteredTerm, $fulltext ) {
		$match = $this->parseQuery($filteredTerm, $fulltext);
		$page = $this->db->tableName('page');
		$searchindex = $this->db->tableName('searchindex');
		return 'SELECT page_id, page_namespace, page_title ' .
			"FROM $page,$searchindex " .
			'WHERE page_id=si_page AND ' . $match;
	}

	/**
	 * Parse a filtered search string into an SQL condition on the index
	 * field and record highlight regexes in $this->searchTerms.
	 *
	 * @param $filteredText String
	 * @param $fulltext Boolean: selects the field via getIndexField()
	 * @return String: SQL fragment for the WHERE clause
	 */
	function parseQuery($filteredText, $fulltext) {
		global $wgContLang;
		$lc = SearchEngine::legalSearchChars();
		$this->searchTerms = array();

		# FIXME: This doesn't handle parenthetical expressions.
		$m = array();
		$q = array();

		if (preg_match_all('/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/',
			  $filteredText, $m, PREG_SET_ORDER)) {
			foreach($m as $terms) {

				// Search terms in all variant forms, only
				// apply on wiki with LanguageConverter
				$temp_terms = $wgContLang->autoConvertToAllVariants( $terms[2] );
				if( is_array( $temp_terms )) {
					$temp_terms = array_unique( array_values( $temp_terms ));
					foreach( $temp_terms as $t )
						$q[] = $terms[1] . $wgContLang->stripForSearch( $t );
				}
				else
					$q[] = $terms[1] . $wgContLang->stripForSearch( $terms[2] );

				if (!empty($terms[3])) {
					$regexp = preg_quote( $terms[3], '/' );
					// !empty() avoids a notice if the wildcard group is absent
					if (!empty($terms[4]))
						$regexp .= "[0-9A-Za-z_]+";
				} else {
					$regexp = preg_quote(str_replace('"', '', $terms[2]), '/');
				}
				$this->searchTerms[] = $regexp;
			}
		}

		$searchon = $this->db->strencode(join(',', $q));
		$field = $this->getIndexField($fulltext);

		// requires Net Search Extender or equivalent
		//return " CONTAINS($field, '$searchon') > 0 ";

		return " lcase($field) LIKE lcase('%$searchon%')";
	}

	/**
	 * Create or update the search index record for the given page.
	 * Title and text should be pre-processed.
	 *
	 * @param $id Integer
	 * @param $title String
	 * @param $text String
	 */
	function update($id, $title, $text) {
		$dbw = wfGetDB(DB_MASTER);
		$dbw->replace('searchindex',
			array('si_page'),
			array(
				'si_page' => $id,
				'si_title' => $title,
				'si_text' => $text
			), 'SearchIBM_DB2::update' );
		// ?
		//$dbw->query("CALL ctx_ddl.sync_index('si_text_idx')");
		//$dbw->query("CALL ctx_ddl.sync_index('si_title_idx')");
	}

	/**
	 * Update a search index record's title only.
	 * Title should be pre-processed.
	 *
	 * @param $id Integer
	 * @param $title String
	 */
	function updateTitle($id, $title) {
		$dbw = wfGetDB(DB_MASTER);

		$dbw->update('searchindex',
			array('si_title' => $title),
			array('si_page' => $id),
			'SearchIBM_DB2::updateTitle',
			array());
	}
}

/**
 * Result set wrapper around a DB2 query result.
 * @ingroup Search
 */
class IBM_DB2SearchResultSet extends SearchResultSet {
	function __construct($resultSet, $terms) {
		$this->mResultSet = $resultSet;
		$this->mTerms = $terms;
	}

	function termMatches() {
		return $this->mTerms;
	}

	function numRows() {
		return $this->mResultSet->numRows();
	}

	/**
	 * @return SearchResult, or false once the rows are exhausted
	 */
	function next() {
		$row = $this->mResultSet->fetchObject();
		if ($row === false)
			return false;
		return new SearchResult($row);
	}
}
diff --git a/includes/search/SearchMySQL.php b/includes/search/SearchMySQL.php
new file mode 100644
index 0000000000..a0ec92711a
--- /dev/null
+++ b/includes/search/SearchMySQL.php
@@ -0,0 +1,353 @@

# http://www.mediawiki.org/
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# http://www.gnu.org/copyleft/gpl.html

/**
 * @file
 * @ingroup Search
 */

/**
 * Search engine hook for MySQL 4+
 * @ingroup Search
 */
class SearchMySQL extends SearchEngine {
	var $strictMatching = true;

	/**
	 * @param $db Database object the queries are run against
	 */
	function __construct( $db ) {
		$this->db = $db;
	}

	/**
	 * Parse the user's query and transform it into an SQL fragment which will
	 * become part of a WHERE clause
	 *
	 * Also fills $this->searchTerms with one highlight regex per term.
	 *
	 * @param $filteredText String
	 * @param $fulltext Boolean: selects the field via getIndexField()
	 * @return String
	 */
	function parseQuery( $filteredText, $fulltext ) {
		global $wgContLang;
		$lc = SearchEngine::legalSearchChars(); // Minus format chars
		$searchon = '';
		$this->searchTerms = array();

		# FIXME: This doesn't handle parenthetical expressions.
		$m = array();
		if( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/',
			  $filteredText, $m, PREG_SET_ORDER ) ) {
			foreach( $m as $bits ) {
				// A quoted phrase leaves the trailing groups out of the match;
				// pad them with '' instead of silencing the notice with @.
				list( /* all */, $modifier, $term, $nonQuoted, $wildcard ) = array_pad( $bits, 5, '' );

				if( $nonQuoted != '' ) {
					$term = $nonQuoted;
					$quote = '';
				} else {
					$term = str_replace( '"', '', $term );
					$quote = '"';
				}

				if( $searchon !== '' ) $searchon .= ' ';
				if( $this->strictMatching && ($modifier == '') ) {
					// If we leave this out, boolean op defaults to OR which is rarely helpful.
					$modifier = '+';
				}

				// Some languages such as Serbian store the input form in the search index,
				// so we may need to search for matches in multiple writing system variants.
				$convertedVariants = $wgContLang->autoConvertToAllVariants( $term );
				if( is_array( $convertedVariants ) ) {
					$variants = array_unique( array_values( $convertedVariants ) );
				} else {
					$variants = array( $term );
				}

				// The low-level search index does some processing on input to work
				// around problems with minimum lengths and encoding in MySQL's
				// fulltext engine.
				// For Chinese this also inserts spaces between adjacent Han characters.
				$strippedVariants = array_map(
					array( $wgContLang, 'stripForSearch' ),
					$variants );

				// Some languages such as Chinese force all variants to a canonical
				// form when stripping to the low-level search index, so to be sure
				// let's check our variants list for unique items after stripping.
				$strippedVariants = array_unique( $strippedVariants );

				$searchon .= $modifier;
				if( count( $strippedVariants) > 1 )
					$searchon .= '(';
				foreach( $strippedVariants as $stripped ) {
					if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) {
						// Hack for Chinese: we need to toss in quotes for
						// multiple-character phrases since stripForSearch()
						// added spaces between them to make word breaks.
						$stripped = '"' . trim( $stripped ) . '"';
					}
					$searchon .= "$quote$stripped$quote$wildcard ";
				}
				if( count( $strippedVariants) > 1 )
					$searchon .= ')';

				// Match individual terms or quoted phrase in result highlighting...
				// Note that variants will be introduced in a later stage for highlighting!
				$regexp = $this->regexTerm( $term, $wildcard );
				$this->searchTerms[] = $regexp;
			}
			wfDebug( __METHOD__ . ": Would search with '$searchon'\n" );
			wfDebug( __METHOD__ . ': Match with /' . implode( '|', $this->searchTerms ) . "/\n" );
		} else {
			wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'\n" );
		}

		$searchon = $this->db->strencode( $searchon );
		$field = $this->getIndexField( $fulltext );
		return " MATCH($field) AGAINST('$searchon' IN BOOLEAN MODE) ";
	}

	/**
	 * Build the highlight regex for one search term.
	 *
	 * @param $string String: the term, quoted literally
	 * @param $wildcard String: non-empty when the term ended with '*'
	 * @return String
	 */
	function regexTerm( $string, $wildcard ) {
		global $wgContLang;

		$regex = preg_quote( $string, '/' );
		if( $wgContLang->hasWordBreaks() ) {
			if( $wildcard ) {
				// Don't cut off the final bit!
				$regex = "\b$regex";
			} else {
				$regex = "\b$regex\b";
			}
		} else {
			// For Chinese, words may legitimately abut other words in the text literal.
			// Don't add \b boundary checks... note this could cause false positives
			// for latin chars.
		}
		return $regex;
	}

	public static function legalSearchChars() {
		return "\"*" . parent::legalSearchChars();
	}

	/**
	 * Perform a full text search query and return a result set.
	 *
	 * @param $term String: raw search term
	 * @return MySQLSearchResultSet
	 */
	function searchText( $term ) {
		return $this->searchInternal( $term, true );
	}

	/**
	 * Perform a title-only search query and return a result set.
	 *
	 * @param $term String: raw search term
	 * @return MySQLSearchResultSet
	 */
	function searchTitle( $term ) {
		return $this->searchInternal( $term, false );
	}

	/**
	 * Run the search and, when $wgSearchMySQLTotalHits is set, a second
	 * COUNT(*) query for the total number of hits.
	 *
	 * @param $term String: raw search term
	 * @param $fulltext Boolean
	 * @return MySQLSearchResultSet
	 */
	protected function searchInternal( $term, $fulltext ) {
		global $wgSearchMySQLTotalHits;

		$filteredTerm = $this->filter( $term );
		$resultSet = $this->db->query( $this->getQuery( $filteredTerm, $fulltext ) );

		$total = null;
		if( $wgSearchMySQLTotalHits ) {
			$totalResult = $this->db->query( $this->getCountQuery( $filteredTerm, $fulltext ) );
			$row = $totalResult->fetchObject();
			if( $row ) {
				$total = intval( $row->c );
			}
			$totalResult->free();
		}

		return new MySQLSearchResultSet( $resultSet, $this->searchTerms, $total );
	}


	/**
	 * Return a partial WHERE clause to exclude redirects, if so set
	 * @return String
	 */
	function queryRedirect() {
		if( $this->showRedirects ) {
			return '';
		} else {
			return 'AND page_is_redirect=0';
		}
	}

	/**
	 * Return a partial WHERE clause to limit the search to the given namespaces
	 * @return String
	 */
	function queryNamespaces() {
		if( is_null($this->namespaces) )
			return '';  # search all
		if ( !count( $this->namespaces ) ) {
			$namespaces = '0';
		} else {
			$namespaces = $this->db->makeList( $this->namespaces );
		}
		return 'AND page_namespace IN (' . $namespaces . ')';
	}

	/**
	 * Return a LIMIT clause to limit results on the query.
	 * @return String
	 */
	function queryLimit() {
		return $this->db->limitResult( '', $this->limit, $this->offset );
	}

	/**
	 * Does not do anything for generic search engine
	 * subclasses may define this though
	 * @return String
	 */
	function queryRanking( $filteredTerm, $fulltext ) {
		return '';
	}

	/**
	 * Construct the full SQL query to do the search.
	 * The guts should be constructed in queryMain()
	 * @param $filteredTerm String
	 * @param $fulltext Boolean
	 * @return String
	 */
	function getQuery( $filteredTerm, $fulltext ) {
		return $this->queryMain( $filteredTerm, $fulltext ) . ' ' .
			$this->queryRedirect() . ' ' .
			$this->queryNamespaces() . ' ' .
			$this->queryRanking( $filteredTerm, $fulltext ) . ' ' .
			$this->queryLimit();
	}

	/**
	 * Picks which field to index on, depending on what type of query.
	 * @param $fulltext Boolean
	 * @return String
	 */
	function getIndexField( $fulltext ) {
		return $fulltext ? 'si_text' : 'si_title';
	}

	/**
	 * Get the base part of the search query.
	 * The actual match syntax will depend on the server
	 * version; MySQL 3 and MySQL 4 have different capabilities
	 * in their fulltext search indexes.
	 *
	 * @param $filteredTerm String
	 * @param $fulltext Boolean
	 * @return String
	 */
	function queryMain( $filteredTerm, $fulltext ) {
		$match = $this->parseQuery( $filteredTerm, $fulltext );
		$page        = $this->db->tableName( 'page' );
		$searchindex = $this->db->tableName( 'searchindex' );
		return 'SELECT page_id, page_namespace, page_title ' .
			"FROM $page,$searchindex " .
			'WHERE page_id=si_page AND ' . $match;
	}

	/**
	 * Build the COUNT(*) query matching queryMain(), with the redirect and
	 * namespace restrictions but without ranking or limit.
	 *
	 * @param $filteredTerm String
	 * @param $fulltext Boolean
	 * @return String
	 */
	function getCountQuery( $filteredTerm, $fulltext ) {
		$match = $this->parseQuery( $filteredTerm, $fulltext );
		$page        = $this->db->tableName( 'page' );
		$searchindex = $this->db->tableName( 'searchindex' );
		return "SELECT COUNT(*) AS c " .
			"FROM $page,$searchindex " .
			'WHERE page_id=si_page AND ' . $match .
			$this->queryRedirect() . ' ' .
			$this->queryNamespaces();
	}

	/**
	 * Create or update the search index record for the given page.
	 * Title and text should be pre-processed.
	 *
	 * @param $id Integer
	 * @param $title String
	 * @param $text String
	 */
	function update( $id, $title, $text ) {
		$dbw = wfGetDB( DB_MASTER );
		$dbw->replace( 'searchindex',
			array( 'si_page' ),
			array(
				'si_page' => $id,
				'si_title' => $title,
				'si_text' => $text
			), __METHOD__ );
	}

	/**
	 * Update a search index record's title only.
	 * Title should be pre-processed.
	 *
	 * @param $id Integer
	 * @param $title String
	 */
	function updateTitle( $id, $title ) {
		$dbw = wfGetDB( DB_MASTER );

		$dbw->update( 'searchindex',
			array( 'si_title' => $title ),
			array( 'si_page'  => $id ),
			__METHOD__,
			array( $dbw->lowPriorityOption() ) );
	}
}

/**
 * Result set wrapper around a MySQL query result, optionally carrying the
 * total hit count.
 * @ingroup Search
 */
class MySQLSearchResultSet extends SearchResultSet {
	/**
	 * Declared as __construct(): a method named after the class is no
	 * longer treated as a constructor by PHP 8.
	 *
	 * @param $resultSet query result object
	 * @param $terms Array: highlight regexes
	 * @param $totalHits Integer or null when not counted
	 */
	function __construct( $resultSet, $terms, $totalHits=null ) {
		$this->mResultSet = $resultSet;
		$this->mTerms = $terms;
		$this->mTotalHits = $totalHits;
	}

	function termMatches() {
		return $this->mTerms;
	}

	function numRows() {
		return $this->mResultSet->numRows();
	}

	/**
	 * @return SearchResult, or false once the rows are exhausted
	 */
	function next() {
		$row = $this->mResultSet->fetchObject();
		if( $row === false ) {
			return false;
		} else {
			return new SearchResult( $row );
		}
	}

	function free() {
		$this->mResultSet->free();
	}


	function getTotalHits() {
		return $this->mTotalHits;
	}
}
\ No newline at end of file
diff --git a/includes/search/SearchMySQL4.php b/includes/search/SearchMySQL4.php
new file mode 100644
index 0000000000..3e2bb2d1dd
--- /dev/null
+++ b/includes/search/SearchMySQL4.php
@@ -0,0 +1,34 @@

# http://www.mediawiki.org/
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# http://www.gnu.org/copyleft/gpl.html

/**
 * @file
 * @ingroup Search
 */

/**
 * Search engine hook for MySQL 4+
 * This class retained for backwards compatibility...
 * The meat's been moved to SearchMySQL, since the 3.x variety is gone.
 * @ingroup Search
 * @deprecated
 */
class SearchMySQL4 extends SearchMySQL {
	/* whee */
}
diff --git a/includes/search/SearchOracle.php b/includes/search/SearchOracle.php
new file mode 100644
index 0000000000..3cd91faab8
--- /dev/null
+++ b/includes/search/SearchOracle.php
@@ -0,0 +1,259 @@

# http://www.mediawiki.org/
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# http://www.gnu.org/copyleft/gpl.html

/**
 * @file
 * @ingroup Search
 */

/**
 * Search engine hook base class for Oracle (ConText).
 * @ingroup Search
 */
class SearchOracle extends SearchEngine {
	function __construct($db) {
		$this->db = $db;
	}

	/**
	 * Perform a full text search query and return a result set.
	 *
	 * @param $term String: raw search term
	 * @return OracleSearchResultSet
	 */
	function searchText( $term ) {
		if ($term == '')
			// Empty result set; the term list is an array like everywhere else
			return new OracleSearchResultSet(false, array());

		$resultSet = $this->db->resultObject($this->db->query($this->getQuery($this->filter($term), true)));
		return new OracleSearchResultSet($resultSet, $this->searchTerms);
	}

	/**
	 * Perform a title-only search query and return a result set.
	 *
	 * @param $term String: raw search term
	 * @return OracleSearchResultSet
	 */
	function searchTitle($term) {
		if ($term == '')
			return new OracleSearchResultSet(false, array());

		$resultSet = $this->db->resultObject($this->db->query($this->getQuery($this->filter($term), false)));
		// Return the Oracle result set, as documented and as searchText()
		// does, rather than the MySQL-specific class.
		return new OracleSearchResultSet($resultSet, $this->searchTerms);
	}


	/**
	 * Return a partial WHERE clause to exclude redirects, if so set
	 * @return String
	 */
	function queryRedirect() {
		if ($this->showRedirects) {
			return '';
		} else {
			return 'AND page_is_redirect=0';
		}
	}

	/**
	 * Return a partial WHERE clause to limit the search to the given namespaces
	 * @return String
	 */
	function queryNamespaces() {
		if( is_null($this->namespaces) )
			return '';
		if ( !count( $this->namespaces ) ) {
			$namespaces = '0';
		} else {
			$namespaces = $this->db->makeList( $this->namespaces );
		}
		return 'AND page_namespace IN (' . $namespaces . ')';
	}

	/**
	 * Apply the configured limit and offset to the given query.
	 * @param $sql String: query to be limited
	 * @return String
	 */
	function queryLimit($sql) {
		return $this->db->limitResult($sql, $this->limit, $this->offset);
	}

	/**
	 * Return the ORDER BY clause ranking results by score.
	 * @return String
	 */
	function queryRanking($filteredTerm, $fulltext) {
		return ' ORDER BY score(1)';
	}

	/**
	 * Construct the full SQL query to do the search.
	 * The guts should be constructed in queryMain()
	 * @param $filteredTerm String
	 * @param $fulltext Boolean
	 * @return String
	 */
	function getQuery( $filteredTerm, $fulltext ) {
		return $this->queryLimit($this->queryMain($filteredTerm, $fulltext) . ' ' .
			$this->queryRedirect() . ' ' .
			$this->queryNamespaces() . ' ' .
			$this->queryRanking( $filteredTerm, $fulltext ) . ' ');
	}


	/**
	 * Picks which field to index on, depending on what type of query.
	 * @param $fulltext Boolean
	 * @return String
	 */
	function getIndexField($fulltext) {
		return $fulltext ? 'si_text' : 'si_title';
	}

	/**
	 * Get the base part of the search query.
	 *
	 * @param $filteredTerm String
	 * @param $fulltext Boolean
	 * @return String
	 */
	function queryMain( $filteredTerm, $fulltext ) {
		$match = $this->parseQuery($filteredTerm, $fulltext);
		$page = $this->db->tableName('page');
		$searchindex = $this->db->tableName('searchindex');
		return 'SELECT page_id, page_namespace, page_title ' .
			"FROM $page,$searchindex " .
			'WHERE page_id=si_page AND ' . $match;
	}

	/**
	 * Parse a user input search string, and return an SQL fragment to be used
	 * as part of a WHERE clause
	 *
	 * Also fills $this->searchTerms with one highlight regex per term.
	 *
	 * @param $filteredText String
	 * @param $fulltext Boolean: selects the field via getIndexField()
	 * @return String
	 */
	function parseQuery($filteredText, $fulltext) {
		global $wgContLang;
		$lc = SearchEngine::legalSearchChars();
		$this->searchTerms = array();

		# FIXME: This doesn't handle parenthetical expressions.
		$m = array();
		$q = array();

		if (preg_match_all('/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/',
			  $filteredText, $m, PREG_SET_ORDER)) {
			foreach($m as $terms) {

				// Search terms in all variant forms, only
				// apply on wiki with LanguageConverter
				$temp_terms = $wgContLang->autoConvertToAllVariants( $terms[2] );
				if( is_array( $temp_terms )) {
					$temp_terms = array_unique( array_values( $temp_terms ));
					foreach( $temp_terms as $t )
						$q[] = $terms[1] . $wgContLang->stripForSearch( $t );
				}
				else
					$q[] = $terms[1] . $wgContLang->stripForSearch( $terms[2] );

				if (!empty($terms[3])) {
					$regexp = preg_quote( $terms[3], '/' );
					// !empty() avoids a notice if the wildcard group is absent
					if (!empty($terms[4]))
						$regexp .= "[0-9A-Za-z_]+";
				} else {
					$regexp = preg_quote(str_replace('"', '', $terms[2]), '/');
				}
				$this->searchTerms[] = $regexp;
			}
		}

		$searchon = $this->db->addQuotes(join(',', $q));
		$field = $this->getIndexField($fulltext);
		return " CONTAINS($field, $searchon, 1) > 0 ";
	}

	/**
	 * Create or update the search index record for the given page.
	 * Title and text should be pre-processed.
	 *
	 * @param $id Integer
	 * @param $title String
	 * @param $text String
	 */
	function update($id, $title, $text) {
		$dbw = wfGetDB(DB_MASTER);
		$dbw->replace('searchindex',
			array('si_page'),
			array(
				'si_page' => $id,
				'si_title' => $title,
				'si_text' => $text
			), 'SearchOracle::update' );
		$dbw->query("CALL ctx_ddl.sync_index('si_text_idx')");
		$dbw->query("CALL ctx_ddl.sync_index('si_title_idx')");
	}

	/**
	 * Update a search index record's title only.
	 * Title should be pre-processed.
	 *
	 * @param $id Integer
	 * @param $title String
	 */
	function updateTitle($id, $title) {
		$dbw = wfGetDB(DB_MASTER);

		$dbw->update('searchindex',
			array('si_title' => $title),
			array('si_page' => $id),
			'SearchOracle::updateTitle',
			array());
	}
}

/**
 * Result set wrapper around an Oracle query result. A result set of
 * false stands for "no results".
 * @ingroup Search
 */
class OracleSearchResultSet extends SearchResultSet {

	function __construct($resultSet, $terms) {
		$this->mResultSet = $resultSet;
		$this->mTerms = $terms;
	}

	function termMatches() {
		return $this->mTerms;
	}

	function numRows() {
		if ($this->mResultSet === false )
			return 0;
		else
			return $this->mResultSet->numRows();
	}

	/**
	 * @return SearchResult, or false once the rows are exhausted
	 */
	function next() {
		if ($this->mResultSet === false )
			return false;

		$row = $this->mResultSet->fetchObject();
		if ($row === false)
			return false;
		return new SearchResult($row);
	}
}
diff --git a/includes/search/SearchPostgres.php b/includes/search/SearchPostgres.php
new file mode 100644
index 0000000000..81e9e65ca5
--- /dev/null
+++ b/includes/search/SearchPostgres.php
@@ -0,0 +1,255 @@

# http://www.mediawiki.org/
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# http://www.gnu.org/copyleft/gpl.html

/**
 * @file
 * @ingroup Search
 */

/**
 * Search engine hook base class for Postgres
 * @ingroup Search
 */
class SearchPostgres extends SearchEngine {

	function __construct( $db ) {
		$this->db = $db;
	}

	/**
	 * Perform a title-only search query via tsearch2 and return a result set.
	 * Searches the 'titlevector' column.
	 *
	 * @param $term String: raw search term
	 * @return PostgresSearchResultSet, or SearchResultTooMany if the query failed
	 */
	function searchTitle( $term ) {
		return $this->executeTsearchQuery( $term, 'titlevector', 'page_title' );
	}

	/**
	 * Perform a full text search query via tsearch2 and return a result set.
	 * Searches the 'textvector' column.
	 *
	 * @param $term String: raw search term
	 * @return PostgresSearchResultSet, or SearchResultTooMany if the query failed
	 */
	function searchText( $term ) {
		return $this->executeTsearchQuery( $term, 'textvector', 'old_text' );
	}

	/**
	 * Shared implementation of searchTitle() and searchText(): build the
	 * query, run it with error reporting lowered to E_ERROR, and wrap the
	 * result.
	 *
	 * @param $term String: raw search term
	 * @param $vector String: tsvector column to match against
	 * @param $colname String: passed through to searchQuery()
	 * @return PostgresSearchResultSet, or SearchResultTooMany if the query failed
	 */
	private function executeTsearchQuery( $term, $vector, $colname ) {
		$q = $this->searchQuery( $term, $vector, $colname );
		$olderror = error_reporting(E_ERROR);
		$resultSet = $this->db->resultObject( $this->db->query( $q, 'SearchPostgres', true ) );
		error_reporting($olderror);
		if (!$resultSet) {
			// Needed for "Query requires full scan, GIN doesn't support it"
			return new SearchResultTooMany();
		}
		return new PostgresSearchResultSet( $resultSet, $this->searchTerms );
	}


	/**
	 * Transform the user's search string into a better form for tsearch2
	 * Returns an SQL fragment consisting of quoted text to search for.
	 *
	 * @param $term String
	 * @return String
	 */
	function parseQuery( $term ) {

		wfDebug( "parseQuery received: $term \n" );

		## No backslashes allowed
		$term = preg_replace('/\\\/', '', $term);

		## Collapse parens into nearby words:
		$term = preg_replace('/\s*\(\s*/', ' (', $term);
		$term = preg_replace('/\s*\)\s*/', ') ', $term);

		## Treat colons as word separators:
		$term = preg_replace('/:/', ' ', $term);

		$searchstring = '';
		$m = array();
		if( preg_match_all('/([-!]?)(\S+)\s*/', $term, $m, PREG_SET_ORDER ) ) {
			foreach( $m as $terms ) {
				if (strlen($terms[1])) {
					$searchstring .= ' & !';
				}
				if (strtolower($terms[2]) === 'and') {
					$searchstring .= ' & ';
				}
				else if (strtolower($terms[2]) === 'or' or $terms[2] === '|') {
					$searchstring .= ' | ';
				}
				else if (strtolower($terms[2]) === 'not') {
					$searchstring .= ' & !';
				}
				else {
					$searchstring .= " & $terms[2]";
				}
			}
		}

		## Strip out leading junk
		$searchstring = preg_replace('/^[\s\&\|]+/', '', $searchstring);

		## Remove any doubled-up operators
		$searchstring = preg_replace('/([\!\&\|]) +(?:[\&\|] +)+/', "$1 ", $searchstring);

		## Remove any non-spaced operators (e.g. "Zounds!")
		$searchstring = preg_replace('/([^ ])[\!\&\|]/', "$1", $searchstring);

		## Remove any trailing whitespace or operators
		$searchstring = preg_replace('/[\s\!\&\|]+$/', '', $searchstring);

		## Remove unnecessary quotes around everything
		$searchstring = preg_replace('/^[\'"](.*)[\'"]$/', "$1", $searchstring);

		## Quote the whole thing
		$searchstring = $this->db->addQuotes($searchstring);

		wfDebug( "parseQuery returned: $searchstring \n" );

		return $searchstring;

	}

	/**
	 * Construct the full SQL query to do the search.
	 * @param $term String: raw search term
	 * @param $fulltext String: name of the tsvector column to match against
	 * @param $colname String: not used in the query built here
	 * @return String
	 */
	function searchQuery( $term, $fulltext, $colname ) {
		global $wgDBversion;

		if ( !isset( $wgDBversion ) ) {
			$this->db->getServerVersion();
			$wgDBversion = $this->db->numeric_version;
		}
		$prefix = $wgDBversion < 8.3 ? "'default'," : '';

		# Get the SQL fragment for the given term
		$searchstring = $this->parseQuery( $term );

		## We need a separate query here so gin does not complain about empty searches
		$SQL = "SELECT to_tsquery($prefix $searchstring)";
		$res = $this->db->doQuery($SQL);
		if (!$res) {
			## TODO: Better output (example to catch: one 'two)
			die ("Sorry, that was not a valid search string. Please go back and try again");
		}
		$top = pg_fetch_result($res,0,0);

		if ($top === "") { ## e.g. if only stopwords are used XXX return something better
			$query = "SELECT page_id, page_namespace, page_title, 0 AS score ".
				"FROM page p, revision r, pagecontent c WHERE p.page_latest = r.rev_id " .
				"AND r.rev_text_id = c.old_id AND 1=0";
		}
		else {
			$m = array();
			if( preg_match_all("/'([^']+)'/", $top, $m, PREG_SET_ORDER ) ) {
				foreach( $m as $terms ) {
					$this->searchTerms[$terms[1]] = $terms[1];
				}
			}

			$rankscore = $wgDBversion > 8.2 ? 5 : 1;
			$rank = $wgDBversion < 8.3 ? 'rank' : 'ts_rank';
			$query = "SELECT page_id, page_namespace, page_title, ".
			"$rank($fulltext, to_tsquery($prefix $searchstring), $rankscore) AS score ".
			"FROM page p, revision r, pagecontent c WHERE p.page_latest = r.rev_id " .
			"AND r.rev_text_id = c.old_id AND $fulltext @@ to_tsquery($prefix $searchstring)";
		}

		## Redirects
		if (! $this->showRedirects)
			$query .= ' AND page_is_redirect = 0';

		## Namespaces - defaults to 0
		if( !is_null($this->namespaces) ){ // null -> search all
			if ( count($this->namespaces) < 1)
				$query .= ' AND page_namespace = 0';
			else {
				$namespaces = $this->db->makeList( $this->namespaces );
				$query .= " AND page_namespace IN ($namespaces)";
			}
		}

		$query .= " ORDER BY score DESC, page_id DESC";

		$query .= $this->db->limitResult( '', $this->limit, $this->offset );

		wfDebug( "searchQuery returned: $query \n" );

		return $query;
	}

	## Most of the work of these two functions are done automatically via triggers

	/**
	 * Clear the text vector of every stored revision of the page except the
	 * one with the highest rev_text_id, so older revisions are not indexed.
	 *
	 * @param $pageid Integer
	 * @param $title String: unused
	 * @param $text String: unused
	 * @return Boolean: always true
	 */
	function update( $pageid, $title, $text ) {
		## We don't want to index older revisions
		$SQL = "UPDATE pagecontent SET textvector = NULL WHERE old_id IN ".
				"(SELECT rev_text_id FROM revision WHERE rev_page = " . intval( $pageid ) .
				" ORDER BY rev_text_id DESC OFFSET 1)";
		$this->db->doQuery($SQL);
		return true;
	}

	/**
	 * No-op here.
	 * @return Boolean: always true
	 */
	function updateTitle( $id, $title ) {
		return true;
	}

} ## end of the SearchPostgres class

/**
 * Search result that also carries the score column of its row.
 * @ingroup Search
 */
class PostgresSearchResult extends SearchResult {
	function __construct( $row ) {
		parent::__construct($row);
		$this->score = $row->score;
	}
	function getScore() {
		return $this->score;
	}
}

/**
 * Result set wrapper producing PostgresSearchResult objects.
 * @ingroup Search
 */
class PostgresSearchResultSet extends SearchResultSet {
	function __construct( $resultSet, $terms ) {
		$this->mResultSet = $resultSet;
		$this->mTerms = $terms;
	}

	function termMatches() {
		return $this->mTerms;
	}

	function numRows() {
		return $this->mResultSet->numRows();
	}

	/**
	 * @return PostgresSearchResult, or false once the rows are exhausted
	 */
	function next() {
		$row = $this->mResultSet->fetchObject();
		if( $row === false ) {
			return false;
		} else {
			return new PostgresSearchResult( $row );
		}
	}
}
diff --git a/includes/search/SearchUpdate.php b/includes/search/SearchUpdate.php
new file mode 100644
index 0000000000..087a8ba5dc
--- /dev/null
+++ b/includes/search/SearchUpdate.php
@@ -0,0 +1,113
@@ +mId = $id;
+			$this->mText = $text;
+
+			$this->mNamespace = $nt->getNamespace();
+			$this->mTitle = $nt->getText(); # Discard namespace
+
+			$this->mTitleWords = $this->mTextWords = array();
+		} else {
+			wfDebug( "SearchUpdate object created with invalid title '$title'\n" );
+		}
+	}
+
+	/**
+	 * Push this object's title, and its text when there is one, to the search
+	 * engine returned by SearchEngine::create().
+	 *
+	 * When $this->mText is false only updateTitle() is called. Otherwise the
+	 * text goes through $wgContLang->stripForSearch(), is lower-cased and
+	 * cleaned by the regular expressions below, is offered to the
+	 * 'SearchUpdate' hook by reference, and is then passed to update().
+	 *
+	 * @return false when $wgDisableSearchUpdate is set or $this->mId is
+	 *   empty; no value on every other path
+	 */
+	function doUpdate() {
+		global $wgContLang, $wgDisableSearchUpdate;
+
+		if( $wgDisableSearchUpdate || !$this->mId ) {
+			return false;
+		}
+		$fname = 'SearchUpdate::doUpdate';
+		wfProfileIn( $fname );
+
+		$search = SearchEngine::create();
+		# Character-class body used in the patterns below: the engine's legal
+		# search characters plus '&', '#' and ';'
+		$lc = SearchEngine::legalSearchChars() . '&#;';
+
+		# Title-only update: there is no text to process
+		if( $this->mText === false ) {
+			$search->updateTitle($this->mId,
+				Title::indexTitle( $this->mNamespace, $this->mTitle ));
+			wfProfileOut( $fname );
+			return;
+		}
+
+		# Language-specific strip/conversion
+		$text = $wgContLang->stripForSearch( $this->mText );
+
+		wfProfileIn( $fname.'-regexps' );
+		# The text is padded with one space on each side and lower-cased first
+		$text = preg_replace( "/<\\/?\\s*[A-Za-z][A-Za-z0-9]*\\s*([^>]*?)>/",
+			' ', strtolower( " " . $text /*$this->mText*/ . " " ) ); # Strip HTML markup
+		# The heading text (\2) is written three times in the replacement
+		$text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/sD",
+			"\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings
+
+		# Strip external URLs
+		# $uc is the character-class body treated as "part of a URL" below
+		$uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\xA0-\\xFF";
+		$protos = "http|https|ftp|mailto|news|gopher";
+		# Bare URL not preceded by '[': only the characters around it are kept
+		$pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
+		$text = preg_replace( $pat, "\\1 \\3", $text );
+
+		# $p1: [url] is dropped entirely; $p2: [url label] keeps the label (\3)
+		$p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
+		$p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
+		$text = preg_replace( $p1, "\\1 ", $text );
+		$text = preg_replace( $p2, "\\1 \\3 ", $text );
+
+		# Internal image links
+		# Keeps the file name without its extension (\1) and the character
+		# that follows the extension (\3)
+		$pat2 = "/\\[\\[image:([{$uc}]+)\\.(gif|png|jpg|jpeg)([^{$uc}])/i";
+		$text = preg_replace( $pat2, " \\1 \\3", $text );
+
+		$text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
+			"\\1\\2 \\2\\3", $text ); # Handle [[game]]s
+
+		# Strip all remaining non-search characters
+		$text = preg_replace( "/[^{$lc}]+/", " ", $text );
+
+		# Handle 's, s'
+		#
+		# $text = preg_replace( "/([{$lc}]+)'s /", "\\1 \\1's ", $text );
+		# $text = preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text );
+		#
+		# These tail-anchored regexps are insanely slow. The worst case comes
+		# when Japanese or Chinese text (ie, no word spacing) is written on
+		# a wiki configured for Western UTF-8 mode. The Unicode characters are
+		# expanded to hex codes and the "words" are very long paragraph-length
+		# monstrosities. On a large page the above regexps may take over 20
+		# seconds *each* on a 1GHz-level processor.
+		#
+		# Following are reversed versions which are consistently fast
+		# (about 3 milliseconds on 1GHz-level processor).
+		#
+		# Each call reverses the text, applies the mirrored pattern, and
+		# reverses the result back.
+		$text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
+		$text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );
+
+		# Strip wiki '' and '''
+		$text = preg_replace( "/''[']*/", " ", $text );
+		wfProfileOut( "$fname-regexps" );
+
+		# $text is passed by reference, so hook handlers can change what is indexed
+		wfRunHooks( 'SearchUpdate', array( $this->mId, $this->mNamespace, $this->mTitle, &$text ) );
+
+		# Perform the actual update
+		$search->update($this->mId, Title::indexTitle( $this->mNamespace, $this->mTitle ),
+			$text);
+
+		wfProfileOut( $fname );
+	}
+}
+
+/**
+ * Placeholder class
+ * Declares no members of its own.
+ * @ingroup Search
+ */
+class SearchUpdateMyISAM extends SearchUpdate {
+	# Inherits everything
+}