includes/search/SearchEngine.php

   1 <?php
   2 /**
   3  * Basic search engine
   4  *
   5  * @file
   6  * @ingroup Search
   7  */
   8
   9 /**
  10  * @defgroup Search Search
  11  */
  12
  13 /**
  14  * Base class for search engines; the search and index-update methods are stubs
  15  * @ingroup Search
  16  */
  17 class SearchEngine {
  18         var $limit = 10;
  19         var $offset = 0;
  20         var $prefix = '';
  21         var $searchTerms = array();
  22         var $namespaces = array( NS_MAIN );
  23         var $showRedirects = false;
  24
  25         /**
  26          * Perform a full text search query and return a result set.
  27          * If title searches are not supported or disabled, return null.
  28          * STUB
  29          *
  30          * @param $term String: raw search term
  31          * @return SearchResultSet
  32          */
  33         function searchText( $term ) {
  34                 return null;
  35         }
  36
  37         /**
  38          * Perform a title-only search query and return a result set.
  39          * If title searches are not supported or disabled, return null.
  40          * STUB
  41          *
  42          * @param $term String: raw search term
  43          * @return SearchResultSet
  44          */
  45         function searchTitle( $term ) {
  46                 return null;
  47         }
  48
  49         /** If this search backend can list/unlist redirects */
  50         function acceptListRedirects() {
  51                 return true;
  52         }
  53
  54         /**
  55          * When overridden in derived class, performs database-specific conversions
  56          * on text to be used for searching or updating search index.
  57          * Default implementation does nothing (simply returns $string).
  58          *
  59          * @param $string string: String to process
  60          * @return string
  61          */
  62         public function normalizeText( $string ) {
  63                 global $wgContLang;
  64
  65                 // Some languages such as Chinese require word segmentation
  66                 return $wgContLang->segmentByWord( $string );
  67         }
  68
  69         /**
  70          * Transform search term in cases when parts of the query came as different GET params (when supported)
  71          * e.g. for prefix queries: search=test&prefix=Main_Page/Archive -> test prefix:Main Page/Archive
  72          */
  73         function transformSearchTerm( $term ) {
  74                 return $term;
  75         }
  76
  77         /**
  78          * If an exact title match can be found, or a very slightly close match,
  79          * return the title. If no match, returns NULL.
  80          *
  81          * @param $searchterm String
  82          * @return Title
  83          */
  84         public static function getNearMatch( $searchterm ) {
  85                 $title = self::getNearMatchInternal( $searchterm );
  86
  87                 wfRunHooks( 'SearchGetNearMatchComplete', array( $searchterm, &$title ) );
  88                 return $title;
  89         }
  90
  91         /**
  92          * Do a near match (see SearchEngine::getNearMatch) and wrap it into a
  93          * SearchResultSet.
  94          *
  95          * @param $searchterm string
  96          * @return SearchResultSet
  97          */
  98         public static function getNearMatchResultSet( $searchterm ) {
  99                 return new SearchNearMatchResultSet( self::getNearMatch( $searchterm ) );
 100         }
 101
 102         /**
 103          * Really find the title match.
 104          */
 105         private static function getNearMatchInternal( $searchterm ) {
 106                 global $wgContLang;
 107
 108                 $allSearchTerms = array( $searchterm );
 109
 110                 if ( $wgContLang->hasVariants() ) {
 111                         $allSearchTerms = array_merge( $allSearchTerms, $wgContLang->convertLinkToAllVariants( $searchterm ) );
 112                 }
 113
 114                 if ( !wfRunHooks( 'SearchGetNearMatchBefore', array( $allSearchTerms, &$titleResult ) ) ) {
 115                         return $titleResult;
 116                 }
 117
 118                 foreach ( $allSearchTerms as $term ) {
 119
 120                         # Exact match? No need to look further.
 121                         $title = Title::newFromText( $term );
 122                         if ( is_null( $title ) )
 123                                 return null;
 124
 125                         if ( $title->getNamespace() == NS_SPECIAL || $title->isExternal() || $title->exists() ) {
 126                                 return $title;
 127                         }
 128
 129                         # See if it still otherwise has content is some sane sense
 130                         $article = MediaWiki::articleFromTitle( $title );
 131                         if ( $article->hasViewableContent() ) {
 132                                 return $title;
 133                         }
 134
 135                         # Now try all lower case (i.e. first letter capitalized)
 136                         #
 137                         $title = Title::newFromText( $wgContLang->lc( $term ) );
 138                         if ( $title && $title->exists() ) {
 139                                 return $title;
 140                         }
 141
 142                         # Now try capitalized string
 143                         #
 144                         $title = Title::newFromText( $wgContLang->ucwords( $term ) );
 145                         if ( $title && $title->exists() ) {
 146                                 return $title;
 147                         }
 148
 149                         # Now try all upper case
 150                         #
 151                         $title = Title::newFromText( $wgContLang->uc( $term ) );
 152                         if ( $title && $title->exists() ) {
 153                                 return $title;
 154                         }
 155
 156                         # Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
 157                         $title = Title::newFromText( $wgContLang->ucwordbreaks( $term ) );
 158                         if ( $title && $title->exists() ) {
 159                                 return $title;
 160                         }
 161
 162                         // Give hooks a chance at better match variants
 163                         $title = null;
 164                         if ( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
 165                                 return $title;
 166                         }
 167                 }
 168
 169                 $title = Title::newFromText( $searchterm );
 170
 171                 # Entering an IP address goes to the contributions page
 172                 if ( ( $title->getNamespace() == NS_USER && User::isIP( $title->getText() ) )
 173                         || User::isIP( trim( $searchterm ) ) ) {
 174                         return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
 175                 }
 176
 177
 178                 # Entering a user goes to the user page whether it's there or not
 179                 if ( $title->getNamespace() == NS_USER ) {
 180                         return $title;
 181                 }
 182
 183                 # Go to images that exist even if there's no local page.
 184                 # There may have been a funny upload, or it may be on a shared
 185                 # file repository such as Wikimedia Commons.
 186                 if ( $title->getNamespace() == NS_FILE ) {
 187                         $image = wfFindFile( $title );
 188                         if ( $image ) {
 189                                 return $title;
 190                         }
 191                 }
 192
 193                 # MediaWiki namespace? Page may be "implied" if not customized.
 194                 # Just return it, with caps forced as the message system likes it.
 195                 if ( $title->getNamespace() == NS_MEDIAWIKI ) {
 196                         return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
 197                 }
 198
 199                 # Quoted term? Try without the quotes...
 200                 $matches = array();
 201                 if ( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
 202                         return SearchEngine::getNearMatch( $matches[1] );
 203                 }
 204
 205                 return null;
 206         }
 207
 208         public static function legalSearchChars() {
 209                 return "A-Za-z_'.0-9\\x80-\\xFF\\-";
 210         }
 211
 212         /**
 213          * Set the maximum number of results to return
 214          * and how many to skip before returning the first.
 215          *
 216          * @param $limit Integer
 217          * @param $offset Integer
 218          */
 219         function setLimitOffset( $limit, $offset = 0 ) {
 220                 $this->limit = intval( $limit );
 221                 $this->offset = intval( $offset );
 222         }
 223
 224         /**
 225          * Set which namespaces the search should include.
 226          * Give an array of namespace index numbers.
 227          *
 228          * @param $namespaces Array
 229          */
 230         function setNamespaces( $namespaces ) {
 231                 $this->namespaces = $namespaces;
 232         }
 233
 234         /**
 235          * Parse some common prefixes: all (search everything)
 236          * or namespace names
 237          *
 238          * @param $query String
 239          */
 240         function replacePrefixes( $query ) {
 241                 global $wgContLang;
 242
 243                 $parsed = $query;
 244                 if ( strpos( $query, ':' ) === false ) { // nothing to do
 245                         wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );
 246                         return $parsed;
 247                 }
 248
 249                 $allkeyword = wfMsgForContent( 'searchall' ) . ":";
 250                 if ( strncmp( $query, $allkeyword, strlen( $allkeyword ) ) == 0 ) {
 251                         $this->namespaces = null;
 252                         $parsed = substr( $query, strlen( $allkeyword ) );
 253                 } else if ( strpos( $query, ':' ) !== false ) {
 254                         $prefix = substr( $query, 0, strpos( $query, ':' ) );
 255                         $index = $wgContLang->getNsIndex( $prefix );
 256                         if ( $index !== false ) {
 257                                 $this->namespaces = array( $index );
 258                                 $parsed = substr( $query, strlen( $prefix ) + 1 );
 259                         }
 260                 }
 261                 if ( trim( $parsed ) == '' )
 262                         $parsed = $query; // prefix was the whole query
 263
 264                 wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );
 265
 266                 return $parsed;
 267         }
 268
 269         /**
 270          * Make a list of searchable namespaces and their canonical names.
 271          * @return Array
 272          */
 273         public static function searchableNamespaces() {
 274                 global $wgContLang;
 275                 $arr = array();
 276                 foreach ( $wgContLang->getNamespaces() as $ns => $name ) {
 277                         if ( $ns >= NS_MAIN ) {
 278                                 $arr[$ns] = $name;
 279                         }
 280                 }
 281
 282                 wfRunHooks( 'SearchableNamespaces', array( &$arr ) );
 283                 return $arr;
 284         }
 285
 286         /**
 287          * Extract default namespaces to search from the given user's
 288          * settings, returning a list of index numbers.
 289          *
 290          * @param $user User
 291          * @return Array
 292          */
 293         public static function userNamespaces( $user ) {
 294                 global $wgSearchEverythingOnlyLoggedIn;
 295
 296                 // get search everything preference, that can be set to be read for logged-in users
 297                 $searcheverything = false;
 298                 if ( ( $wgSearchEverythingOnlyLoggedIn && $user->isLoggedIn() )
 299                     || !$wgSearchEverythingOnlyLoggedIn )
 300                         $searcheverything = $user->getOption( 'searcheverything' );
 301
 302                 // searcheverything overrides other options
 303                 if ( $searcheverything )
 304                         return array_keys( SearchEngine::searchableNamespaces() );
 305
 306                 $arr = Preferences::loadOldSearchNs( $user );
 307                 $searchableNamespaces = SearchEngine::searchableNamespaces();
 308
 309                 $arr = array_intersect( $arr, array_keys( $searchableNamespaces ) ); // Filter
 310
 311                 return $arr;
 312         }
 313
 314         /**
 315          * Find snippet highlight settings for a given user
 316          *
 317          * @param $user User
 318          * @return Array contextlines, contextchars
 319          */
 320         public static function userHighlightPrefs( &$user ) {
 321                 // $contextlines = $user->getOption( 'contextlines',  5 );
 322                 // $contextchars = $user->getOption( 'contextchars', 50 );
 323                 $contextlines = 2; // Hardcode this. Old defaults sucked. :)
 324                 $contextchars = 75; // same as above.... :P
 325                 return array( $contextlines, $contextchars );
 326         }
 327
 328         /**
 329          * An array of namespaces indexes to be searched by default
 330          *
 331          * @return Array
 332          */
 333         public static function defaultNamespaces() {
 334                 global $wgNamespacesToBeSearchedDefault;
 335
 336                 return array_keys( $wgNamespacesToBeSearchedDefault, true );
 337         }
 338
 339         /**
 340          * Get a list of namespace names useful for showing in tooltips
 341          * and preferences
 342          *
 343          * @param $namespaces Array
 344          */
 345         public static function namespacesAsText( $namespaces ) {
 346                 global $wgContLang;
 347
 348                 $formatted = array_map( array( $wgContLang, 'getFormattedNsText' ), $namespaces );
 349                 foreach ( $formatted as $key => $ns ) {
 350                         if ( empty( $ns ) )
 351                                 $formatted[$key] = wfMsg( 'blanknamespace' );
 352                 }
 353                 return $formatted;
 354         }
 355
 356         /**
 357          * Return the help namespaces to be shown on Special:Search
 358          *
 359          * @return Array
 360          */
 361         public static function helpNamespaces() {
 362                 global $wgNamespacesToBeSearchedHelp;
 363
 364                 return array_keys( $wgNamespacesToBeSearchedHelp, true );
 365         }
 366
 367         /**
 368          * Return a 'cleaned up' search string
 369          *
 370          * @param $text String
 371          * @return String
 372          */
 373         function filter( $text ) {
 374                 $lc = $this->legalSearchChars();
 375                 return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
 376         }
 377         /**
 378          * Load up the appropriate search engine class for the currently
 379          * active database backend, and return a configured instance.
 380          *
 381          * @return SearchEngine
 382          */
 383         public static function create() {
 384                 global $wgSearchType;
 385                 $dbr = wfGetDB( DB_SLAVE );
 386                 if ( $wgSearchType ) {
 387                         $class = $wgSearchType;
 388                 } else {
 389                         $class = $dbr->getSearchEngine();
 390                 }
 391                 $search = new $class( $dbr );
 392                 $search->setLimitOffset( 0, 0 );
 393                 return $search;
 394         }
 395
 396         /**
 397          * Create or update the search index record for the given page.
 398          * Title and text should be pre-processed.
 399          * STUB
 400          *
 401          * @param $id Integer
 402          * @param $title String
 403          * @param $text String
 404          */
 405         function update( $id, $title, $text ) {
 406                 // no-op
 407         }
 408
 409         /**
 410          * Update a search index record's title only.
 411          * Title should be pre-processed.
 412          * STUB
 413          *
 414          * @param $id Integer
 415          * @param $title String
 416          */
 417         function updateTitle( $id, $title ) {
 418                 // no-op
 419         }
 420
 421         /**
 422          * Get OpenSearch suggestion template
 423          *
 424          * @return String
 425          */
 426         public static function getOpenSearchTemplate() {
 427                 global $wgOpenSearchTemplate, $wgServer, $wgScriptPath;
 428                 if ( $wgOpenSearchTemplate )    {
 429                         return $wgOpenSearchTemplate;
 430                 } else {
 431                         $ns = implode( '|', SearchEngine::defaultNamespaces() );
 432                         if ( !$ns ) $ns = "0";
 433                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace=' . $ns;
 434                 }
 435         }
 436
 437         /**
 438          * Get internal MediaWiki Suggest template
 439          *
 440          * @return String
 441          */
 442         public static function getMWSuggestTemplate() {
 443                 global $wgMWSuggestTemplate, $wgServer, $wgScriptPath;
 444                 if ( $wgMWSuggestTemplate )
 445                         return $wgMWSuggestTemplate;
 446                 else
 447                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}&suggest';
 448         }
 449 }
 450
 451 /**
 452  * @ingroup Search
 453  */
 454 class SearchResultSet {
 455         /**
 456          * Fetch an array of regular expression fragments for matching
 457          * the search terms as parsed by this engine in a text extract.
 458          * STUB
 459          *
 460          * @return Array
 461          */
 462         function termMatches() {
 463                 return array();
 464         }
 465
 466         function numRows() {
 467                 return 0;
 468         }
 469
 470         /**
 471          * Return true if results are included in this result set.
 472          * STUB
 473          *
 474          * @return Boolean
 475          */
 476         function hasResults() {
 477                 return false;
 478         }
 479
 480         /**
 481          * Some search modes return a total hit count for the query
 482          * in the entire article database. This may include pages
 483          * in namespaces that would not be matched on the given
 484          * settings.
 485          *
 486          * Return null if no total hits number is supported.
 487          *
 488          * @return Integer
 489          */
 490         function getTotalHits() {
 491                 return null;
 492         }
 493
 494         /**
 495          * Some search modes return a suggested alternate term if there are
 496          * no exact hits. Returns true if there is one on this set.
 497          *
 498          * @return Boolean
 499          */
 500         function hasSuggestion() {
 501                 return false;
 502         }
 503
 504         /**
 505          * @return String: suggested query, null if none
 506          */
 507         function getSuggestionQuery() {
 508                 return null;
 509         }
 510
 511         /**
 512          * @return String: HTML highlighted suggested query, '' if none
 513          */
 514         function getSuggestionSnippet() {
 515                 return '';
 516         }
 517
 518         /**
 519          * Return information about how and from where the results were fetched,
 520          * should be useful for diagnostics and debugging
 521          *
 522          * @return String
 523          */
 524         function getInfo() {
 525                 return null;
 526         }
 527
 528         /**
 529          * Return a result set of hits on other (multiple) wikis associated with this one
 530          *
 531          * @return SearchResultSet
 532          */
 533         function getInterwikiResults() {
 534                 return null;
 535         }
 536
 537         /**
 538          * Check if there are results on other wikis
 539          *
 540          * @return Boolean
 541          */
 542         function hasInterwikiResults() {
 543                 return $this->getInterwikiResults() != null;
 544         }
 545
 546         /**
 547          * Fetches next search result, or false.
 548          * STUB
 549          *
 550          * @return SearchResult
 551          */
 552         function next() {
 553                 return false;
 554         }
 555
 556         /**
 557          * Frees the result set, if applicable.
 558          */
 559         function free() {
 560                 // ...
 561         }
 562 }
 563
 564 /**
 565  * This class is used for different SQL-based search engines shipped with MediaWiki
 566  */
 567 class SqlSearchResultSet extends SearchResultSet {
 568         function __construct( $resultSet, $terms ) {
 569                 $this->mResultSet = $resultSet;
 570                 $this->mTerms = $terms;
 571         }
 572
 573         function termMatches() {
 574                 return $this->mTerms;
 575         }
 576
 577         function numRows() {
 578                 if ( $this->mResultSet === false )
 579                         return false;
 580
 581                 return $this->mResultSet->numRows();
 582         }
 583
 584         function next() {
 585                 if ( $this->mResultSet === false )
 586                         return false;
 587
 588                 $row = $this->mResultSet->fetchObject();
 589                 if ( $row === false )
 590                         return false;
 591
 592                 return SearchResult::newFromRow( $row );
 593         }
 594
 595         function free() {
 596                 if ( $this->mResultSet === false )
 597                         return false;
 598
 599                 $this->mResultSet->free();
 600         }
 601 }
 602
 603 /**
 604  * @ingroup Search
 605  */
 606 class SearchResultTooMany {
 607         # # Some search engines may bail out if too many matches are found
 608 }
 609
 610
 611 /**
 612  * @todo Fixme: This class is horribly factored. It would probably be better to
 613  * have a useful base class to which you pass some standard information, then
 614  * let the fancy self-highlighters extend that.
 615  * @ingroup Search
 616  */
 617 class SearchResult {
 618         var $mRevision = null;
 619         var $mImage = null;
 620
 621         /**
 622          * Return a new SearchResult and initializes it with a title.
 623          *
 624          * @param $title Title
 625          * @return SearchResult
 626          */
 627         public static function newFromTitle( $title ) {
 628                 $result = new self();
 629                 $result->initFromTitle( $title );
 630                 return $result;
 631         }
 632         /**
 633          * Return a new SearchResult and initializes it with a row.
 634          *
 635          * @param $row object
 636          * @return SearchResult
 637          */
 638         public static function newFromRow( $row ) {
 639                 $result = new self();
 640                 $result->initFromRow( $row );
 641                 return $result;
 642         }
 643
 644         public function __construct( $row = null ) {
 645                 if ( !is_null( $row ) ) {
 646                         // Backwards compatibility with pre-1.17 callers
 647                         $this->initFromRow( $row );
 648                 }
 649         }
 650
 651         /**
 652          * Initialize from a database row. Makes a Title and passes that to
 653          * initFromTitle.
 654          *
 655          * @param $row object
 656          */
 657         protected function initFromRow( $row ) {
 658                 $this->initFromTitle( Title::makeTitle( $row->page_namespace, $row->page_title ) );
 659         }
 660
 661         /**
 662          * Initialize from a Title and if possible initializes a corresponding
 663          * Revision and File.
 664          *
 665          * @param $title Title
 666          */
 667         protected function initFromTitle( $title ) {
 668                 $this->mTitle = $title;
 669                 if ( !is_null( $this->mTitle ) ) {
 670                         $this->mRevision = Revision::newFromTitle( $this->mTitle );
 671                         if ( $this->mTitle->getNamespace() === NS_FILE )
 672                                 $this->mImage = wfFindFile( $this->mTitle );
 673                 }
 674         }
 675
 676         /**
 677          * Check if this is result points to an invalid title
 678          *
 679          * @return Boolean
 680          */
 681         function isBrokenTitle() {
 682                 if ( is_null( $this->mTitle ) )
 683                         return true;
 684                 return false;
 685         }
 686
 687         /**
 688          * Check if target page is missing, happens when index is out of date
 689          *
 690          * @return Boolean
 691          */
 692         function isMissingRevision() {
 693                 return !$this->mRevision && !$this->mImage;
 694         }
 695
 696         /**
 697          * @return Title
 698          */
 699         function getTitle() {
 700                 return $this->mTitle;
 701         }
 702
 703         /**
 704          * @return Double or null if not supported
 705          */
 706         function getScore() {
 707                 return null;
 708         }
 709
 710         /**
 711          * Lazy initialization of article text from DB
 712          */
 713         protected function initText() {
 714                 if ( !isset( $this->mText ) ) {
 715                         if ( $this->mRevision != null )
 716                                 $this->mText = $this->mRevision->getText();
 717                         else // TODO: can we fetch raw wikitext for commons images?
 718                                 $this->mText = '';
 719
 720                 }
 721         }
 722
 723         /**
 724          * @param $terms Array: terms to highlight
 725          * @return String: highlighted text snippet, null (and not '') if not supported
 726          */
 727         function getTextSnippet( $terms ) {
 728                 global $wgUser, $wgAdvancedSearchHighlighting;
 729                 $this->initText();
 730                 list( $contextlines, $contextchars ) = SearchEngine::userHighlightPrefs( $wgUser );
 731                 $h = new SearchHighlighter();
 732                 if ( $wgAdvancedSearchHighlighting )
 733                         return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
 734                 else
 735                         return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
 736         }
 737
 738         /**
 739          * @param $terms Array: terms to highlight
 740          * @return String: highlighted title, '' if not supported
 741          */
 742         function getTitleSnippet( $terms ) {
 743                 return '';
 744         }
 745
 746         /**
 747          * @param $terms Array: terms to highlight
 748          * @return String: highlighted redirect name (redirect to this page), '' if none or not supported
 749          */
 750         function getRedirectSnippet( $terms ) {
 751                 return '';
 752         }
 753
 754         /**
 755          * @return Title object for the redirect to this page, null if none or not supported
 756          */
 757         function getRedirectTitle() {
 758                 return null;
 759         }
 760
 761         /**
 762          * @return string highlighted relevant section name, null if none or not supported
 763          */
 764         function getSectionSnippet() {
 765                 return '';
 766         }
 767
 768         /**
 769          * @return Title object (pagename+fragment) for the section, null if none or not supported
 770          */
 771         function getSectionTitle() {
 772                 return null;
 773         }
 774
 775         /**
 776          * @return String: timestamp
 777          */
 778         function getTimestamp() {
 779                 if ( $this->mRevision )
 780                         return $this->mRevision->getTimestamp();
 781                 else if ( $this->mImage )
 782                         return $this->mImage->getTimestamp();
 783                 return '';
 784         }
 785
 786         /**
 787          * @return Integer: number of words
 788          */
 789         function getWordCount() {
 790                 $this->initText();
 791                 return str_word_count( $this->mText );
 792         }
 793
 794         /**
 795          * @return Integer: size in bytes
 796          */
 797         function getByteSize() {
 798                 $this->initText();
 799                 return strlen( $this->mText );
 800         }
 801
 802         /**
 803          * @return Boolean if hit has related articles
 804          */
 805         function hasRelated() {
 806                 return false;
 807         }
 808
 809         /**
 810          * @return String: interwiki prefix of the title (return iw even if title is broken)
 811          */
 812         function getInterwikiPrefix() {
 813                 return '';
 814         }
 815 }
 816 /**
 817  * A SearchResultSet wrapper for SearchEngine::getNearMatch
 818  */
 819 class SearchNearMatchResultSet extends SearchResultSet {
 820         private $fetched = false;
 821         /**
 822          * @param $match mixed Title if matched, else null
 823          */
 824         public function __construct( $match ) {
 825                 $this->result = $match;
 826         }
 827         public function hasResult() {
 828                 return (bool)$this->result;
 829         }
 830         public function numRows() {
 831                 return $this->hasResults() ? 1 : 0;
 832         }
 833         public function next() {
 834                 if ( $this->fetched || !$this->result ) {
 835                         return false;
 836                 }
 837                 $this->fetched = true;
 838                 return SearchResult::newFromTitle( $this->result );
 839         }
 840 }
 841
 842 /**
 843  * Highlight bits of wikitext
 844  *
 845  * @ingroup Search
 846  */
 847 class SearchHighlighter {
 848         var $mCleanWikitext = true;
 849
 850         function SearchHighlighter( $cleanupWikitext = true ) {
 851                 $this->mCleanWikitext = $cleanupWikitext;
 852         }
 853
 854         /**
 855          * Default implementation of wikitext highlighting
 856          *
 857          * @param $text String
 858          * @param $terms Array: terms to highlight (unescaped)
 859          * @param $contextlines Integer
 860          * @param $contextchars Integer
 861          * @return String
 862          */
 863         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
 864                 global $wgContLang;
 865                 global $wgSearchHighlightBoundaries;
 866                 $fname = __METHOD__;
 867
 868                 if ( $text == '' )
 869                         return '';
 870
 871                 // spli text into text + templates/links/tables
 872                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
 873                 // first capture group is for detecting nested templates/links/tables/references
 874                 $endPatterns = array(
 875                         1 => '/(\{\{)|(\}\})/', // template
 876                         2 => '/(\[\[)|(\]\])/', // image
 877                         3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table
 878
 879                 // FIXME: this should prolly be a hook or something
 880                 if ( function_exists( 'wfCite' ) ) {
 881                         $spat .= '|(<ref>)'; // references via cite extension
 882                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
 883                 }
 884                 $spat .= '/';
 885                 $textExt = array(); // text extracts
 886                 $otherExt = array();  // other extracts
 887                 wfProfileIn( "$fname-split" );
 888                 $start = 0;
 889                 $textLen = strlen( $text );
 890                 $count = 0; // sequence number to maintain ordering
 891                 while ( $start < $textLen ) {
 892                         // find start of template/image/table
 893                         if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
 894                                 $epat = '';
 895                                 foreach ( $matches as $key => $val ) {
 896                                         if ( $key > 0 && $val[1] != - 1 ) {
 897                                                 if ( $key == 2 ) {
 898                                                         // see if this is an image link
 899                                                         $ns = substr( $val[0], 2, - 1 );
 900                                                         if ( $wgContLang->getNsIndex( $ns ) != NS_FILE )
 901                                                                 break;
 902
 903                                                 }
 904                                                 $epat = $endPatterns[$key];
 905                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
 906                                                 $start = $val[1];
 907                                                 break;
 908                                         }
 909                                 }
 910                                 if ( $epat ) {
 911                                         // find end (and detect any nested elements)
 912                                         $level = 0;
 913                                         $offset = $start + 1;
 914                                         $found = false;
 915                                         while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
 916                                                 if ( array_key_exists( 2, $endMatches ) ) {
 917                                                         // found end
 918                                                         if ( $level == 0 ) {
 919                                                                 $len = strlen( $endMatches[2][0] );
 920                                                                 $off = $endMatches[2][1];
 921                                                                 $this->splitAndAdd( $otherExt, $count,
 922                                                                         substr( $text, $start, $off + $len  - $start ) );
 923                                                                 $start = $off + $len;
 924                                                                 $found = true;
 925                                                                 break;
 926                                                         } else {
 927                                                                 // end of nested element
 928                                                                 $level -= 1;
 929                                                         }
 930                                                 } else {
 931                                                         // nested
 932                                                         $level += 1;
 933                                                 }
 934                                                 $offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
 935                                         }
 936                                         if ( ! $found ) {
 937                                                 // couldn't find appropriate closing tag, skip
 938                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
 939                                                 $start += strlen( $matches[0][0] );
 940                                         }
 941                                         continue;
 942                                 }
 943                         }
 944                         // else: add as text extract
 945                         $this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
 946                         break;
 947                 }
 948
 949                 $all = $textExt + $otherExt; // these have disjunct key sets
 950
 951                 wfProfileOut( "$fname-split" );
 952
 953                 // prepare regexps
 954                 foreach ( $terms as $index => $term ) {
 955                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
 956                         if ( preg_match( '/[\x80-\xff]/', $term ) ) {
 957                                 $terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $terms[$index] );
 958                         } else {
 959                                 $terms[$index] = $term;
 960                         }
 961                 }
 962                 $anyterm = implode( '|', $terms );
 963                 $phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
 964
 965                 // FIXME: a hack to scale contextchars, a correct solution
 966                 // would be to have contextchars actually be char and not byte
 967                 // length, and do proper utf-8 substrings and lengths everywhere,
 968                 // but PHP is making that very hard and unclean to implement :(
 969                 $scale = strlen( $anyterm ) / mb_strlen( $anyterm );
 970                 $contextchars = intval( $contextchars * $scale );
 971
 972                 $patPre = "(^|$wgSearchHighlightBoundaries)";
 973                 $patPost = "($wgSearchHighlightBoundaries|$)";
 974
 975                 $pat1 = "/(" . $phrase . ")/ui";
 976                 $pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";
 977
 978                 wfProfileIn( "$fname-extract" );
 979
 980                 $left = $contextlines;
 981
 982                 $snippets = array();
 983                 $offsets = array();
 984
 985                 // show beginning only if it contains all words
 986                 $first = 0;
 987                 $firstText = '';
 988                 foreach ( $textExt as $index => $line ) {
 989                         if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
 990                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
 991                                 $first = $index;
 992                                 break;
 993                         }
 994                 }
 995                 if ( $firstText ) {
 996                         $succ = true;
 997                         // check if first text contains all terms
 998                         foreach ( $terms as $term ) {
 999                                 if ( ! preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
1000                                         $succ = false;
1001                                         break;
1002                                 }
1003                         }
1004                         if ( $succ ) {
1005                                 $snippets[$first] = $firstText;
1006                                 $offsets[$first] = 0;
1007                         }
1008                 }
1009                 if ( ! $snippets ) {
1010                         // match whole query on text
1011                         $this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
1012                         // match whole query on templates/tables/images
1013                         $this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
1014                         // match any words on text
1015                         $this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
1016                         // match any words on templates/tables/images
1017                         $this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );
1018
1019                         ksort( $snippets );
1020                 }
1021
1022                 // add extra chars to each snippet to make snippets constant size
1023                 $extended = array();
1024                 if ( count( $snippets ) == 0 ) {
1025                         // couldn't find the target words, just show beginning of article
1026                         if ( array_key_exists( $first, $all ) ) {
1027                                 $targetchars = $contextchars * $contextlines;
1028                                 $snippets[$first] = '';
1029                                 $offsets[$first] = 0;
1030                         }
1031                 } else {
1032                         // if begin of the article contains the whole phrase, show only that !!
1033                         if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
1034                             && $offsets[$first] < $contextchars * 2 ) {
1035                                 $snippets = array ( $first => $snippets[$first] );
1036                         }
1037
1038                         // calc by how much to extend existing snippets
1039                         $targetchars = intval( ( $contextchars * $contextlines ) / count ( $snippets ) );
1040                 }
1041
1042                 foreach ( $snippets as $index => $line ) {
1043                         $extended[$index] = $line;
1044                         $len = strlen( $line );
1045                         if ( $len < $targetchars - 20 ) {
1046                                 // complete this line
1047                                 if ( $len < strlen( $all[$index] ) ) {
1048                                         $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index] + $targetchars, $offsets[$index] );
1049                                         $len = strlen( $extended[$index] );
1050                                 }
1051
1052                                 // add more lines
1053                                 $add = $index + 1;
1054                                 while ( $len < $targetchars - 20
1055                                        && array_key_exists( $add, $all )
1056                                        && !array_key_exists( $add, $snippets ) ) {
1057                                     $offsets[$add] = 0;
1058                                     $tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
1059                                         $extended[$add] = $tt;
1060                                         $len += strlen( $tt );
1061                                         $add++;
1062                                 }
1063                         }
1064                 }
1065
1066                 // $snippets = array_map('htmlspecialchars', $extended);
1067                 $snippets = $extended;
1068                 $last = - 1;
1069                 $extract = '';
1070                 foreach ( $snippets as $index => $line ) {
1071                         if ( $last == - 1 )
1072                                 $extract .= $line; // first line
1073                         elseif ( $last + 1 == $index && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] ) )
1074                                 $extract .= " " . $line; // continous lines
1075                         else
1076                                 $extract .= '<b> ... </b>' . $line;
1077
1078                         $last = $index;
1079                 }
1080                 if ( $extract )
1081                         $extract .= '<b> ... </b>';
1082
1083                 $processed = array();
1084                 foreach ( $terms as $term ) {
1085                         if ( ! isset( $processed[$term] ) ) {
1086                                 $pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
1087                                 $extract = preg_replace( $pat3,
1088                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
1089                                 $processed[$term] = true;
1090                         }
1091                 }
1092
1093                 wfProfileOut( "$fname-extract" );
1094
1095                 return $extract;
1096         }
1097
1098         /**
1099          * Split text into lines and add it to extracts array
1100          *
1101          * @param $extracts Array: index -> $line
1102          * @param $count Integer
1103          * @param $text String
1104          */
1105         function splitAndAdd( &$extracts, &$count, $text ) {
1106                 $split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
1107                 foreach ( $split as $line ) {
1108                         $tt = trim( $line );
1109                         if ( $tt )
1110                                 $extracts[$count++] = $tt;
1111                 }
1112         }
1113
1114         /**
1115          * Do manual case conversion for non-ascii chars
1116          *
1117          * @param $matches Array
1118          */
1119         function caseCallback( $matches ) {
1120                 global $wgContLang;
1121                 if ( strlen( $matches[0] ) > 1 ) {
1122                         return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
1123                 } else
1124                         return $matches[0];
1125         }
1126
1127         /**
1128          * Extract part of the text from start to end, but by
1129          * not chopping up words
1130          * @param $text String
1131          * @param $start Integer
1132          * @param $end Integer
1133          * @param $posStart Integer: (out) actual start position
1134          * @param $posEnd Integer: (out) actual end position
1135          * @return String
1136          */
1137         function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
1138                 if ( $start != 0 )
1139                         $start = $this->position( $text, $start, 1 );
1140                 if ( $end >= strlen( $text ) )
1141                         $end = strlen( $text );
1142                 else
1143                         $end = $this->position( $text, $end );
1144
1145                 if ( !is_null( $posStart ) )
1146                         $posStart = $start;
1147                 if ( !is_null( $posEnd ) )
1148                         $posEnd = $end;
1149
1150                 if ( $end > $start )
1151                         return substr( $text, $start, $end - $start );
1152                 else
1153                         return '';
1154         }
1155
1156         /**
1157          * Find a nonletter near a point (index) in the text
1158          *
1159          * @param $text String
1160          * @param $point Integer
1161          * @param $offset Integer: offset to found index
1162          * @return Integer: nearest nonletter index, or beginning of utf8 char if none
1163          */
1164         function position( $text, $point, $offset = 0 ) {
1165                 $tolerance = 10;
1166                 $s = max( 0, $point - $tolerance );
1167                 $l = min( strlen( $text ), $point + $tolerance ) - $s;
1168                 $m = array();
1169                 if ( preg_match( '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr( $text, $s, $l ), $m, PREG_OFFSET_CAPTURE ) ) {
1170                         return $m[0][1] + $s + $offset;
1171                 } else {
1172                         // check if point is on a valid first UTF8 char
1173                         $char = ord( $text[$point] );
1174                         while ( $char >= 0x80 && $char < 0xc0 ) {
1175                                 // skip trailing bytes
1176                                 $point++;
1177                                 if ( $point >= strlen( $text ) )
1178                                         return strlen( $text );
1179                                 $char = ord( $text[$point] );
1180                         }
1181                         return $point;
1182
1183                 }
1184         }
1185
1186         /**
1187          * Search extracts for a pattern, and return snippets
1188          *
1189          * @param $pattern String: regexp for matching lines
1190          * @param $extracts Array: extracts to search
1191          * @param $linesleft Integer: number of extracts to make
1192          * @param $contextchars Integer: length of snippet
1193          * @param $out Array: map for highlighted snippets
1194          * @param $offsets Array: map of starting points of snippets
1195          * @protected
1196          */
1197         function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
1198                 if ( $linesleft == 0 )
1199                         return; // nothing to do
1200                 foreach ( $extracts as $index => $line ) {
1201                         if ( array_key_exists( $index, $out ) )
1202                                 continue; // this line already highlighted
1203
1204                         $m = array();
1205                         if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) )
1206                                 continue;
1207
1208                         $offset = $m[0][1];
1209                         $len = strlen( $m[0][0] );
1210                         if ( $offset + $len < $contextchars )
1211                                 $begin = 0;
1212                         elseif ( $len > $contextchars )
1213                                 $begin = $offset;
1214                         else
1215                                 $begin = $offset + intval( ( $len - $contextchars ) / 2 );
1216
1217                         $end = $begin + $contextchars;
1218
1219                         $posBegin = $begin;
1220                         // basic snippet from this line
1221                         $out[$index] = $this->extract( $line, $begin, $end, $posBegin );
1222                         $offsets[$index] = $posBegin;
1223                         $linesleft--;
1224                         if ( $linesleft == 0 )
1225                                 return;
1226                 }
1227         }
1228
1229         /**
1230          * Basic wikitext removal
1231          * @protected
1232          */
1233         function removeWiki( $text ) {
1234                 $fname = __METHOD__;
1235                 wfProfileIn( $fname );
1236
1237                 // $text = preg_replace("/'{2,5}/", "", $text);
1238                 // $text = preg_replace("/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text);
1239                 // $text = preg_replace("/\[\[([^]|]+)\]\]/", "\\1", $text);
1240                 // $text = preg_replace("/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text);
1241                 // $text = preg_replace("/\\{\\|(.*?)\\|\\}/", "", $text);
1242                 // $text = preg_replace("/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text);
1243                 $text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
1244                 $text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
1245                 $text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
1246                 $text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );
1247                 // $text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text);
1248                 $text = preg_replace( "/<\/?[^>]+>/", "", $text );
1249                 $text = preg_replace( "/'''''/", "", $text );
1250                 $text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
1251                 $text = preg_replace( "/''/", "", $text );
1252
1253                 wfProfileOut( $fname );
1254                 return $text;
1255         }
1256
1257         /**
1258          * callback to replace [[target|caption]] kind of links, if
1259          * the target is category or image, leave it
1260          *
1261          * @param $matches Array
1262          */
1263         function linkReplace( $matches ) {
1264                 $colon = strpos( $matches[1], ':' );
1265                 if ( $colon === false )
1266                         return $matches[2]; // replace with caption
1267                 global $wgContLang;
1268                 $ns = substr( $matches[1], 0, $colon );
1269                 $index = $wgContLang->getNsIndex( $ns );
1270                 if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) )
1271                         return $matches[0]; // return the whole thing
1272                 else
1273                         return $matches[2];
1274
1275         }
1276
1277         /**
1278      * Simple & fast snippet extraction, but gives completely unrelevant
1279      * snippets
1280      *
1281      * @param $text String
1282      * @param $terms Array
1283      * @param $contextlines Integer
1284      * @param $contextchars Integer
1285      * @return String
1286      */
1287     public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
1288         global $wgContLang;
1289         $fname = __METHOD__;
1290
1291         $lines = explode( "\n", $text );
1292
1293         $terms = implode( '|', $terms );
1294         $max = intval( $contextchars ) + 1;
1295         $pat1 = "/(.*)($terms)(.{0,$max})/i";
1296
1297         $lineno = 0;
1298
1299         $extract = "";
1300         wfProfileIn( "$fname-extract" );
1301         foreach ( $lines as $line ) {
1302             if ( 0 == $contextlines ) {
1303                 break;
1304             }
1305             ++$lineno;
1306             $m = array();
1307             if ( ! preg_match( $pat1, $line, $m ) ) {
1308                 continue;
1309             }
1310             --$contextlines;
1311             $pre = $wgContLang->truncate( $m[1], - $contextchars );
1312
1313             if ( count( $m ) < 3 ) {
1314                 $post = '';
1315             } else {
1316                 $post = $wgContLang->truncate( $m[3], $contextchars );
1317             }
1318
1319             $found = $m[2];
1320
1321             $line = htmlspecialchars( $pre . $found . $post );
1322             $pat2 = '/(' . $terms . ")/i";
1323             $line = preg_replace( $pat2,
1324               "<span class='searchmatch'>\\1</span>", $line );
1325
1326             $extract .= "${line}\n";
1327         }
1328         wfProfileOut( "$fname-extract" );
1329
1330         return $extract;
1331     }
1332
1333 }
1334
/**
 * Placeholder search engine, meant for use when the database engine in use is
 * not a supported one. It adds nothing to SearchEngine.
 *
 * @todo The dummy should probably attempt something at least mildly useful,
 * such as a LIKE search through titles.
 * @ingroup Search
 */
class SearchEngineDummy extends SearchEngine {
        // Intentionally empty: no members are added or overridden.
}