includes/SearchEngine.php

   1 <?php
   2 /**
   3  * @defgroup Search Search
   4  *
   5  * @file
   6  * @ingroup Search
   7  */
   8
   9 /**
  10  * Contain a class for special pages
  11  * @ingroup Search
  12  */
  13 class SearchEngine {
  14         var $limit = 10;
  15         var $offset = 0;
  16         var $searchTerms = array();
  17         var $namespaces = array( NS_MAIN );
  18         var $showRedirects = false;
  19
  20         /**
  21          * Perform a full text search query and return a result set.
  22          * If title searches are not supported or disabled, return null.
  23          *
  24          * @param string $term - Raw search term
  25          * @return SearchResultSet
  26          * @access public
  27          * @abstract
  28          */
  29         function searchText( $term ) {
  30                 return null;
  31         }
  32
  33         /**
  34          * Perform a title-only search query and return a result set.
  35          * If title searches are not supported or disabled, return null.
  36          *
  37          * @param string $term - Raw search term
  38          * @return SearchResultSet
  39          * @access public
  40          * @abstract
  41          */
  42         function searchTitle( $term ) {
  43                 return null;
  44         }
  45
  46         /** If this search backend can list/unlist redirects */
  47         function acceptListRedirects() {
  48                 return true;
  49         }
  50
  51         /**
  52          * If an exact title match can be find, or a very slightly close match,
  53          * return the title. If no match, returns NULL.
  54          *
  55          * @param string $term
  56          * @return Title
  57          */
  58         public static function getNearMatch( $searchterm ) {
  59                 global $wgContLang;
  60
  61                 $allSearchTerms = array($searchterm);
  62
  63                 if($wgContLang->hasVariants()){
  64                         $allSearchTerms = array_merge($allSearchTerms,$wgContLang->convertLinkToAllVariants($searchterm));
  65                 }
  66
  67                 foreach($allSearchTerms as $term){
  68
  69                         # Exact match? No need to look further.
  70                         $title = Title::newFromText( $term );
  71                         if (is_null($title))
  72                                 return NULL;
  73
  74                         if ( $title->getNamespace() == NS_SPECIAL || $title->isExternal()
  75                              || $title->exists() ) {
  76                                 return $title;
  77                         }
  78
  79                         # Now try all lower case (i.e. first letter capitalized)
  80                         #
  81                         $title = Title::newFromText( $wgContLang->lc( $term ) );
  82                         if ( $title && $title->exists() ) {
  83                                 return $title;
  84                         }
  85
  86                         # Now try capitalized string
  87                         #
  88                         $title = Title::newFromText( $wgContLang->ucwords( $term ) );
  89                         if ( $title && $title->exists() ) {
  90                                 return $title;
  91                         }
  92
  93                         # Now try all upper case
  94                         #
  95                         $title = Title::newFromText( $wgContLang->uc( $term ) );
  96                         if ( $title && $title->exists() ) {
  97                                 return $title;
  98                         }
  99
 100                         # Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
 101                         $title = Title::newFromText( $wgContLang->ucwordbreaks($term) );
 102                         if ( $title && $title->exists() ) {
 103                                 return $title;
 104                         }
 105
 106                         // Give hooks a chance at better match variants
 107                         $title = null;
 108                         if( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
 109                                 return $title;
 110                         }
 111                 }
 112
 113                 $title = Title::newFromText( $searchterm );
 114
 115                 # Entering an IP address goes to the contributions page
 116                 if ( ( $title->getNamespace() == NS_USER && User::isIP($title->getText() ) )
 117                         || User::isIP( trim( $searchterm ) ) ) {
 118                         return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
 119                 }
 120
 121
 122                 # Entering a user goes to the user page whether it's there or not
 123                 if ( $title->getNamespace() == NS_USER ) {
 124                         return $title;
 125                 }
 126
 127                 # Go to images that exist even if there's no local page.
 128                 # There may have been a funny upload, or it may be on a shared
 129                 # file repository such as Wikimedia Commons.
 130                 if( $title->getNamespace() == NS_FILE ) {
 131                         $image = wfFindFile( $title );
 132                         if( $image ) {
 133                                 return $title;
 134                         }
 135                 }
 136
 137                 # MediaWiki namespace? Page may be "implied" if not customized.
 138                 # Just return it, with caps forced as the message system likes it.
 139                 if( $title->getNamespace() == NS_MEDIAWIKI ) {
 140                         return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
 141                 }
 142
 143                 # Quoted term? Try without the quotes...
 144                 $matches = array();
 145                 if( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
 146                         return SearchEngine::getNearMatch( $matches[1] );
 147                 }
 148
 149                 return NULL;
 150         }
 151
 152         public static function legalSearchChars() {
 153                 return "A-Za-z_'0-9\\x80-\\xFF\\-";
 154         }
 155
 156         /**
 157          * Set the maximum number of results to return
 158          * and how many to skip before returning the first.
 159          *
 160          * @param int $limit
 161          * @param int $offset
 162          * @access public
 163          */
 164         function setLimitOffset( $limit, $offset = 0 ) {
 165                 $this->limit = intval( $limit );
 166                 $this->offset = intval( $offset );
 167         }
 168
 169         /**
 170          * Set which namespaces the search should include.
 171          * Give an array of namespace index numbers.
 172          *
 173          * @param array $namespaces
 174          * @access public
 175          */
 176         function setNamespaces( $namespaces ) {
 177                 $this->namespaces = $namespaces;
 178         }
 179
 180         /**
 181          * Parse some common prefixes: all (search everything)
 182          * or namespace names
 183          *
 184          * @param string $query
 185          */
 186         function replacePrefixes( $query ){
 187                 global $wgContLang;
 188
 189                 if( strpos($query,':') === false )
 190                         return $query; // nothing to do
 191
 192                 $parsed = $query;
 193                 $allkeyword = wfMsgForContent('searchall').":";
 194                 if( strncmp($query, $allkeyword, strlen($allkeyword)) == 0 ){
 195                         $this->namespaces = null;
 196                         $parsed = substr($query,strlen($allkeyword));
 197                 } else if( strpos($query,':') !== false ) {
 198                         $prefix = substr($query,0,strpos($query,':'));
 199                         $index = $wgContLang->getNsIndex($prefix);
 200                         if($index !== false){
 201                                 $this->namespaces = array($index);
 202                                 $parsed = substr($query,strlen($prefix)+1);
 203                         }
 204                 }
 205                 if(trim($parsed) == '')
 206                         return $query; // prefix was the whole query
 207
 208                 return $parsed;
 209         }
 210
 211         /**
 212          * Make a list of searchable namespaces and their canonical names.
 213          * @return array
 214          */
 215         public static function searchableNamespaces() {
 216                 global $wgContLang;
 217                 $arr = array();
 218                 foreach( $wgContLang->getNamespaces() as $ns => $name ) {
 219                         if( $ns >= NS_MAIN ) {
 220                                 $arr[$ns] = $name;
 221                         }
 222                 }
 223                 return $arr;
 224         }
 225
 226         /**
 227          * Extract default namespaces to search from the given user's
 228          * settings, returning a list of index numbers.
 229          *
 230          * @param User $user
 231          * @return array
 232          * @static
 233          */
 234         public static function userNamespaces( &$user ) {
 235                 $arr = array();
 236                 foreach( SearchEngine::searchableNamespaces() as $ns => $name ) {
 237                         if( $user->getOption( 'searchNs' . $ns ) ) {
 238                                 $arr[] = $ns;
 239                         }
 240                 }
 241                 return $arr;
 242         }
 243
 244         /**
 245          * Find snippet highlight settings for a given user
 246          *
 247          * @param User $user
 248          * @return array contextlines, contextchars
 249          * @static
 250          */
 251         public static function userHighlightPrefs( &$user ){
 252                 //$contextlines = $user->getOption( 'contextlines',  5 );
 253                 //$contextchars = $user->getOption( 'contextchars', 50 );
 254                 $contextlines = 2; // Hardcode this. Old defaults sucked. :)
 255                 $contextchars = 75; // same as above.... :P
 256                 return array($contextlines, $contextchars);
 257         }
 258
 259         /**
 260          * An array of namespaces indexes to be searched by default
 261          *
 262          * @return array
 263          * @static
 264          */
 265         public static function defaultNamespaces(){
 266                 global $wgNamespacesToBeSearchedDefault;
 267
 268                 return array_keys($wgNamespacesToBeSearchedDefault, true);
 269         }
 270
 271         /**
 272          * Get a list of namespace names useful for showing in tooltips
 273          * and preferences
 274          *
 275          * @param unknown_type $namespaces
 276          */
 277         public static function namespacesAsText( $namespaces ){
 278                 global $wgContLang;
 279
 280                 $formatted = array_map( array($wgContLang,'getFormattedNsText'), $namespaces );
 281                 foreach( $formatted as $key => $ns ){
 282                         if ( empty($ns) )
 283                                 $formatted[$key] = wfMsg( 'blanknamespace' );
 284                 }
 285                 return $formatted;
 286         }
 287
 288         /**
 289          * An array of "project" namespaces indexes typically searched
 290          * by logged-in users
 291          *
 292          * @return array
 293          * @static
 294          */
 295         public static function projectNamespaces() {
 296                 global $wgNamespacesToBeSearchedDefault, $wgNamespacesToBeSearchedProject;
 297
 298                 return array_keys( $wgNamespacesToBeSearchedProject, true );
 299         }
 300
 301         /**
 302          * An array of "project" namespaces indexes typically searched
 303          * by logged-in users in addition to the default namespaces
 304          *
 305          * @return array
 306          * @static
 307          */
 308         public static function defaultAndProjectNamespaces() {
 309                 global $wgNamespacesToBeSearchedDefault, $wgNamespacesToBeSearchedProject;
 310
 311                 return array_keys( $wgNamespacesToBeSearchedDefault +
 312                         $wgNamespacesToBeSearchedProject, true);
 313         }
 314
 315         /**
 316          * Return a 'cleaned up' search string
 317          *
 318          * @return string
 319          * @access public
 320          */
 321         function filter( $text ) {
 322                 $lc = $this->legalSearchChars();
 323                 return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
 324         }
 325         /**
 326          * Load up the appropriate search engine class for the currently
 327          * active database backend, and return a configured instance.
 328          *
 329          * @return SearchEngine
 330          */
 331         public static function create() {
 332                 global $wgSearchType;
 333                 $dbr = wfGetDB( DB_SLAVE );
 334                 if( $wgSearchType ) {
 335                         $class = $wgSearchType;
 336                 } else {
 337                         $class = $dbr->getSearchEngine();
 338                 }
 339                 $search = new $class( $dbr );
 340                 $search->setLimitOffset(0,0);
 341                 return $search;
 342         }
 343
 344         /**
 345          * Create or update the search index record for the given page.
 346          * Title and text should be pre-processed.
 347          *
 348          * @param int $id
 349          * @param string $title
 350          * @param string $text
 351          * @abstract
 352          */
 353         function update( $id, $title, $text ) {
 354                 // no-op
 355         }
 356
 357         /**
 358          * Update a search index record's title only.
 359          * Title should be pre-processed.
 360          *
 361          * @param int $id
 362          * @param string $title
 363          * @abstract
 364          */
 365         function updateTitle( $id, $title ) {
 366                 // no-op
 367         }
 368
 369         /**
 370          * Get OpenSearch suggestion template
 371          *
 372          * @return string
 373          * @static
 374          */
 375         public static function getOpenSearchTemplate() {
 376                 global $wgOpenSearchTemplate, $wgServer, $wgScriptPath;
 377                 if( $wgOpenSearchTemplate )     {
 378                         return $wgOpenSearchTemplate;
 379                 } else {
 380                         $ns = implode( '|', SearchEngine::defaultNamespaces() );
 381                         if( !$ns ) $ns = "0";
 382                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace='.$ns;
 383                 }
 384         }
 385
 386         /**
 387          * Get internal MediaWiki Suggest template
 388          *
 389          * @return string
 390          * @static
 391          */
 392         public static function getMWSuggestTemplate() {
 393                 global $wgMWSuggestTemplate, $wgServer, $wgScriptPath;
 394                 if($wgMWSuggestTemplate)
 395                         return $wgMWSuggestTemplate;
 396                 else
 397                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}';
 398         }
 399 }
 400
 401 /**
 402  * @ingroup Search
 403  */
 404 class SearchResultSet {
 405         /**
 406          * Fetch an array of regular expression fragments for matching
 407          * the search terms as parsed by this engine in a text extract.
 408          *
 409          * @return array
 410          * @access public
 411          * @abstract
 412          */
 413         function termMatches() {
 414                 return array();
 415         }
 416
 417         function numRows() {
 418                 return 0;
 419         }
 420
 421         /**
 422          * Return true if results are included in this result set.
 423          * @return bool
 424          * @abstract
 425          */
 426         function hasResults() {
 427                 return false;
 428         }
 429
 430         /**
 431          * Some search modes return a total hit count for the query
 432          * in the entire article database. This may include pages
 433          * in namespaces that would not be matched on the given
 434          * settings.
 435          *
 436          * Return null if no total hits number is supported.
 437          *
 438          * @return int
 439          * @access public
 440          */
 441         function getTotalHits() {
 442                 return null;
 443         }
 444
 445         /**
 446          * Some search modes return a suggested alternate term if there are
 447          * no exact hits. Returns true if there is one on this set.
 448          *
 449          * @return bool
 450          * @access public
 451          */
 452         function hasSuggestion() {
 453                 return false;
 454         }
 455
 456         /**
 457          * @return string suggested query, null if none
 458          */
 459         function getSuggestionQuery(){
 460                 return null;
 461         }
 462
 463         /**
 464          * @return string highlighted suggested query, '' if none
 465          */
 466         function getSuggestionSnippet(){
 467                 return '';
 468         }
 469
 470         /**
 471          * Return information about how and from where the results were fetched,
 472          * should be useful for diagnostics and debugging
 473          *
 474          * @return string
 475          */
 476         function getInfo() {
 477                 return null;
 478         }
 479
 480         /**
 481          * Return a result set of hits on other (multiple) wikis associated with this one
 482          *
 483          * @return SearchResultSet
 484          */
 485         function getInterwikiResults() {
 486                 return null;
 487         }
 488
 489         /**
 490          * Check if there are results on other wikis
 491          *
 492          * @return boolean
 493          */
 494         function hasInterwikiResults() {
 495                 return $this->getInterwikiResults() != null;
 496         }
 497
 498
 499         /**
 500          * Fetches next search result, or false.
 501          * @return SearchResult
 502          * @access public
 503          * @abstract
 504          */
 505         function next() {
 506                 return false;
 507         }
 508
 509         /**
 510          * Frees the result set, if applicable.
 511          * @ access public
 512          */
 513         function free() {
 514                 // ...
 515         }
 516 }
 517
 518
 519 /**
 520  * @ingroup Search
 521  */
 522 class SearchResultTooMany {
 523         ## Some search engines may bail out if too many matches are found
 524 }
 525
 526
 527 /**
 528  * @fixme This class is horribly factored. It would probably be better to have
 529  * a useful base class to which you pass some standard information, then let
 530  * the fancy self-highlighters extend that.
 531  * @ingroup Search
 532  */
 533 class SearchResult {
 534         var $mRevision = null;
 535         var $mImage = null;
 536
 537         function __construct( $row ) {
 538                 $this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
 539                 if( !is_null($this->mTitle) ){
 540                         $this->mRevision = Revision::newFromTitle( $this->mTitle );
 541                         if( $this->mTitle->getNamespace() === NS_FILE )
 542                                 $this->mImage = wfFindFile( $this->mTitle );
 543                 }
 544         }
 545
 546         /**
 547          * Check if this is result points to an invalid title
 548          *
 549          * @return boolean
 550          * @access public
 551          */
 552         function isBrokenTitle(){
 553                 if( is_null($this->mTitle) )
 554                         return true;
 555                 return false;
 556         }
 557
 558         /**
 559          * Check if target page is missing, happens when index is out of date
 560          *
 561          * @return boolean
 562          * @access public
 563          */
 564         function isMissingRevision(){
 565                 return !$this->mRevision && !$this->mImage;
 566         }
 567
 568         /**
 569          * @return Title
 570          * @access public
 571          */
 572         function getTitle() {
 573                 return $this->mTitle;
 574         }
 575
 576         /**
 577          * @return double or null if not supported
 578          */
 579         function getScore() {
 580                 return null;
 581         }
 582
 583         /**
 584          * Lazy initialization of article text from DB
 585          */
 586         protected function initText(){
 587                 if( !isset($this->mText) ){
 588                         if($this->mRevision != null)
 589                                 $this->mText = $this->mRevision->getText();
 590                         else // TODO: can we fetch raw wikitext for commons images?
 591                                 $this->mText = '';
 592
 593                 }
 594         }
 595
 596         /**
 597          * @param array $terms terms to highlight
 598          * @return string highlighted text snippet, null (and not '') if not supported
 599          */
 600         function getTextSnippet($terms){
 601                 global $wgUser, $wgAdvancedSearchHighlighting;
 602                 $this->initText();
 603                 list($contextlines,$contextchars) = SearchEngine::userHighlightPrefs($wgUser);
 604                 $h = new SearchHighlighter();
 605                 if( $wgAdvancedSearchHighlighting )
 606                         return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
 607                 else
 608                         return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
 609         }
 610
 611         /**
 612          * @param array $terms terms to highlight
 613          * @return string highlighted title, '' if not supported
 614          */
 615         function getTitleSnippet($terms){
 616                 return '';
 617         }
 618
 619         /**
 620          * @param array $terms terms to highlight
 621          * @return string highlighted redirect name (redirect to this page), '' if none or not supported
 622          */
 623         function getRedirectSnippet($terms){
 624                 return '';
 625         }
 626
 627         /**
 628          * @return Title object for the redirect to this page, null if none or not supported
 629          */
 630         function getRedirectTitle(){
 631                 return null;
 632         }
 633
 634         /**
 635          * @return string highlighted relevant section name, null if none or not supported
 636          */
 637         function getSectionSnippet(){
 638                 return '';
 639         }
 640
 641         /**
 642          * @return Title object (pagename+fragment) for the section, null if none or not supported
 643          */
 644         function getSectionTitle(){
 645                 return null;
 646         }
 647
 648         /**
 649          * @return string timestamp
 650          */
 651         function getTimestamp(){
 652                 if( $this->mRevision )
 653                         return $this->mRevision->getTimestamp();
 654                 else if( $this->mImage )
 655                         return $this->mImage->getTimestamp();
 656                 return '';
 657         }
 658
 659         /**
 660          * @return int number of words
 661          */
 662         function getWordCount(){
 663                 $this->initText();
 664                 return str_word_count( $this->mText );
 665         }
 666
 667         /**
 668          * @return int size in bytes
 669          */
 670         function getByteSize(){
 671                 $this->initText();
 672                 return strlen( $this->mText );
 673         }
 674
 675         /**
 676          * @return boolean if hit has related articles
 677          */
 678         function hasRelated(){
 679                 return false;
 680         }
 681
 682         /**
 683          * @return interwiki prefix of the title (return iw even if title is broken)
 684          */
 685         function getInterwikiPrefix(){
 686                 return '';
 687         }
 688 }
 689
 690 /**
 691  * Highlight bits of wikitext
 692  *
 693  * @ingroup Search
 694  */
 695 class SearchHighlighter {
 696         var $mCleanWikitext = true;
 697
	/**
	 * @param bool $cleanupWikitext value stored in $mCleanWikitext
	 */
	function __construct( $cleanupWikitext = true ) {
		$this->mCleanWikitext = $cleanupWikitext;
	}

	/**
	 * Old same-name constructor, kept as a plain method so existing
	 * parent::SearchHighlighter() calls keep working. Declared after
	 * __construct() so it is never picked as the constructor.
	 *
	 * @param bool $cleanupWikitext value stored in $mCleanWikitext
	 */
	function SearchHighlighter( $cleanupWikitext = true ) {
		$this->__construct( $cleanupWikitext );
	}
 701
 702         /**
 703          * Default implementation of wikitext highlighting
 704          *
 705          * @param string $text
 706          * @param array $terms Terms to highlight (unescaped)
 707          * @param int $contextlines
 708          * @param int $contextchars
 709          * @return string
 710          */
 711         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
 712                 global $wgLang, $wgContLang;
 713                 global $wgSearchHighlightBoundaries;
 714                 $fname = __METHOD__;
 715
 716                 if($text == '')
 717                         return '';
 718
 719                 // spli text into text + templates/links/tables
 720                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
 721                 // first capture group is for detecting nested templates/links/tables/references
 722                 $endPatterns = array(
 723                         1 => '/(\{\{)|(\}\})/', // template
 724                         2 => '/(\[\[)|(\]\])/', // image
 725                         3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table
 726
 727                 // FIXME: this should prolly be a hook or something
 728                 if(function_exists('wfCite')){
 729                         $spat .= '|(<ref>)'; // references via cite extension
 730                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
 731                 }
 732                 $spat .= '/';
 733                 $textExt = array(); // text extracts
 734                 $otherExt = array();  // other extracts
 735                 wfProfileIn( "$fname-split" );
 736                 $start = 0;
 737                 $textLen = strlen($text);
 738                 $count = 0; // sequence number to maintain ordering
 739                 while( $start < $textLen ){
 740                         // find start of template/image/table
 741                         if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){
 742                                 $epat = '';
 743                                 foreach($matches as $key => $val){
 744                                         if($key > 0 && $val[1] != -1){
 745                                                 if($key == 2){
 746                                                         // see if this is an image link
 747                                                         $ns = substr($val[0],2,-1);
 748                                                         if( $wgContLang->getNsIndex($ns) != NS_FILE )
 749                                                                 break;
 750
 751                                                 }
 752                                                 $epat = $endPatterns[$key];
 753                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
 754                                                 $start = $val[1];
 755                                                 break;
 756                                         }
 757                                 }
 758                                 if( $epat ){
 759                                         // find end (and detect any nested elements)
 760                                         $level = 0;
 761                                         $offset = $start + 1;
 762                                         $found = false;
 763                                         while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){
 764                                                 if( array_key_exists(2,$endMatches) ){
 765                                                         // found end
 766                                                         if($level == 0){
 767                                                                 $len = strlen($endMatches[2][0]);
 768                                                                 $off = $endMatches[2][1];
 769                                                                 $this->splitAndAdd( $otherExt, $count,
 770                                                                         substr( $text, $start, $off + $len  - $start ) );
 771                                                                 $start = $off + $len;
 772                                                                 $found = true;
 773                                                                 break;
 774                                                         } else{
 775                                                                 // end of nested element
 776                                                                 $level -= 1;
 777                                                         }
 778                                                 } else{
 779                                                         // nested
 780                                                         $level += 1;
 781                                                 }
 782                                                 $offset = $endMatches[0][1] + strlen($endMatches[0][0]);
 783                                         }
 784                                         if( ! $found ){
 785                                                 // couldn't find appropriate closing tag, skip
 786                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) );
 787                                                 $start += strlen($matches[0][0]);
 788                                         }
 789                                         continue;
 790                                 }
 791                         }
 792                         // else: add as text extract
 793                         $this->splitAndAdd( $textExt, $count, substr($text,$start) );
 794                         break;
 795                 }
 796
 797                 $all = $textExt + $otherExt; // these have disjunct key sets
 798
 799                 wfProfileOut( "$fname-split" );
 800
 801                 // prepare regexps
 802                 foreach( $terms as $index => $term ) {
 803                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
 804                         if(preg_match('/[\x80-\xff]/', $term) ){
 805                                 $terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]);
 806                         } else {
 807                                 $terms[$index] = $term;
 808                         }
 809                 }
 810                 $anyterm = implode( '|', $terms );
 811                 $phrase = implode("$wgSearchHighlightBoundaries+", $terms );
 812
 813                 // FIXME: a hack to scale contextchars, a correct solution
 814                 // would be to have contextchars actually be char and not byte
 815                 // length, and do proper utf-8 substrings and lengths everywhere,
 816                 // but PHP is making that very hard and unclean to implement :(
 817                 $scale = strlen($anyterm) / mb_strlen($anyterm);
 818                 $contextchars = intval( $contextchars * $scale );
 819
 820                 $patPre = "(^|$wgSearchHighlightBoundaries)";
 821                 $patPost = "($wgSearchHighlightBoundaries|$)";
 822
 823                 $pat1 = "/(".$phrase.")/ui";
 824                 $pat2 = "/$patPre(".$anyterm.")$patPost/ui";
 825
 826                 wfProfileIn( "$fname-extract" );
 827
 828                 $left = $contextlines;
 829
 830                 $snippets = array();
 831                 $offsets = array();
 832
 833                 // show beginning only if it contains all words
 834                 $first = 0;
 835                 $firstText = '';
 836                 foreach($textExt as $index => $line){
 837                         if(strlen($line)>0 && $line[0] != ';' && $line[0] != ':'){
 838                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
 839                                 $first = $index;
 840                                 break;
 841                         }
 842                 }
 843                 if( $firstText ){
 844                         $succ = true;
 845                         // check if first text contains all terms
 846                         foreach($terms as $term){
 847                                 if( ! preg_match("/$patPre".$term."$patPost/ui", $firstText) ){
 848                                         $succ = false;
 849                                         break;
 850                                 }
 851                         }
 852                         if( $succ ){
 853                                 $snippets[$first] = $firstText;
 854                                 $offsets[$first] = 0;
 855                         }
 856                 }
 857                 if( ! $snippets ) {
 858                         // match whole query on text
 859                         $this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets);
 860                         // match whole query on templates/tables/images
 861                         $this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets);
 862                         // match any words on text
 863                         $this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets);
 864                         // match any words on templates/tables/images
 865                         $this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets);
 866
 867                         ksort($snippets);
 868                 }
 869
 870                 // add extra chars to each snippet to make snippets constant size
 871                 $extended = array();
 872                 if( count( $snippets ) == 0){
 873                         // couldn't find the target words, just show beginning of article
 874                         $targetchars = $contextchars * $contextlines;
 875                         $snippets[$first] = '';
 876                         $offsets[$first] = 0;
 877                 } else{
 878                         // if begin of the article contains the whole phrase, show only that !!
 879                         if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first])
 880                             && $offsets[$first] < $contextchars * 2 ){
 881                                 $snippets = array ($first => $snippets[$first]);
 882                         }
 883
 884                         // calc by how much to extend existing snippets
 885                         $targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) );
 886                 }
 887
 888                 foreach($snippets as $index => $line){
 889                         $extended[$index] = $line;
 890                         $len = strlen($line);
 891                         if( $len < $targetchars - 20 ){
 892                                 // complete this line
 893                                 if($len < strlen( $all[$index] )){
 894                                         $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]);
 895                                         $len = strlen( $extended[$index] );
 896                                 }
 897
 898                                 // add more lines
 899                                 $add = $index + 1;
 900                                 while( $len < $targetchars - 20
 901                                        && array_key_exists($add,$all)
 902                                        && !array_key_exists($add,$snippets) ){
 903                                     $offsets[$add] = 0;
 904                                     $tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
 905                                         $extended[$add] = $tt;
 906                                         $len += strlen( $tt );
 907                                         $add++;
 908                                 }
 909                         }
 910                 }
 911
 912                 //$snippets = array_map('htmlspecialchars', $extended);
 913                 $snippets = $extended;
 914                 $last = -1;
 915                 $extract = '';
 916                 foreach($snippets as $index => $line){
 917                         if($last == -1)
 918                                 $extract .= $line; // first line
 919                         elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last]))
 920                                 $extract .= " ".$line; // continous lines
 921                         else
 922                                 $extract .= '<b> ... </b>' . $line;
 923
 924                         $last = $index;
 925                 }
 926                 if( $extract )
 927                         $extract .= '<b> ... </b>';
 928
 929                 $processed = array();
 930                 foreach($terms as $term){
 931                         if( ! isset($processed[$term]) ){
 932                                 $pat3 = "/$patPre(".$term.")$patPost/ui"; // highlight word
 933                                 $extract = preg_replace( $pat3,
 934                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
 935                                 $processed[$term] = true;
 936                         }
 937                 }
 938
 939                 wfProfileOut( "$fname-extract" );
 940
 941                 return $extract;
 942         }
 943
 944         /**
 945          * Split text into lines and add it to extracts array
 946          *
 947          * @param array $extracts index -> $line
 948          * @param int $count
 949          * @param string $text
 950          */
 951         function splitAndAdd(&$extracts, &$count, $text){
 952                 $split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text );
 953                 foreach($split as $line){
 954                         $tt = trim($line);
 955                         if( $tt )
 956                                 $extracts[$count++] = $tt;
 957                 }
 958         }
 959
 960         /**
 961          * Do manual case conversion for non-ascii chars
 962          *
 963          * @param unknown_type $matches
 964          */
 965         function caseCallback($matches){
 966                 global $wgContLang;
 967                 if( strlen($matches[0]) > 1 ){
 968                         return '['.$wgContLang->lc($matches[0]).$wgContLang->uc($matches[0]).']';
 969                 } else
 970                         return $matches[0];
 971         }
 972
 973         /**
 974          * Extract part of the text from start to end, but by
 975          * not chopping up words
 976          * @param string $text
 977          * @param int $start
 978          * @param int $end
 979          * @param int $posStart (out) actual start position
 980          * @param int $posEnd (out) actual end position
 981          * @return string
 982          */
 983         function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){
 984                 global $wgContLang;
 985
 986                 if( $start != 0)
 987                         $start = $this->position( $text, $start, 1 );
 988                 if( $end >= strlen($text) )
 989                         $end = strlen($text);
 990                 else
 991                         $end = $this->position( $text, $end );
 992
 993                 if(!is_null($posStart))
 994                         $posStart = $start;
 995                 if(!is_null($posEnd))
 996                         $posEnd = $end;
 997
 998                 if($end > $start)
 999                         return substr($text, $start, $end-$start);
1000                 else
1001                         return '';
1002         }
1003
1004         /**
1005          * Find a nonletter near a point (index) in the text
1006          *
1007          * @param string $text
1008          * @param int $point
1009          * @param int $offset to found index
1010          * @return int nearest nonletter index, or beginning of utf8 char if none
1011          */
1012         function position($text, $point, $offset=0 ){
1013                 $tolerance = 10;
1014                 $s = max( 0, $point - $tolerance );
1015                 $l = min( strlen($text), $point + $tolerance ) - $s;
1016                 $m = array();
1017                 if( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){
1018                         return $m[0][1] + $s + $offset;
1019                 } else{
1020                         // check if point is on a valid first UTF8 char
1021                         $char = ord( $text[$point] );
1022                         while( $char >= 0x80 && $char < 0xc0 ) {
1023                                 // skip trailing bytes
1024                                 $point++;
1025                                 if($point >= strlen($text))
1026                                         return strlen($text);
1027                                 $char = ord( $text[$point] );
1028                         }
1029                         return $point;
1030
1031                 }
1032         }
1033
1034         /**
1035          * Search extracts for a pattern, and return snippets
1036          *
1037          * @param string $pattern regexp for matching lines
1038          * @param array $extracts extracts to search
1039          * @param int $linesleft number of extracts to make
1040          * @param int $contextchars length of snippet
1041          * @param array $out map for highlighted snippets
1042          * @param array $offsets map of starting points of snippets
1043          * @protected
1044          */
1045         function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){
1046                 if($linesleft == 0)
1047                         return; // nothing to do
1048                 foreach($extracts as $index => $line){
1049                         if( array_key_exists($index,$out) )
1050                                 continue; // this line already highlighted
1051
1052                         $m = array();
1053                         if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) )
1054                                 continue;
1055
1056                         $offset = $m[0][1];
1057                         $len = strlen($m[0][0]);
1058                         if($offset + $len < $contextchars)
1059                                 $begin = 0;
1060                         elseif( $len > $contextchars)
1061                                 $begin = $offset;
1062                         else
1063                                 $begin = $offset + intval( ($len - $contextchars) / 2 );
1064
1065                         $end = $begin + $contextchars;
1066
1067                         $posBegin = $begin;
1068                         // basic snippet from this line
1069                         $out[$index] = $this->extract($line,$begin,$end,$posBegin);
1070                         $offsets[$index] = $posBegin;
1071                         $linesleft--;
1072                         if($linesleft == 0)
1073                                 return;
1074                 }
1075         }
1076
1077         /**
1078          * Basic wikitext removal
1079          * @protected
1080          */
1081         function removeWiki($text) {
1082                 $fname = __METHOD__;
1083                 wfProfileIn( $fname );
1084
1085                 //$text = preg_replace("/'{2,5}/", "", $text);
1086                 //$text = preg_replace("/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text);
1087                 //$text = preg_replace("/\[\[([^]|]+)\]\]/", "\\1", $text);
1088                 //$text = preg_replace("/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text);
1089                 //$text = preg_replace("/\\{\\|(.*?)\\|\\}/", "", $text);
1090                 //$text = preg_replace("/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text);
1091                 $text = preg_replace("/\\{\\{([^|]+?)\\}\\}/", "", $text);
1092                 $text = preg_replace("/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text);
1093                 $text = preg_replace("/\\[\\[([^|]+?)\\]\\]/", "\\1", $text);
1094                 $text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text);
1095                 //$text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text);
1096                 $text = preg_replace("/<\/?[^>]+>/", "", $text);
1097                 $text = preg_replace("/'''''/", "", $text);
1098                 $text = preg_replace("/('''|<\/?[iIuUbB]>)/", "", $text);
1099                 $text = preg_replace("/''/", "", $text);
1100
1101                 wfProfileOut( $fname );
1102                 return $text;
1103         }
1104
1105         /**
1106          * callback to replace [[target|caption]] kind of links, if
1107          * the target is category or image, leave it
1108          *
1109          * @param array $matches
1110          */
1111         function linkReplace($matches){
1112                 $colon = strpos( $matches[1], ':' );
1113                 if( $colon === false )
1114                         return $matches[2]; // replace with caption
1115                 global $wgContLang;
1116                 $ns = substr( $matches[1], 0, $colon );
1117                 $index = $wgContLang->getNsIndex($ns);
1118                 if( $index !== false && ($index == NS_FILE || $index == NS_CATEGORY) )
1119                         return $matches[0]; // return the whole thing
1120                 else
1121                         return $matches[2];
1122
1123         }
1124
1125         /**
1126      * Simple & fast snippet extraction, but gives completely unrelevant
1127      * snippets
1128      *
1129      * @param string $text
1130      * @param array $terms
1131      * @param int $contextlines
1132      * @param int $contextchars
1133      * @return string
1134      */
1135     public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
1136         global $wgLang, $wgContLang;
1137         $fname = __METHOD__;
1138
1139         $lines = explode( "\n", $text );
1140
1141         $terms = implode( '|', $terms );
1142         $max = intval( $contextchars ) + 1;
1143         $pat1 = "/(.*)($terms)(.{0,$max})/i";
1144
1145         $lineno = 0;
1146
1147         $extract = "";
1148         wfProfileIn( "$fname-extract" );
1149         foreach ( $lines as $line ) {
1150             if ( 0 == $contextlines ) {
1151                 break;
1152             }
1153             ++$lineno;
1154             $m = array();
1155             if ( ! preg_match( $pat1, $line, $m ) ) {
1156                 continue;
1157             }
1158             --$contextlines;
1159             $pre = $wgContLang->truncate( $m[1], -$contextchars, ' ... ' );
1160
1161             if ( count( $m ) < 3 ) {
1162                 $post = '';
1163             } else {
1164                 $post = $wgContLang->truncate( $m[3], $contextchars, ' ... ' );
1165             }
1166
1167             $found = $m[2];
1168
1169             $line = htmlspecialchars( $pre . $found . $post );
1170             $pat2 = '/(' . $terms . ")/i";
1171             $line = preg_replace( $pat2,
1172               "<span class='searchmatch'>\\1</span>", $line );
1173
1174             $extract .= "${line}\n";
1175         }
1176         wfProfileOut( "$fname-extract" );
1177
1178         return $extract;
1179     }
1180
1181 }
1182
/**
 * Placeholder search engine, to be used when a non-supported database
 * engine is present. It adds nothing of its own to SearchEngine.
 * @fixme Dummy class should probably try something at least mildly useful,
 * such as a LIKE search through titles.
 * @ingroup Search
 */
class SearchEngineDummy extends SearchEngine {
        // intentionally empty: everything is inherited from SearchEngine
}