includes/SearchEngine.php

   1 <?php
   2 /**
   3  * Contains the basic search engine classes: engine, result set, result and highlighter
   4  * @addtogroup Search
   5  */
   6 class SearchEngine {
   7         var $limit = 10;
   8         var $offset = 0;
   9         var $searchTerms = array();
  10         var $namespaces = array( NS_MAIN );
  11         var $showRedirects = false;
  12
  13         /**
  14          * Perform a full text search query and return a result set.
  15          * If text searches are not supported or disabled, return null.
  16          *
  17          * @param string $term - Raw search term
  18          * @return SearchResultSet (this base implementation always returns null)
  19          * @access public
  20          * @abstract
  21          */
  22         function searchText( $term ) {
  23                 return null;
  24         }
  25
  26         /**
  27          * Perform a title-only search query and return a result set.
  28          * If title searches are not supported or disabled, return null.
  29          *
  30          * @param string $term - Raw search term
  31          * @return SearchResultSet (this base implementation always returns null)
  32          * @access public
  33          * @abstract
  34          */
  35         function searchTitle( $term ) {
  36                 return null;
  37         }
  38
  39         /**
  40          * If an exact title match can be found, or a very slightly close match,
  41          * return the title. If no match, returns NULL.
  42          *
  43          * @param string $searchterm
  44          * @return Title
  45          */
  46         public static function getNearMatch( $searchterm ) {
  47                 global $wgContLang, $wgCapitalLinks;
  48
  49                 $allSearchTerms = array($searchterm);
  50
  51                 if($wgContLang->hasVariants()){
  52                         $allSearchTerms = array_merge($allSearchTerms,$wgContLang->convertLinkToAllVariants($searchterm));
  53                 }
  54
  55                 foreach($allSearchTerms as $term){
  56
  57                         # Exact match? No need to look further.
  58                         $title = Title::newFromText( $term );
  59                         if (is_null($title))
  60                                 return NULL;
  61
  62                         if ( $title->getNamespace() == NS_SPECIAL || $title->exists() ) {
  63                                 return $title;
  64                         }
  65
  66                         # Now try all lower case (i.e. first letter capitalized).
  67                         # Title::newFromText() can return null (see the check above), so each
  68                         # re-cased candidate below is tested for null before calling exists().
  69                         $title = Title::newFromText( $wgContLang->lc( $term ) );
  70                         if ( $title && $title->exists() ) {
  71                                 return $title;
  72                         }
  73
  74                         # Now try capitalized string
  75                         $title = Title::newFromText( $wgContLang->ucwords( $term ) );
  76                         if ( $title && $title->exists() ) {
  77                                 return $title;
  78                         }
  79
  80                         # Now try all upper case
  81                         $title = Title::newFromText( $wgContLang->uc( $term ) );
  82                         if ( $title && $title->exists() ) {
  83                                 return $title;
  84                         }
  85
  86                         # Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
  87                         $title = Title::newFromText( $wgContLang->ucwordbreaks($term) );
  88                         if ( $title && $title->exists() ) {
  89                                 return $title;
  90                         }
  91
  92                         if( !$wgCapitalLinks ) {
  93                                 // Catch differs-by-first-letter-case-only
  94                                 $title = Title::newFromText( $wgContLang->ucfirst( $term ) );
  95                                 if ( $title && $title->exists() ) {
  96                                         return $title;
  97                                 }
  98                                 $title = Title::newFromText( $wgContLang->lcfirst( $term ) );
  99                                 if ( $title && $title->exists() ) {
 100                                         return $title;
 101                                 }
 102                         }
 103
 104                         // Give hooks a chance at better match variants
 105                         $title = null;
 106                         if( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
 107                                 return $title;
 108                         }
 109                 }
 110
 111                 # The loop above already returned NULL if $searchterm itself did not
 112                 # parse into a Title, so $title is expected to be non-null here.
 113                 $title = Title::newFromText( $searchterm );
 114
 115                 # Entering an IP address goes to the contributions page
 116                 if ( ( $title->getNamespace() == NS_USER && User::isIP($title->getText() ) )
 117                         || User::isIP( trim( $searchterm ) ) ) {
 118                         return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
 119                 }
 120
 121                 # Entering a user goes to the user page whether it's there or not
 122                 if ( $title->getNamespace() == NS_USER ) {
 123                         return $title;
 124                 }
 125
 126                 # Go to images that exist even if there's no local page.
 127                 # There may have been a funny upload, or it may be on a shared
 128                 # file repository such as Wikimedia Commons.
 129                 if( $title->getNamespace() == NS_IMAGE ) {
 130                         $image = wfFindFile( $title );
 131                         if( $image ) {
 132                                 return $title;
 133                         }
 134                 }
 135
 136                 # MediaWiki namespace? Page may be "implied" if not customized.
 137                 # Just return it, with caps forced as the message system likes it.
 138                 if( $title->getNamespace() == NS_MEDIAWIKI ) {
 139                         return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
 140                 }
 141
 142                 # Quoted term? Try without the quotes...
 143                 $matches = array();
 144                 if( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
 145                         return self::getNearMatch( $matches[1] );
 146                 }
 147
 148                 return NULL;
 149         }
 150
 151         /** @return string Characters kept by filter(), written as the inside of a regex character class */
 152         public static function legalSearchChars() {
 153                 return "A-Za-z_'0-9\\x80-\\xFF\\-";
 154         }
 155
 156         /**
 157          * Set the maximum number of results to return
 158          * and how many to skip before returning the first.
 159          *
 160          * @param int $limit
 161          * @param int $offset
 162          * @access public
 163          */
 164         function setLimitOffset( $limit, $offset = 0 ) {
 165                 $this->limit = intval( $limit );
 166                 $this->offset = intval( $offset );
 167         }
 168
 169         /**
 170          * Set which namespaces the search should include.
 171          * Give an array of namespace index numbers.
 172          *
 173          * @param array $namespaces
 174          * @access public
 175          */
 176         function setNamespaces( $namespaces ) {
 177                 $this->namespaces = $namespaces;
 178         }
 179
 180         /**
 181          * Parse some common prefixes: all (search everything) or namespace names.
 182          * A recognized prefix is also applied to $this->namespaces.
 183          * @param string $query
 184          * @return string Query minus the prefix; $query itself if nothing matched or nothing is left
 185          */
 186         function replacePrefixes( $query ){
 187                 global $wgContLang;
 188
 189                 if( strpos($query,':') === false )
 190                         return $query; // nothing to do
 191
 192                 $parsed = $query;
 193                 $allkeyword = wfMsgForContent('searchall').":";
 194                 if( strncmp($query, $allkeyword, strlen($allkeyword)) == 0 ){
 195                         $this->namespaces = null;
 196                         $parsed = substr($query,strlen($allkeyword));
 197                 } else { // the early return above guarantees a colon is present
 198                         $prefix = substr($query,0,strpos($query,':'));
 199                         $index = $wgContLang->getNsIndex($prefix);
 200                         if($index !== false){
 201                                 $this->namespaces = array($index);
 202                                 $parsed = substr($query,strlen($prefix)+1);
 203                         }
 204                 }
 205                 if(trim($parsed) == '')
 206                         return $query; // prefix was the whole query
 207
 208                 return $parsed;
 209         }
 210
 211         /**
 212          * Make a list of searchable namespaces and their canonical names.
 213          * @return array
 214          */
 215         public static function searchableNamespaces() {
 216                 global $wgContLang;
 217                 $arr = array();
 218                 foreach( $wgContLang->getNamespaces() as $ns => $name ) {
 219                         if( $ns >= NS_MAIN ) {
 220                                 $arr[$ns] = $name;
 221                         }
 222                 }
 223                 return $arr;
 224         }
 225
 226         /**
 227          * Extract default namespaces to search from the given user's
 228          * settings, returning a list of index numbers.
 229          *
 230          * @param User $user
 231          * @return array
 232          * @static
 233          */
 234         public static function userNamespaces( &$user ) {
 235                 $arr = array();
 236                 foreach( SearchEngine::searchableNamespaces() as $ns => $name ) {
 237                         if( $user->getOption( 'searchNs' . $ns ) ) {
 238                                 $arr[] = $ns;
 239                         }
 240                 }
 241                 return $arr;
 242         }
 243
 244         /**
 245          * Find snippet highlight settings for a given user
 246          *
 247          * @param User $user
 248          * @return array contextlines, contextchars
 249          * @static
 250          */
 251         public static function userHighlightPrefs( &$user ){
 252                 // The user's 'contextlines' / 'contextchars' options are not consulted:
 253                 $contextlines = 2; // Hardcode this. Old defaults sucked. :)
 254                 $contextchars = 75; // same as above.... :P
 255                 return array($contextlines, $contextchars);
 256         }
 257
 258         /**
 259          * An array of namespaces indexes to be searched by default
 260          *
 261          * @return array
 262          * @static
 263          */
 264         public static function defaultNamespaces(){
 265                 global $wgNamespacesToBeSearchedDefault;
 266
 267                 return array_keys($wgNamespacesToBeSearchedDefault, true);
 268         }
 269
 270         /**
 271          * Return a 'cleaned up' search string (characters outside legalSearchChars() become spaces)
 272          * @param string $text
 273          * @return string
 274          * @access public
 275          */
 276         function filter( $text ) {
 277                 $lc = $this->legalSearchChars();
 278                 return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
 279         }
 280
 281         /**
 282          * Load up the appropriate search engine class for the currently
 283          * active database backend, and return a configured instance.
 284          *
 285          * @return SearchEngine
 286          */
 287         public static function create() {
 288                 global $wgDBtype, $wgSearchType;
 289                 if( $wgSearchType ) {
 290                         $class = $wgSearchType;
 291                 } elseif( $wgDBtype == 'mysql' ) {
 292                         $class = 'SearchMySQL';
 293                 } elseif( $wgDBtype == 'postgres' ) {
 294                         $class = 'SearchPostgres';
 295                 } elseif( $wgDBtype == 'oracle' ) {
 296                         $class = 'SearchOracle';
 297                 } else {
 298                         $class = 'SearchEngineDummy';
 299                 }
 300                 $search = new $class( wfGetDB( DB_SLAVE ) );
 301                 $search->setLimitOffset(0,0);
 302                 return $search;
 303         }
 304
 305         /**
 306          * Create or update the search index record for the given page.
 307          * Title and text should be pre-processed.
 308          *
 309          * @param int $id
 310          * @param string $title
 311          * @param string $text
 312          * @abstract
 313          */
 314         function update( $id, $title, $text ) {
 315                 // no-op
 316         }
 317
 318         /**
 319          * Update a search index record's title only.
 320          * Title should be pre-processed.
 321          *
 322          * @param int $id
 323          * @param string $title
 324          * @abstract
 325          */
 326         function updateTitle( $id, $title ) {
 327                 // no-op
 328         }
 329
 330         /**
 331          * Get OpenSearch suggestion template
 332          *
 333          * @return string
 334          * @static
 335          */
 336         public static function getOpenSearchTemplate() {
 337                 global $wgOpenSearchTemplate, $wgServer, $wgScriptPath;
 338                 if($wgOpenSearchTemplate)
 339                         return $wgOpenSearchTemplate;
 340                 else{
 341                         $ns = implode(',',SearchEngine::defaultNamespaces());
 342                         if(!$ns) $ns = "0";
 343                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace='.$ns;
 344                 }
 345         }
 346
 347         /**
 348          * Get internal MediaWiki Suggest template
 349          *
 350          * @return string
 351          * @static
 352          */
 353         public static function getMWSuggestTemplate() {
 354                 global $wgMWSuggestTemplate, $wgServer, $wgScriptPath;
 355                 if($wgMWSuggestTemplate)
 356                         return $wgMWSuggestTemplate;
 357                 else
 358                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}';
 359         }
 360 }
 361
 362
 363 /**
 364  * @addtogroup Search
 365  */
 366 class SearchResultSet {
 367         /**
 368          * Regular expression fragments that match the search terms, as parsed
 369          * by this engine, for use on a text extract.
 370          *
 371          * @return array Empty in this base class
 372          * @access public
 373          * @abstract
 374          */
 375         public function termMatches() {
 376                 return array();
 377         }
 378
 379         /** @return int Always 0 in this base class */
 380         public function numRows() {
 381                 return 0;
 382         }
 383
 384         /**
 385          * Tell whether this result set includes any results.
 386          * @return bool False in this base class
 387          * @abstract
 388          */
 389         public function hasResults() {
 390                 return false;
 391         }
 392
 393         /**
 394          * Total hit count for the query over the entire article database,
 395          * as reported by some search modes. The count may include pages in
 396          * namespaces that would not be matched on the given settings.
 397          *
 398          * @return int Null if no total hits number is supported
 399          * @access public
 400          */
 401         public function getTotalHits() {
 402                 return null;
 403         }
 404
 405         /**
 406          * Some search modes offer a suggested alternate term when there are
 407          * no exact hits; this reports whether the set carries one.
 408          *
 409          * @return bool
 410          * @access public
 411          */
 412         public function hasSuggestion() {
 413                 return false;
 414         }
 415
 416         /**
 417          * @return string The suggested query, null if none
 418          */
 419         public function getSuggestionQuery(){
 420                 return null;
 421         }
 422
 423         /**
 424          * @return string The suggested query with highlighting, '' if none
 425          */
 426         public function getSuggestionSnippet(){
 427                 return '';
 428         }
 429
 430         /**
 431          * Information on how and from where the results were fetched;
 432          * meant to help with diagnostics and debugging.
 433          *
 434          * @return string Null in this base class
 435          */
 436         public function getInfo() {
 437                 return null;
 438         }
 439
 440         /**
 441          * Result set of hits on other (multiple) wikis associated with this one.
 442          *
 443          * @return SearchResultSet Null in this base class
 444          */
 445         public function getInterwikiResults() {
 446                 return null;
 447         }
 448
 449         /**
 450          * Tell whether there are results on other wikis.
 451          *
 452          * @return boolean
 453          */
 454         public function hasInterwikiResults() {
 455                 $interwiki = $this->getInterwikiResults();
 456                 return $interwiki != null;
 457         }
 458
 459         /**
 460          * Fetch the next search result.
 461          *
 462          * @return SearchResult False when there is none; always false in this base class
 463          * @access public
 464          * @abstract
 465          */
 466         public function next() {
 467                 return false;
 468         }
 469
 470         /**
 471          * Free the result set, if applicable.
 472          * Nothing is released in this base class.
 473          * @access public
 474          */
 475         public function free() {
 476                 // ...
 477         }
 478 }
 479
 480
 481 /**
 482  * @addtogroup Search
 483  */
 484 class SearchResultTooMany {
 485         // Some search engines may bail out if too many matches are found; this class has no members.
 486 }
 487
 488
 489 /**
 490  * @addtogroup Search
 491  */
 492 class SearchResult {
 493         var $mRevision = null;
 494         var $mTitle = null;
 495         var $mText = null;
 496
 497         /** @param object $row Row providing page_namespace and page_title */
 498         function SearchResult( $row ) {
 499                 $this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
 500                 if( !is_null($this->mTitle) )
 501                         $this->mRevision = Revision::newFromTitle( $this->mTitle );
 502         }
 503
 504         /**
 505          * Check if this result points to an invalid title
 506          *
 507          * @return boolean
 508          * @access public
 509          */
 510         function isBrokenTitle(){
 511                 return is_null($this->mTitle);
 512         }
 513
 514         /**
 515          * Check if target page is missing, happens when index is out of date
 516          *
 517          * @return boolean
 518          * @access public
 519          */
 520         function isMissingRevision(){
 521                 return !$this->mRevision;
 522         }
 523
 524         /**
 525          * @return Title
 526          * @access public
 527          */
 528         function getTitle() {
 529                 return $this->mTitle;
 530         }
 531
 532         /**
 533          * @return double or null if not supported
 534          */
 535         function getScore() {
 536                 return null;
 537         }
 538
 539         /**
 540          * Lazy initialization of article text from DB.
 541          * Falls back to an empty string when there is no revision to read from.
 542          */
 543         protected function initText(){
 544                 if( !isset($this->mText) ){
 545                         $this->mText = $this->mRevision ? $this->mRevision->getText() : '';
 546                 }
 547         }
 548
 549         /**
 550          * @param array $terms Terms to highlight (unescaped)
 551          * @return string highlighted text snippet, null (and not '') if not supported
 552          */
 553         function getTextSnippet($terms){
 554                 global $wgUser;
 555                 $this->initText();
 556                 list($contextlines,$contextchars) = SearchEngine::userHighlightPrefs($wgUser);
 557                 $h = new SearchHighlighter();
 558                 return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars);
 559         }
 560
 561         /**
 562          * @param array $terms terms to highlight
 563          * @return string highlighted title, '' if not supported
 564          */
 565         function getTitleSnippet($terms){
 566                 return '';
 567         }
 568
 569         /**
 570          * @param array $terms terms to highlight
 571          * @return string highlighted redirect name (redirect to this page), '' if none or not supported
 572          */
 573         function getRedirectSnippet($terms){
 574                 return '';
 575         }
 576
 577         /**
 578          * @return Title object for the redirect to this page, null if none or not supported
 579          */
 580         function getRedirectTitle(){
 581                 return null;
 582         }
 583
 584         /**
 585          * @return string highlighted relevant section name, '' if none or not supported
 586          */
 587         function getSectionSnippet(){
 588                 return '';
 589         }
 590
 591         /**
 592          * @return Title object (pagename+fragment) for the section, null if none or not supported
 593          */
 594         function getSectionTitle(){
 595                 return null;
 596         }
 597
 598         /**
 599          * @return string timestamp, null if the revision is missing
 600          */
 601         function getTimestamp(){
 602                 return $this->mRevision ? $this->mRevision->getTimestamp() : null;
 603         }
 604
 605         /**
 606          * @return int number of words
 607          */
 608         function getWordCount(){
 609                 $this->initText();
 610                 return str_word_count( $this->mText );
 611         }
 612
 613         /**
 614          * @return int size in bytes
 615          */
 616         function getByteSize(){
 617                 $this->initText();
 618                 return strlen( $this->mText );
 619         }
 620
 621         /**
 622          * @return boolean if hit has related articles
 623          */
 624         function hasRelated(){
 625                 return false;
 626         }
 627
 628         /**
 629          * @return string interwiki prefix of the title (return iw even if title is broken)
 630          */
 631         function getInterwikiPrefix(){
 632                 return '';
 633         }
 634 }
 635
 636 /**
 637  * Highlight bits of wikitext
 638  *
 639  * @addtogroup Search
 640  */
 641 class SearchHighlighter {
 642         var $mCleanWikitext = true;
 643
 644         function SearchHighlighter($cleanupWikitext = true){
 645                 $this->mCleanWikitext = $cleanupWikitext;
 646         }
 647
 648         /**
 649          * Default implementation of wikitext highlighting
 650          *
 651          * @param string $text
 652          * @param array $terms Terms to highlight (unescaped)
 653          * @param int $contextlines
 654          * @param int $contextchars
 655          * @return string
 656          */
 657         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
 658                 global $wgLang, $wgContLang;
 659                 $fname = __METHOD__;
 660
 661                 if($text == '')
 662                         return '';
 663
 664                 // spli text into text + templates/links/tables
 665                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)/";
 666                 // first capture group is for detecting nested templates/links/tables
 667                 $endPatterns = array(
 668                         1 => '/(\{\{)|(\}\})/', // template
 669                         2 => '/(\[\[)|(\]\])/', // image
 670                         3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table
 671                 $textExt = array(); // text extracts
 672                 $otherExt = array();  // other extracts
 673                 wfProfileIn( "$fname-split" );
 674                 $start = 0;
 675                 $textLen = strlen($text);
 676                 $count = 0; // sequence number to maintain ordering
 677                 while( $start < $textLen ){
 678                         // find start of template/image/table
 679                         if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){
 680                                 $epat = '';
 681                                 foreach($matches as $key => $val){
 682                                         if($key > 0 && $val[1] != -1){
 683                                                 if($key == 2){
 684                                                         // see if this is an image link
 685                                                         $ns = substr($val[0],2,-1);
 686                                                         if( $wgContLang->getNsIndex($ns) != NS_IMAGE )
 687                                                                 break;
 688
 689                                                 }
 690                                                 $epat = $endPatterns[$key];
 691                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
 692                                                 $start = $val[1];
 693                                                 break;
 694                                         }
 695                                 }
 696                                 if( $epat ){
 697                                         // find end (and detect any nested elements)
 698                                         $level = 0;
 699                                         $offset = $start + 1;
 700                                         $found = false;
 701                                         while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){
 702                                                 if( array_key_exists(2,$endMatches) ){
 703                                                         // found end
 704                                                         if($level == 0){
 705                                                                 $len = strlen($endMatches[2][0]);
 706                                                                 $off = $endMatches[2][1];
 707                                                                 $this->splitAndAdd( $otherExt, $count,
 708                                                                         substr( $text, $start, $off + $len  - $start ) );
 709                                                                 $start = $off + $len;
 710                                                                 $found = true;
 711                                                                 break;
 712                                                         } else{
 713                                                                 // end of nested element
 714                                                                 $level -= 1;
 715                                                         }
 716                                                 } else{
 717                                                         // nested
 718                                                         $level += 1;
 719                                                 }
 720                                                 $offset = $endMatches[0][1] + strlen($endMatches[0][0]);
 721                                         }
 722                                         if( ! $found ){
 723                                                 // couldn't find appropriate closing tag, skip
 724                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) );
 725                                                 $start += strlen($matches[0][0]);
 726                                         }
 727                                         continue;
 728                                 }
 729                         }
 730                         // else: add as text extract
 731                         $this->splitAndAdd( $textExt, $count, substr($text,$start) );
 732                         break;
 733                 }
 734
 735                 $all = $textExt + $otherExt; // these have disjunct key sets
 736
 737                 wfProfileOut( "$fname-split" );
 738
 739                 // prepare regexps
 740                 foreach( $terms as $index => $term ) {
 741                         $terms[$index] = preg_quote( $term, '/' );
 742                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
 743                         if(preg_match('/[\x80-\xff]/', $term) ){
 744                                 $terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]);
 745                         }
 746
 747
 748                 }
 749                 $anyterm = implode( '|', $terms );
 750                 $phrase = implode('[, .:;\(\)"\'\-\+]+', $terms );
 751
 752                 // FIXME: a hack to scale contextchars, a correct solution
 753                 // would be to have contextchars actually be char and not byte
 754                 // length, and do proper utf-8 substrings and lengths everywhere,
 755                 // but PHP is making that very hard and unclean to implement :(
 756                 $scale = strlen($anyterm) / mb_strlen($anyterm);
 757                 $contextchars = intval( $contextchars * $scale );
 758
 759                 $pat1 = '/('.$phrase.')/ui';
 760                 $pat2 = '/('.$anyterm.')/ui';
 761
 762                 wfProfileIn( "$fname-extract" );
 763
 764                 $left = $contextlines;
 765
 766                 $snippets = array();
 767                 $offsets = array();
 768                 // match whole query on text
 769                 $this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets);
 770                 // match whole query on templates/tables/images
 771                 $this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets);
 772                 // match any words on text
 773                 $this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets);
 774                 // match any words on templates/tables/images
 775                 $this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets);
 776
 777                 ksort($snippets);
 778
 779                 $first = array_keys($textExt);
 780                 if( isset($first[0]))
 781                         $first = $first[0];
 782                 else
 783                         $first = 0;
 784
 785                 // add extra chars to each snippet to make snippets constant size
 786                 $extended = array();
 787                 if( count( $snippets ) == 0){
 788                         // couldn't find the target words, just show beginning of article
 789                         $targetchars = $contextchars * $contextlines;
 790                         $snippets[$first] = '';
 791                         $offsets[$first] = 0;
 792                 } else{
 793                         // if begin of the article contains the whole phrase, show only that !!
 794                         if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first])
 795                             && $offsets[$first] < $contextchars * 2 ){
 796                                 $snippets = array ($first => $snippets[$first]);
 797                         }
 798
 799                         // calc by how much to extend existing snippets
 800                         $targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) );
 801                 }
 802
 803                 foreach($snippets as $index => $line){
 804                         $extended[$index] = $line;
 805                         $len = strlen($line);
 806                         if( $len < $targetchars - 20 ){
 807                                 // complete this line
 808                                 if($len < strlen( $all[$index] )){
 809                                         $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]);
 810                                         $len = strlen( $extended[$index] );
 811                                 }
 812
 813                                 // add more lines
 814                                 $add = $index + 1;
 815                                 while( $len < $targetchars - 20
 816                                        && array_key_exists($add,$all)
 817                                        && !array_key_exists($add,$snippets) ){
 818                                     $offsets[$add] = 0;
 819                                     $tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
 820                                         $extended[$add] = $tt;
 821                                         $len += strlen( $tt );
 822                                         $add++;
 823                                 }
 824                         }
 825                 }
 826
 827                 $snippets = array_map('htmlspecialchars', $extended);
 828                 $last = -1;
 829                 $extract = '';
 830                 foreach($snippets as $index => $line){
 831                         if($last == -1)
 832                                 $extract .= $line; // first line
 833                         elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last]))
 834                                 $extract .= " ".$line; // continous lines
 835                         else
 836                                 $extract .= '<b> ... </b>' . $line;
 837
 838                         $last = $index;
 839                 }
 840                 if( $extract )
 841                         $extract .= '<b> ... </b>';
 842
 843                 // highlight words
 844                 $pat3 = '/(' . $anyterm . ")/ui";
 845                 $extract = preg_replace( $pat3,
 846                           "<span class='searchmatch'>\\1</span>", $extract );
 847
 848                 wfProfileOut( "$fname-extract" );
 849
 850                 return $extract;
 851         }
 852
 853         /**
 854          * Split text into lines and add it to extracts array
 855          *
 856          * @param array $extracts index -> $line
 857          * @param int $count
 858          * @param string $text
 859          */
 860         function splitAndAdd(&$extracts, &$count, $text){
 861                 $split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text );
 862                 foreach($split as $line){
 863                         $tt = trim($line);
 864                         if( $tt )
 865                                 $extracts[$count++] = $tt;
 866                 }
 867         }
 868
 869         /**
 870          * Do manual case conversion for non-ascii chars
 871          *
 872          * @param unknown_type $matches
 873          */
 874         function caseCallback($matches){
 875                 global $wgContLang;
 876                 if( strlen($matches[0]) > 1 ){
 877                         return '['.$wgContLang->lc($matches[0]).$wgContLang->uc($matches[0]).']';
 878                 } else
 879                         return $matches[0];
 880         }
 881
 882         /**
 883          * Extract part of the text from start to end, but by
 884          * not chopping up words
 885          * @param string $text
 886          * @param int $start
 887          * @param int $end
 888          * @param int $posStart (out) actual start position
 889          * @param int $posEnd (out) actual end position
 890          * @return string
 891          */
 892         function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){
 893                 global $wgContLang;
 894
 895                 if( $start != 0)
 896                         $start = $this->position( $text, $start, 1 );
 897                 if( $end >= strlen($text) )
 898                         $end = strlen($text);
 899                 else
 900                         $end = $this->position( $text, $end );
 901
 902                 if(!is_null($posStart))
 903                         $posStart = $start;
 904                 if(!is_null($posEnd))
 905                         $posEnd = $end;
 906
 907                 if($end > $start)
 908                         return substr($text, $start, $end-$start);
 909                 else
 910                         return '';
 911         }
 912
 913         /**
 914          * Find a nonletter near a point (index) in the text
 915          *
 916          * @param string $text
 917          * @param int $point
 918          * @param int $offset to found index
 919          * @return int nearest nonletter index, or beginning of utf8 char if none
 920          */
 921         function position($text, $point, $offset=0 ){
 922                 $tolerance = 10;
 923                 $s = max( 0, $point - $tolerance );
 924                 $l = min( strlen($text), $point + $tolerance ) - $s;
 925                 $m = array();
 926                 if( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){
 927                         return $m[0][1] + $s + $offset;
 928                 } else{
 929                         // check if point is on a valid first UTF8 char
 930                         $char = ord( $text[$point] );
 931                         while( $char >= 0x80 && $char < 0xc0 ) {
 932                                 // skip trailing bytes
 933                                 $point++;
 934                                 if($point >= strlen($text))
 935                                         return strlen($text);
 936                                 $char = ord( $text[$point] );
 937                         }
 938                         return $point;
 939
 940                 }
 941         }
 942
 943         /**
 944          * Search extracts for a pattern, and return snippets
 945          *
 946          * @param string $pattern regexp for matching lines
 947          * @param array $extracts extracts to search
 948          * @param int $linesleft number of extracts to make
 949          * @param int $contextchars length of snippet
 950          * @param array $out map for highlighted snippets
 951          * @param array $offsets map of starting points of snippets
 952          * @protected
 953          */
 954         function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){
 955                 if($linesleft == 0)
 956                         return; // nothing to do
 957                 foreach($extracts as $index => $line){
 958                         if( array_key_exists($index,$out) )
 959                                 continue; // this line already highlighted
 960
 961                         $m = array();
 962                         if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) )
 963                                 continue;
 964
 965                         $offset = $m[0][1];
 966                         $len = strlen($m[0][0]);
 967                         if($offset + $len < $contextchars)
 968                                 $begin = 0;
 969                         elseif( $len > $contextchars)
 970                                 $begin = $offset;
 971                         else
 972                                 $begin = $offset + intval( ($len - $contextchars) / 2 );
 973
 974                         $end = $begin + $contextchars;
 975
 976                         $posBegin = $begin;
 977                         // basic snippet from this line
 978                         $out[$index] = $this->extract($line,$begin,$end,$posBegin);
 979                         $offsets[$index] = $posBegin;
 980                         $linesleft--;
 981                         if($linesleft == 0)
 982                                 return;
 983                 }
 984         }
 985
 986         /**
 987          * Basic wikitext removal
 988          * @protected
 989          */
 990         function removeWiki($text) {
 991                 $fname = __METHOD__;
 992                 wfProfileIn( $fname );
 993
 994                 //$text = preg_replace("/'{2,5}/", "", $text);
 995                 //$text = preg_replace("/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text);
 996                 //$text = preg_replace("/\[\[([^]|]+)\]\]/", "\\1", $text);
 997                 //$text = preg_replace("/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text);
 998                 //$text = preg_replace("/\\{\\|(.*?)\\|\\}/", "", $text);
 999                 //$text = preg_replace("/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text);
1000                 $text = preg_replace("/\\{\\{([^|]+?)\\}\\}/", "", $text);
1001                 $text = preg_replace("/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text);
1002                 $text = preg_replace("/\\[\\[([^|]+?)\\]\\]/", "\\1", $text);
1003                 $text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text);
1004                 //$text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text);
1005                 $text = preg_replace("/<\/?[^>]+>/", "", $text);
1006                 $text = preg_replace("/'''''/", "", $text);
1007                 $text = preg_replace("/('''|<\/?[iIuUbB]>)/", "", $text);
1008                 $text = preg_replace("/''/", "", $text);
1009
1010                 wfProfileOut( $fname );
1011                 return $text;
1012         }
1013
1014         /**
1015          * callback to replace [[target|caption]] kind of links, if
1016          * the target is category or image, leave it
1017          *
1018          * @param array $matches
1019          */
1020         function linkReplace($matches){
1021                 $colon = strpos( $matches[1], ':' );
1022                 if( $colon === false )
1023                         return $matches[2]; // replace with caption
1024                 global $wgContLang;
1025                 $ns = substr( $matches[1], 0, $colon );
1026                 $index = $wgContLang->getNsIndex($ns);
1027                 if( $index !== false && ($index == NS_IMAGE || $index == NS_CATEGORY) )
1028                         return $matches[0]; // return the whole thing
1029                 else
1030                         return $matches[2];
1031
1032         }
1033 }
1034
/**
 * Stand-in search engine: search() returns null and every other method
 * has an empty body.
 *
 * @addtogroup Search
 */
class SearchEngineDummy {
        /**
         * @param mixed $term unused
         * @return null always
         */
        function search( $term ) {
                return null;
        }

        /**
         * Empty stub.
         *
         * @param mixed $l unused
         * @param mixed $o unused
         */
        function setLimitOffset($l, $o) {
        }

        /** Empty stub. */
        function legalSearchChars() {
        }

        /** Empty stub. */
        function update() {
        }

        /** Empty stub. */
        function setnamespaces() {
        }

        /** Empty stub. */
        function searchtitle() {
        }

        /** Empty stub. */
        function searchtext() {
        }
}