includes/SearchEngine.php

   1 <?php
   2 /**
   3  * @defgroup Search Search
   4  *
   5  * @file
   6  * @ingroup Search
   7  */
   8
   9 /**
  10  * Contain a class for special pages
  11  * @ingroup Search
  12  */
  13 class SearchEngine {
  14         var $limit = 10;
  15         var $offset = 0;
  16         var $searchTerms = array();
  17         var $namespaces = array( NS_MAIN );
  18         var $showRedirects = false;
  19
  20         /**
  21          * Perform a full text search query and return a result set.
  22          * If title searches are not supported or disabled, return null.
  23          *
  24          * @param string $term - Raw search term
  25          * @return SearchResultSet
  26          * @access public
  27          * @abstract
  28          */
  29         function searchText( $term ) {
  30                 return null;
  31         }
  32
  33         /**
  34          * Perform a title-only search query and return a result set.
  35          * If title searches are not supported or disabled, return null.
  36          *
  37          * @param string $term - Raw search term
  38          * @return SearchResultSet
  39          * @access public
  40          * @abstract
  41          */
  42         function searchTitle( $term ) {
  43                 return null;
  44         }
  45
  46         /** If this search backend can list/unlist redirects */
  47         function acceptListRedirects() {
  48                 return true;
  49         }
  50
  51         /**
  52          * If an exact title match can be find, or a very slightly close match,
  53          * return the title. If no match, returns NULL.
  54          *
  55          * @param string $term
  56          * @return Title
  57          */
  58         public static function getNearMatch( $searchterm ) {
  59                 global $wgContLang;
  60
  61                 $allSearchTerms = array($searchterm);
  62
  63                 if($wgContLang->hasVariants()){
  64                         $allSearchTerms = array_merge($allSearchTerms,$wgContLang->convertLinkToAllVariants($searchterm));
  65                 }
  66
  67                 foreach($allSearchTerms as $term){
  68
  69                         # Exact match? No need to look further.
  70                         $title = Title::newFromText( $term );
  71                         if (is_null($title))
  72                                 return NULL;
  73
  74                         if ( $title->getNamespace() == NS_SPECIAL || $title->isExternal()
  75                              || $title->exists() ) {
  76                                 return $title;
  77                         }
  78
  79                         # Now try all lower case (i.e. first letter capitalized)
  80                         #
  81                         $title = Title::newFromText( $wgContLang->lc( $term ) );
  82                         if ( $title && $title->exists() ) {
  83                                 return $title;
  84                         }
  85
  86                         # Now try capitalized string
  87                         #
  88                         $title = Title::newFromText( $wgContLang->ucwords( $term ) );
  89                         if ( $title && $title->exists() ) {
  90                                 return $title;
  91                         }
  92
  93                         # Now try all upper case
  94                         #
  95                         $title = Title::newFromText( $wgContLang->uc( $term ) );
  96                         if ( $title && $title->exists() ) {
  97                                 return $title;
  98                         }
  99
 100                         # Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
 101                         $title = Title::newFromText( $wgContLang->ucwordbreaks($term) );
 102                         if ( $title && $title->exists() ) {
 103                                 return $title;
 104                         }
 105
 106                         // Give hooks a chance at better match variants
 107                         $title = null;
 108                         if( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
 109                                 return $title;
 110                         }
 111                 }
 112
 113                 $title = Title::newFromText( $searchterm );
 114
 115                 # Entering an IP address goes to the contributions page
 116                 if ( ( $title->getNamespace() == NS_USER && User::isIP($title->getText() ) )
 117                         || User::isIP( trim( $searchterm ) ) ) {
 118                         return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
 119                 }
 120
 121
 122                 # Entering a user goes to the user page whether it's there or not
 123                 if ( $title->getNamespace() == NS_USER ) {
 124                         return $title;
 125                 }
 126
 127                 # Go to images that exist even if there's no local page.
 128                 # There may have been a funny upload, or it may be on a shared
 129                 # file repository such as Wikimedia Commons.
 130                 if( $title->getNamespace() == NS_IMAGE ) {
 131                         $image = wfFindFile( $title );
 132                         if( $image ) {
 133                                 return $title;
 134                         }
 135                 }
 136
 137                 # MediaWiki namespace? Page may be "implied" if not customized.
 138                 # Just return it, with caps forced as the message system likes it.
 139                 if( $title->getNamespace() == NS_MEDIAWIKI ) {
 140                         return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
 141                 }
 142
 143                 # Quoted term? Try without the quotes...
 144                 $matches = array();
 145                 if( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
 146                         return SearchEngine::getNearMatch( $matches[1] );
 147                 }
 148
 149                 return NULL;
 150         }
 151
 152         public static function legalSearchChars() {
 153                 return "A-Za-z_'0-9\\x80-\\xFF\\-";
 154         }
 155
 156         /**
 157          * Set the maximum number of results to return
 158          * and how many to skip before returning the first.
 159          *
 160          * @param int $limit
 161          * @param int $offset
 162          * @access public
 163          */
 164         function setLimitOffset( $limit, $offset = 0 ) {
 165                 $this->limit = intval( $limit );
 166                 $this->offset = intval( $offset );
 167         }
 168
 169         /**
 170          * Set which namespaces the search should include.
 171          * Give an array of namespace index numbers.
 172          *
 173          * @param array $namespaces
 174          * @access public
 175          */
 176         function setNamespaces( $namespaces ) {
 177                 $this->namespaces = $namespaces;
 178         }
 179
 180         /**
 181          * Parse some common prefixes: all (search everything)
 182          * or namespace names
 183          *
 184          * @param string $query
 185          */
 186         function replacePrefixes( $query ){
 187                 global $wgContLang;
 188
 189                 if( strpos($query,':') === false )
 190                         return $query; // nothing to do
 191
 192                 $parsed = $query;
 193                 $allkeyword = wfMsgForContent('searchall').":";
 194                 if( strncmp($query, $allkeyword, strlen($allkeyword)) == 0 ){
 195                         $this->namespaces = null;
 196                         $parsed = substr($query,strlen($allkeyword));
 197                 } else if( strpos($query,':') !== false ) {
 198                         $prefix = substr($query,0,strpos($query,':'));
 199                         $index = $wgContLang->getNsIndex($prefix);
 200                         if($index !== false){
 201                                 $this->namespaces = array($index);
 202                                 $parsed = substr($query,strlen($prefix)+1);
 203                         }
 204                 }
 205                 if(trim($parsed) == '')
 206                         return $query; // prefix was the whole query
 207
 208                 return $parsed;
 209         }
 210
 211         /**
 212          * Make a list of searchable namespaces and their canonical names.
 213          * @return array
 214          */
 215         public static function searchableNamespaces() {
 216                 global $wgContLang;
 217                 $arr = array();
 218                 foreach( $wgContLang->getNamespaces() as $ns => $name ) {
 219                         if( $ns >= NS_MAIN ) {
 220                                 $arr[$ns] = $name;
 221                         }
 222                 }
 223                 return $arr;
 224         }
 225
 226         /**
 227          * Extract default namespaces to search from the given user's
 228          * settings, returning a list of index numbers.
 229          *
 230          * @param User $user
 231          * @return array
 232          * @static
 233          */
 234         public static function userNamespaces( &$user ) {
 235                 $arr = array();
 236                 // for logged-in users use predefined defaults
 237                 if( $user->isLoggedIn() && $user->getOption( 'defaultusersearch', true ) )
 238                         return SearchEngine::projectNamespaces();
 239
 240                 foreach( SearchEngine::searchableNamespaces() as $ns => $name ) {
 241                         if( $user->getOption( 'searchNs' . $ns ) ) {
 242                                 $arr[] = $ns;
 243                         }
 244                 }
 245                 return $arr;
 246         }
 247
 248         /**
 249          * Find snippet highlight settings for a given user
 250          *
 251          * @param User $user
 252          * @return array contextlines, contextchars
 253          * @static
 254          */
 255         public static function userHighlightPrefs( &$user ){
 256                 //$contextlines = $user->getOption( 'contextlines',  5 );
 257                 //$contextchars = $user->getOption( 'contextchars', 50 );
 258                 $contextlines = 2; // Hardcode this. Old defaults sucked. :)
 259                 $contextchars = 75; // same as above.... :P
 260                 return array($contextlines, $contextchars);
 261         }
 262
 263         /**
 264          * An array of namespaces indexes to be searched by default
 265          *
 266          * @return array
 267          * @static
 268          */
 269         public static function defaultNamespaces(){
 270                 global $wgNamespacesToBeSearchedDefault;
 271
 272                 return array_keys($wgNamespacesToBeSearchedDefault, true);
 273         }
 274
 275         /**
 276          * Get a list of namespace names useful for showing in tooltips
 277          * and preferences
 278          *
 279          * @param unknown_type $namespaces
 280          */
 281         public static function namespacesAsText( $namespaces ){
 282                 global $wgContLang;
 283
 284                 $formatted = array_map( array($wgContLang,'getFormattedNsText'), $namespaces );
 285                 foreach( $formatted as $key => $ns ){
 286                         if ( empty($ns) )
 287                                 $formatted[$key] = wfMsg( 'blanknamespace' );
 288                 }
 289                 return $formatted;
 290         }
 291
 292         /**
 293          * An array of "project" namespaces indexes typically searched
 294          * by logged-in users
 295          *
 296          * @return array
 297          * @static
 298          */
 299         public static function projectNamespaces(){
 300                 global $wgNamespacesToBeSearchedDefault, $wgNamespacesToBeSearchedProject;
 301
 302                 return array_keys( $wgNamespacesToBeSearchedDefault +
 303                         $wgNamespacesToBeSearchedProject, true);
 304         }
 305
 306         /**
 307          * Return a 'cleaned up' search string
 308          *
 309          * @return string
 310          * @access public
 311          */
 312         function filter( $text ) {
 313                 $lc = $this->legalSearchChars();
 314                 return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
 315         }
 316         /**
 317          * Load up the appropriate search engine class for the currently
 318          * active database backend, and return a configured instance.
 319          *
 320          * @return SearchEngine
 321          */
 322         public static function create() {
 323                 global $wgSearchType;
 324                 $dbr = wfGetDB( DB_SLAVE );
 325                 if( $wgSearchType ) {
 326                         $class = $wgSearchType;
 327                 } else {
 328                         $class = $dbr->getSearchEngine();
 329                 }
 330                 $search = new $class( $dbr );
 331                 $search->setLimitOffset(0,0);
 332                 return $search;
 333         }
 334
 335         /**
 336          * Create or update the search index record for the given page.
 337          * Title and text should be pre-processed.
 338          *
 339          * @param int $id
 340          * @param string $title
 341          * @param string $text
 342          * @abstract
 343          */
 344         function update( $id, $title, $text ) {
 345                 // no-op
 346         }
 347
 348         /**
 349          * Update a search index record's title only.
 350          * Title should be pre-processed.
 351          *
 352          * @param int $id
 353          * @param string $title
 354          * @abstract
 355          */
 356         function updateTitle( $id, $title ) {
 357                 // no-op
 358         }
 359
 360         /**
 361          * Get OpenSearch suggestion template
 362          *
 363          * @return string
 364          * @static
 365          */
 366         public static function getOpenSearchTemplate() {
 367                 global $wgOpenSearchTemplate, $wgServer, $wgScriptPath;
 368                 if($wgOpenSearchTemplate)
 369                         return $wgOpenSearchTemplate;
 370                 else{
 371                         $ns = implode(',',SearchEngine::defaultNamespaces());
 372                         if(!$ns) $ns = "0";
 373                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace='.$ns;
 374                 }
 375         }
 376
 377         /**
 378          * Get internal MediaWiki Suggest template
 379          *
 380          * @return string
 381          * @static
 382          */
 383         public static function getMWSuggestTemplate() {
 384                 global $wgMWSuggestTemplate, $wgServer, $wgScriptPath;
 385                 if($wgMWSuggestTemplate)
 386                         return $wgMWSuggestTemplate;
 387                 else
 388                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}';
 389         }
 390
 391 }
 392
 393 /**
 394  * @ingroup Search
 395  */
 396 class SearchResultSet {
 397         /**
 398          * Fetch an array of regular expression fragments for matching
 399          * the search terms as parsed by this engine in a text extract.
 400          *
 401          * @return array
 402          * @access public
 403          * @abstract
 404          */
 405         function termMatches() {
 406                 return array();
 407         }
 408
 409         function numRows() {
 410                 return 0;
 411         }
 412
 413         /**
 414          * Return true if results are included in this result set.
 415          * @return bool
 416          * @abstract
 417          */
 418         function hasResults() {
 419                 return false;
 420         }
 421
 422         /**
 423          * Some search modes return a total hit count for the query
 424          * in the entire article database. This may include pages
 425          * in namespaces that would not be matched on the given
 426          * settings.
 427          *
 428          * Return null if no total hits number is supported.
 429          *
 430          * @return int
 431          * @access public
 432          */
 433         function getTotalHits() {
 434                 return null;
 435         }
 436
 437         /**
 438          * Some search modes return a suggested alternate term if there are
 439          * no exact hits. Returns true if there is one on this set.
 440          *
 441          * @return bool
 442          * @access public
 443          */
 444         function hasSuggestion() {
 445                 return false;
 446         }
 447
 448         /**
 449          * @return string suggested query, null if none
 450          */
 451         function getSuggestionQuery(){
 452                 return null;
 453         }
 454
 455         /**
 456          * @return string highlighted suggested query, '' if none
 457          */
 458         function getSuggestionSnippet(){
 459                 return '';
 460         }
 461
 462         /**
 463          * Return information about how and from where the results were fetched,
 464          * should be useful for diagnostics and debugging
 465          *
 466          * @return string
 467          */
 468         function getInfo() {
 469                 return null;
 470         }
 471
 472         /**
 473          * Return a result set of hits on other (multiple) wikis associated with this one
 474          *
 475          * @return SearchResultSet
 476          */
 477         function getInterwikiResults() {
 478                 return null;
 479         }
 480
 481         /**
 482          * Check if there are results on other wikis
 483          *
 484          * @return boolean
 485          */
 486         function hasInterwikiResults() {
 487                 return $this->getInterwikiResults() != null;
 488         }
 489
 490
 491         /**
 492          * Fetches next search result, or false.
 493          * @return SearchResult
 494          * @access public
 495          * @abstract
 496          */
 497         function next() {
 498                 return false;
 499         }
 500
 501         /**
 502          * Frees the result set, if applicable.
 503          * @ access public
 504          */
 505         function free() {
 506                 // ...
 507         }
 508 }
 509
 510
 511 /**
 512  * @ingroup Search
 513  */
 514 class SearchResultTooMany {
 515         ## Some search engines may bail out if too many matches are found
 516 }
 517
 518
 519 /**
 520  * @fixme This class is horribly factored. It would probably be better to have
 521  * a useful base class to which you pass some standard information, then let
 522  * the fancy self-highlighters extend that.
 523  * @ingroup Search
 524  */
 525 class SearchResult {
 526         var $mRevision = null;
 527         var $mImage = null;
 528
 529         function SearchResult( $row ) {
 530                 $this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
 531                 if( !is_null($this->mTitle) ){
 532                         $this->mRevision = Revision::newFromTitle( $this->mTitle );
 533                         if($this->mTitle->getNamespace() == NS_IMAGE)
 534                                 $this->mImage = wfFindFile( $this->mTitle );
 535                 }
 536
 537
 538         }
 539
 540         /**
 541          * Check if this is result points to an invalid title
 542          *
 543          * @return boolean
 544          * @access public
 545          */
 546         function isBrokenTitle(){
 547                 if( is_null($this->mTitle) )
 548                         return true;
 549                 return false;
 550         }
 551
 552         /**
 553          * Check if target page is missing, happens when index is out of date
 554          *
 555          * @return boolean
 556          * @access public
 557          */
 558         function isMissingRevision(){
 559                 if( !$this->mRevision && !$this->mImage )
 560                         return true;
 561                 return false;
 562         }
 563
 564         /**
 565          * @return Title
 566          * @access public
 567          */
 568         function getTitle() {
 569                 return $this->mTitle;
 570         }
 571
 572         /**
 573          * @return double or null if not supported
 574          */
 575         function getScore() {
 576                 return null;
 577         }
 578
 579         /**
 580          * Lazy initialization of article text from DB
 581          */
 582         protected function initText(){
 583                 if( !isset($this->mText) ){
 584                         if($this->mRevision != null)
 585                                 $this->mText = $this->mRevision->getText();
 586                         else
 587                                 $this->mText = '';
 588                 }
 589         }
 590
 591         /**
 592          * @param array $terms terms to highlight
 593          * @return string highlighted text snippet, null (and not '') if not supported
 594          */
 595         function getTextSnippet($terms){
 596                 global $wgUser, $wgAdvancedSearchHighlighting;
 597                 $this->initText();
 598                 list($contextlines,$contextchars) = SearchEngine::userHighlightPrefs($wgUser);
 599                 $h = new SearchHighlighter();
 600                 if( $wgAdvancedSearchHighlighting )
 601                         return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
 602                 else
 603                         return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
 604         }
 605
 606         /**
 607          * @param array $terms terms to highlight
 608          * @return string highlighted title, '' if not supported
 609          */
 610         function getTitleSnippet($terms){
 611                 return '';
 612         }
 613
 614         /**
 615          * @param array $terms terms to highlight
 616          * @return string highlighted redirect name (redirect to this page), '' if none or not supported
 617          */
 618         function getRedirectSnippet($terms){
 619                 return '';
 620         }
 621
 622         /**
 623          * @return Title object for the redirect to this page, null if none or not supported
 624          */
 625         function getRedirectTitle(){
 626                 return null;
 627         }
 628
 629         /**
 630          * @return string highlighted relevant section name, null if none or not supported
 631          */
 632         function getSectionSnippet(){
 633                 return '';
 634         }
 635
 636         /**
 637          * @return Title object (pagename+fragment) for the section, null if none or not supported
 638          */
 639         function getSectionTitle(){
 640                 return null;
 641         }
 642
 643         /**
 644          * @return string timestamp
 645          */
 646         function getTimestamp(){
 647                 if($this->mRevision != null)
 648                         return $this->mRevision->getTimestamp();
 649                 else
 650                         return '';
 651         }
 652
 653         /**
 654          * @return int number of words
 655          */
 656         function getWordCount(){
 657                 $this->initText();
 658                 return str_word_count( $this->mText );
 659         }
 660
 661         /**
 662          * @return int size in bytes
 663          */
 664         function getByteSize(){
 665                 $this->initText();
 666                 return strlen( $this->mText );
 667         }
 668
 669         /**
 670          * @return boolean if hit has related articles
 671          */
 672         function hasRelated(){
 673                 return false;
 674         }
 675
 676         /**
 677          * @return interwiki prefix of the title (return iw even if title is broken)
 678          */
 679         function getInterwikiPrefix(){
 680                 return '';
 681         }
 682 }
 683
 684 /**
 685  * Highlight bits of wikitext
 686  *
 687  * @ingroup Search
 688  */
 689 class SearchHighlighter {
 690         var $mCleanWikitext = true;
 691
 692         function SearchHighlighter($cleanupWikitext = true){
 693                 $this->mCleanWikitext = $cleanupWikitext;
 694         }
 695
 696         /**
 697          * Default implementation of wikitext highlighting
 698          *
 699          * @param string $text
 700          * @param array $terms Terms to highlight (unescaped)
 701          * @param int $contextlines
 702          * @param int $contextchars
 703          * @return string
 704          */
 705         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
 706                 global $wgLang, $wgContLang;
 707                 global $wgSearchHighlightBoundaries;
 708                 $fname = __METHOD__;
 709
 710                 if($text == '')
 711                         return '';
 712
 713                 // spli text into text + templates/links/tables
 714                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
 715                 // first capture group is for detecting nested templates/links/tables/references
 716                 $endPatterns = array(
 717                         1 => '/(\{\{)|(\}\})/', // template
 718                         2 => '/(\[\[)|(\]\])/', // image
 719                         3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table
 720
 721                 // FIXME: this should prolly be a hook or something
 722                 if(function_exists('wfCite')){
 723                         $spat .= '|(<ref>)'; // references via cite extension
 724                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
 725                 }
 726                 $spat .= '/';
 727                 $textExt = array(); // text extracts
 728                 $otherExt = array();  // other extracts
 729                 wfProfileIn( "$fname-split" );
 730                 $start = 0;
 731                 $textLen = strlen($text);
 732                 $count = 0; // sequence number to maintain ordering
 733                 while( $start < $textLen ){
 734                         // find start of template/image/table
 735                         if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){
 736                                 $epat = '';
 737                                 foreach($matches as $key => $val){
 738                                         if($key > 0 && $val[1] != -1){
 739                                                 if($key == 2){
 740                                                         // see if this is an image link
 741                                                         $ns = substr($val[0],2,-1);
 742                                                         if( $wgContLang->getNsIndex($ns) != NS_IMAGE )
 743                                                                 break;
 744
 745                                                 }
 746                                                 $epat = $endPatterns[$key];
 747                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
 748                                                 $start = $val[1];
 749                                                 break;
 750                                         }
 751                                 }
 752                                 if( $epat ){
 753                                         // find end (and detect any nested elements)
 754                                         $level = 0;
 755                                         $offset = $start + 1;
 756                                         $found = false;
 757                                         while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){
 758                                                 if( array_key_exists(2,$endMatches) ){
 759                                                         // found end
 760                                                         if($level == 0){
 761                                                                 $len = strlen($endMatches[2][0]);
 762                                                                 $off = $endMatches[2][1];
 763                                                                 $this->splitAndAdd( $otherExt, $count,
 764                                                                         substr( $text, $start, $off + $len  - $start ) );
 765                                                                 $start = $off + $len;
 766                                                                 $found = true;
 767                                                                 break;
 768                                                         } else{
 769                                                                 // end of nested element
 770                                                                 $level -= 1;
 771                                                         }
 772                                                 } else{
 773                                                         // nested
 774                                                         $level += 1;
 775                                                 }
 776                                                 $offset = $endMatches[0][1] + strlen($endMatches[0][0]);
 777                                         }
 778                                         if( ! $found ){
 779                                                 // couldn't find appropriate closing tag, skip
 780                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) );
 781                                                 $start += strlen($matches[0][0]);
 782                                         }
 783                                         continue;
 784                                 }
 785                         }
 786                         // else: add as text extract
 787                         $this->splitAndAdd( $textExt, $count, substr($text,$start) );
 788                         break;
 789                 }
 790
 791                 $all = $textExt + $otherExt; // these have disjunct key sets
 792
 793                 wfProfileOut( "$fname-split" );
 794
 795                 // prepare regexps
 796                 foreach( $terms as $index => $term ) {
 797                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
 798                         if(preg_match('/[\x80-\xff]/', $term) ){
 799                                 $terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]);
 800                         } else {
 801                                 $terms[$index] = $term;
 802                         }
 803                 }
 804                 $anyterm = implode( '|', $terms );
 805                 $phrase = implode("$wgSearchHighlightBoundaries+", $terms );
 806
 807                 // FIXME: a hack to scale contextchars, a correct solution
 808                 // would be to have contextchars actually be char and not byte
 809                 // length, and do proper utf-8 substrings and lengths everywhere,
 810                 // but PHP is making that very hard and unclean to implement :(
 811                 $scale = strlen($anyterm) / mb_strlen($anyterm);
 812                 $contextchars = intval( $contextchars * $scale );
 813
 814                 $patPre = "(^|$wgSearchHighlightBoundaries)";
 815                 $patPost = "($wgSearchHighlightBoundaries|$)";
 816
 817                 $pat1 = "/(".$phrase.")/ui";
 818                 $pat2 = "/$patPre(".$anyterm.")$patPost/ui";
 819
 820                 wfProfileIn( "$fname-extract" );
 821
 822                 $left = $contextlines;
 823
 824                 $snippets = array();
 825                 $offsets = array();
 826
 827                 // show beginning only if it contains all words
 828                 $first = 0;
 829                 $firstText = '';
 830                 foreach($textExt as $index => $line){
 831                         if(strlen($line)>0 && $line[0] != ';' && $line[0] != ':'){
 832                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
 833                                 $first = $index;
 834                                 break;
 835                         }
 836                 }
 837                 if( $firstText ){
 838                         $succ = true;
 839                         // check if first text contains all terms
 840                         foreach($terms as $term){
 841                                 if( ! preg_match("/$patPre".$term."$patPost/ui", $firstText) ){
 842                                         $succ = false;
 843                                         break;
 844                                 }
 845                         }
 846                         if( $succ ){
 847                                 $snippets[$first] = $firstText;
 848                                 $offsets[$first] = 0;
 849                         }
 850                 }
 851                 if( ! $snippets ) {
 852                         // match whole query on text
 853                         $this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets);
 854                         // match whole query on templates/tables/images
 855                         $this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets);
 856                         // match any words on text
 857                         $this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets);
 858                         // match any words on templates/tables/images
 859                         $this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets);
 860
 861                         ksort($snippets);
 862                 }
 863
 864                 // add extra chars to each snippet to make snippets constant size
 865                 $extended = array();
 866                 if( count( $snippets ) == 0){
 867                         // couldn't find the target words, just show beginning of article
 868                         $targetchars = $contextchars * $contextlines;
 869                         $snippets[$first] = '';
 870                         $offsets[$first] = 0;
 871                 } else{
 872                         // if begin of the article contains the whole phrase, show only that !!
 873                         if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first])
 874                             && $offsets[$first] < $contextchars * 2 ){
 875                                 $snippets = array ($first => $snippets[$first]);
 876                         }
 877
 878                         // calc by how much to extend existing snippets
 879                         $targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) );
 880                 }
 881
 882                 foreach($snippets as $index => $line){
 883                         $extended[$index] = $line;
 884                         $len = strlen($line);
 885                         if( $len < $targetchars - 20 ){
 886                                 // complete this line
 887                                 if($len < strlen( $all[$index] )){
 888                                         $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]);
 889                                         $len = strlen( $extended[$index] );
 890                                 }
 891
 892                                 // add more lines
 893                                 $add = $index + 1;
 894                                 while( $len < $targetchars - 20
 895                                        && array_key_exists($add,$all)
 896                                        && !array_key_exists($add,$snippets) ){
 897                                     $offsets[$add] = 0;
 898                                     $tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
 899                                         $extended[$add] = $tt;
 900                                         $len += strlen( $tt );
 901                                         $add++;
 902                                 }
 903                         }
 904                 }
 905
 906                 //$snippets = array_map('htmlspecialchars', $extended);
 907                 $snippets = $extended;
 908                 $last = -1;
 909                 $extract = '';
 910                 foreach($snippets as $index => $line){
 911                         if($last == -1)
 912                                 $extract .= $line; // first line
 913                         elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last]))
 914                                 $extract .= " ".$line; // continous lines
 915                         else
 916                                 $extract .= '<b> ... </b>' . $line;
 917
 918                         $last = $index;
 919                 }
 920                 if( $extract )
 921                         $extract .= '<b> ... </b>';
 922
 923                 $processed = array();
 924                 foreach($terms as $term){
 925                         if( ! isset($processed[$term]) ){
 926                                 $pat3 = "/$patPre(".$term.")$patPost/ui"; // highlight word
 927                                 $extract = preg_replace( $pat3,
 928                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
 929                                 $processed[$term] = true;
 930                         }
 931                 }
 932
 933                 wfProfileOut( "$fname-extract" );
 934
 935                 return $extract;
 936         }
 937
 938         /**
 939          * Split text into lines and add it to extracts array
 940          *
 941          * @param array $extracts index -> $line
 942          * @param int $count
 943          * @param string $text
 944          */
 945         function splitAndAdd(&$extracts, &$count, $text){
 946                 $split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text );
 947                 foreach($split as $line){
 948                         $tt = trim($line);
 949                         if( $tt )
 950                                 $extracts[$count++] = $tt;
 951                 }
 952         }
 953
 954         /**
 955          * Do manual case conversion for non-ascii chars
 956          *
 957          * @param unknown_type $matches
 958          */
 959         function caseCallback($matches){
 960                 global $wgContLang;
 961                 if( strlen($matches[0]) > 1 ){
 962                         return '['.$wgContLang->lc($matches[0]).$wgContLang->uc($matches[0]).']';
 963                 } else
 964                         return $matches[0];
 965         }
 966
 967         /**
 968          * Extract part of the text from start to end, but by
 969          * not chopping up words
 970          * @param string $text
 971          * @param int $start
 972          * @param int $end
 973          * @param int $posStart (out) actual start position
 974          * @param int $posEnd (out) actual end position
 975          * @return string
 976          */
 977         function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){
 978                 global $wgContLang;
 979
 980                 if( $start != 0)
 981                         $start = $this->position( $text, $start, 1 );
 982                 if( $end >= strlen($text) )
 983                         $end = strlen($text);
 984                 else
 985                         $end = $this->position( $text, $end );
 986
 987                 if(!is_null($posStart))
 988                         $posStart = $start;
 989                 if(!is_null($posEnd))
 990                         $posEnd = $end;
 991
 992                 if($end > $start)
 993                         return substr($text, $start, $end-$start);
 994                 else
 995                         return '';
 996         }
 997
 998         /**
 999          * Find a nonletter near a point (index) in the text
1000          *
1001          * @param string $text
1002          * @param int $point
1003          * @param int $offset to found index
1004          * @return int nearest nonletter index, or beginning of utf8 char if none
1005          */
1006         function position($text, $point, $offset=0 ){
1007                 $tolerance = 10;
1008                 $s = max( 0, $point - $tolerance );
1009                 $l = min( strlen($text), $point + $tolerance ) - $s;
1010                 $m = array();
1011                 if( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){
1012                         return $m[0][1] + $s + $offset;
1013                 } else{
1014                         // check if point is on a valid first UTF8 char
1015                         $char = ord( $text[$point] );
1016                         while( $char >= 0x80 && $char < 0xc0 ) {
1017                                 // skip trailing bytes
1018                                 $point++;
1019                                 if($point >= strlen($text))
1020                                         return strlen($text);
1021                                 $char = ord( $text[$point] );
1022                         }
1023                         return $point;
1024
1025                 }
1026         }
1027
1028         /**
1029          * Search extracts for a pattern, and return snippets
1030          *
1031          * @param string $pattern regexp for matching lines
1032          * @param array $extracts extracts to search
1033          * @param int $linesleft number of extracts to make
1034          * @param int $contextchars length of snippet
1035          * @param array $out map for highlighted snippets
1036          * @param array $offsets map of starting points of snippets
1037          * @protected
1038          */
1039         function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){
1040                 if($linesleft == 0)
1041                         return; // nothing to do
1042                 foreach($extracts as $index => $line){
1043                         if( array_key_exists($index,$out) )
1044                                 continue; // this line already highlighted
1045
1046                         $m = array();
1047                         if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) )
1048                                 continue;
1049
1050                         $offset = $m[0][1];
1051                         $len = strlen($m[0][0]);
1052                         if($offset + $len < $contextchars)
1053                                 $begin = 0;
1054                         elseif( $len > $contextchars)
1055                                 $begin = $offset;
1056                         else
1057                                 $begin = $offset + intval( ($len - $contextchars) / 2 );
1058
1059                         $end = $begin + $contextchars;
1060
1061                         $posBegin = $begin;
1062                         // basic snippet from this line
1063                         $out[$index] = $this->extract($line,$begin,$end,$posBegin);
1064                         $offsets[$index] = $posBegin;
1065                         $linesleft--;
1066                         if($linesleft == 0)
1067                                 return;
1068                 }
1069         }
1070
1071         /**
1072          * Basic wikitext removal
1073          * @protected
1074          */
1075         function removeWiki($text) {
1076                 $fname = __METHOD__;
1077                 wfProfileIn( $fname );
1078
1079                 //$text = preg_replace("/'{2,5}/", "", $text);
1080                 //$text = preg_replace("/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text);
1081                 //$text = preg_replace("/\[\[([^]|]+)\]\]/", "\\1", $text);
1082                 //$text = preg_replace("/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text);
1083                 //$text = preg_replace("/\\{\\|(.*?)\\|\\}/", "", $text);
1084                 //$text = preg_replace("/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text);
1085                 $text = preg_replace("/\\{\\{([^|]+?)\\}\\}/", "", $text);
1086                 $text = preg_replace("/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text);
1087                 $text = preg_replace("/\\[\\[([^|]+?)\\]\\]/", "\\1", $text);
1088                 $text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text);
1089                 //$text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text);
1090                 $text = preg_replace("/<\/?[^>]+>/", "", $text);
1091                 $text = preg_replace("/'''''/", "", $text);
1092                 $text = preg_replace("/('''|<\/?[iIuUbB]>)/", "", $text);
1093                 $text = preg_replace("/''/", "", $text);
1094
1095                 wfProfileOut( $fname );
1096                 return $text;
1097         }
1098
1099         /**
1100          * callback to replace [[target|caption]] kind of links, if
1101          * the target is category or image, leave it
1102          *
1103          * @param array $matches
1104          */
1105         function linkReplace($matches){
1106                 $colon = strpos( $matches[1], ':' );
1107                 if( $colon === false )
1108                         return $matches[2]; // replace with caption
1109                 global $wgContLang;
1110                 $ns = substr( $matches[1], 0, $colon );
1111                 $index = $wgContLang->getNsIndex($ns);
1112                 if( $index !== false && ($index == NS_IMAGE || $index == NS_CATEGORY) )
1113                         return $matches[0]; // return the whole thing
1114                 else
1115                         return $matches[2];
1116
1117         }
1118
1119         /**
1120      * Simple & fast snippet extraction, but gives completely unrelevant
1121      * snippets
1122      *
1123      * @param string $text
1124      * @param array $terms
1125      * @param int $contextlines
1126      * @param int $contextchars
1127      * @return string
1128      */
1129     public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
1130         global $wgLang, $wgContLang;
1131         $fname = __METHOD__;
1132
1133         $lines = explode( "\n", $text );
1134
1135         $terms = implode( '|', $terms );
1136         $max = intval( $contextchars ) + 1;
1137         $pat1 = "/(.*)($terms)(.{0,$max})/i";
1138
1139         $lineno = 0;
1140
1141         $extract = "";
1142         wfProfileIn( "$fname-extract" );
1143         foreach ( $lines as $line ) {
1144             if ( 0 == $contextlines ) {
1145                 break;
1146             }
1147             ++$lineno;
1148             $m = array();
1149             if ( ! preg_match( $pat1, $line, $m ) ) {
1150                 continue;
1151             }
1152             --$contextlines;
1153             $pre = $wgContLang->truncate( $m[1], -$contextchars, ' ... ' );
1154
1155             if ( count( $m ) < 3 ) {
1156                 $post = '';
1157             } else {
1158                 $post = $wgContLang->truncate( $m[3], $contextchars, ' ... ' );
1159             }
1160
1161             $found = $m[2];
1162
1163             $line = htmlspecialchars( $pre . $found . $post );
1164             $pat2 = '/(' . $terms . ")/i";
1165             $line = preg_replace( $pat2,
1166               "<span class='searchmatch'>\\1</span>", $line );
1167
1168             $extract .= "${line}\n";
1169         }
1170         wfProfileOut( "$fname-extract" );
1171
1172         return $extract;
1173     }
1174
1175 }
1176
/**
 * Placeholder search engine, used when a non-supported database engine is
 * present. It adds nothing of its own; everything is inherited from
 * SearchEngine.
 *
 * @fixme Dummy class should probably try something at least mildly useful,
 * such as a LIKE search through titles.
 * @ingroup Search
 */
class SearchEngineDummy extends SearchEngine {
        // intentionally empty
}