includes/SearchEngine.php

   1 <?php
   2 /**
   3  * @defgroup Search Search
   4  *
   5  * @file
   6  * @ingroup Search
   7  */
   8
   9 /**
  10  * Base class for search engines: stub search/update methods plus shared static helpers
  11  * @ingroup Search
  12  */
  13 class SearchEngine {
  14         var $limit = 10;
  15         var $offset = 0;
  16         var $prefix = '';
  17         var $searchTerms = array();
  18         var $namespaces = array( NS_MAIN );
  19         var $showRedirects = false;
  20
  21         /**
  22          * Perform a full text search query and return a result set.
  23          * If title searches are not supported or disabled, return null.
  24          * STUB
  25          *
  26          * @param $term String: raw search term
  27          * @return SearchResultSet
  28          */
  29         function searchText( $term ) {
  30                 return null;
  31         }
  32
  33         /**
  34          * Perform a title-only search query and return a result set.
  35          * If title searches are not supported or disabled, return null.
  36          * STUB
  37          *
  38          * @param $term String: raw search term
  39          * @return SearchResultSet
  40          */
  41         function searchTitle( $term ) {
  42                 return null;
  43         }
  44
  45         /** If this search backend can list/unlist redirects */
  46         function acceptListRedirects() {
  47                 return true;
  48         }
  49
  50         /**
  51          * Transform search term in cases when parts of the query came as different GET params (when supported)
  52          * e.g. for prefix queries: search=test&prefix=Main_Page/Archive -> test prefix:Main Page/Archive
  53          */
  54         function transformSearchTerm( $term ) {
  55                 return $term;
  56         }
  57
  58         /**
  59          * If an exact title match can be find, or a very slightly close match,
  60          * return the title. If no match, returns NULL.
  61          *
  62          * @param $searchterm String
  63          * @return Title
  64          */
  65         public static function getNearMatch( $searchterm ) {
  66                 global $wgContLang, $wgSecondaryGoNamespaces;
  67
  68                 $allSearchTerms = array($searchterm);
  69
  70                 if($wgContLang->hasVariants()){
  71                         $allSearchTerms = array_merge($allSearchTerms,$wgContLang->convertLinkToAllVariants($searchterm));
  72                 }
  73
  74                 foreach($allSearchTerms as $term){
  75
  76                         # Exact match? No need to look further.
  77                         $title = Title::newFromText( $term );
  78                         if (is_null($title))
  79                                 return NULL;
  80
  81                         if ( $title->getNamespace() == NS_SPECIAL || $title->isExternal() || $title->exists() ) {
  82                                 return $title;
  83                         }
  84
  85                         # See if it still otherwise has content is some sane sense
  86                         $article = MediaWiki::articleFromTitle( $title );
  87                         if( $article->hasViewableContent() ) {
  88                                 return $title;
  89                         }
  90
  91                         # If a match is not found in the main namespace look in secondary go namespaces.
  92                         if( $wgSecondaryGoNamespaces && $title->getNamespace() == NS_MAIN ) {
  93                                 foreach( $wgSecondaryGoNamespaces as $ns ) {
  94                                         $title = Title::newFromText( $term, $ns );
  95                                         if( $title && $title->exists() ) return $title;
  96                                 }
  97                         }
  98
  99                         # Now try all lower case (i.e. first letter capitalized)
 100                         #
 101                         $title = Title::newFromText( $wgContLang->lc( $term ) );
 102                         if ( $title && $title->exists() ) {
 103                                 return $title;
 104                         }
 105
 106                         # Now try capitalized string
 107                         #
 108                         $title = Title::newFromText( $wgContLang->ucwords( $term ) );
 109                         if ( $title && $title->exists() ) {
 110                                 return $title;
 111                         }
 112
 113                         # Now try all upper case
 114                         #
 115                         $title = Title::newFromText( $wgContLang->uc( $term ) );
 116                         if ( $title && $title->exists() ) {
 117                                 return $title;
 118                         }
 119
 120                         # Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
 121                         $title = Title::newFromText( $wgContLang->ucwordbreaks($term) );
 122                         if ( $title && $title->exists() ) {
 123                                 return $title;
 124                         }
 125
 126                         // Give hooks a chance at better match variants
 127                         $title = null;
 128                         if( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
 129                                 return $title;
 130                         }
 131                 }
 132
 133                 $title = Title::newFromText( $searchterm );
 134
 135                 # Entering an IP address goes to the contributions page
 136                 if ( ( $title->getNamespace() == NS_USER && User::isIP($title->getText() ) )
 137                         || User::isIP( trim( $searchterm ) ) ) {
 138                         return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
 139                 }
 140
 141
 142                 # Entering a user goes to the user page whether it's there or not
 143                 if ( $title->getNamespace() == NS_USER ) {
 144                         return $title;
 145                 }
 146
 147                 # Go to images that exist even if there's no local page.
 148                 # There may have been a funny upload, or it may be on a shared
 149                 # file repository such as Wikimedia Commons.
 150                 if( $title->getNamespace() == NS_FILE ) {
 151                         $image = wfFindFile( $title );
 152                         if( $image ) {
 153                                 return $title;
 154                         }
 155                 }
 156
 157                 # MediaWiki namespace? Page may be "implied" if not customized.
 158                 # Just return it, with caps forced as the message system likes it.
 159                 if( $title->getNamespace() == NS_MEDIAWIKI ) {
 160                         return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
 161                 }
 162
 163                 # Quoted term? Try without the quotes...
 164                 $matches = array();
 165                 if( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
 166                         return SearchEngine::getNearMatch( $matches[1] );
 167                 }
 168
 169                 return NULL;
 170         }
 171
 172         public static function legalSearchChars() {
 173                 return "A-Za-z_'.0-9\\x80-\\xFF\\-";
 174         }
 175
 176         /**
 177          * Set the maximum number of results to return
 178          * and how many to skip before returning the first.
 179          *
 180          * @param $limit Integer
 181          * @param $offset Integer
 182          */
 183         function setLimitOffset( $limit, $offset = 0 ) {
 184                 $this->limit = intval( $limit );
 185                 $this->offset = intval( $offset );
 186         }
 187
 188         /**
 189          * Set which namespaces the search should include.
 190          * Give an array of namespace index numbers.
 191          *
 192          * @param $namespaces Array
 193          */
 194         function setNamespaces( $namespaces ) {
 195                 $this->namespaces = $namespaces;
 196         }
 197
 198         /**
 199          * Parse some common prefixes: all (search everything)
 200          * or namespace names
 201          *
 202          * @param $query String
 203          */
 204         function replacePrefixes( $query ){
 205                 global $wgContLang;
 206
 207                 if( strpos($query,':') === false )
 208                         return $query; // nothing to do
 209
 210                 $parsed = $query;
 211                 $allkeyword = wfMsgForContent('searchall').":";
 212                 if( strncmp($query, $allkeyword, strlen($allkeyword)) == 0 ){
 213                         $this->namespaces = null;
 214                         $parsed = substr($query,strlen($allkeyword));
 215                 } else if( strpos($query,':') !== false ) {
 216                         $prefix = substr($query,0,strpos($query,':'));
 217                         $index = $wgContLang->getNsIndex($prefix);
 218                         if($index !== false){
 219                                 $this->namespaces = array($index);
 220                                 $parsed = substr($query,strlen($prefix)+1);
 221                         }
 222                 }
 223                 if(trim($parsed) == '')
 224                         return $query; // prefix was the whole query
 225
 226                 return $parsed;
 227         }
 228
 229         /**
 230          * Make a list of searchable namespaces and their canonical names.
 231          * @return Array
 232          */
 233         public static function searchableNamespaces() {
 234                 global $wgContLang;
 235                 $arr = array();
 236                 foreach( $wgContLang->getNamespaces() as $ns => $name ) {
 237                         if( $ns >= NS_MAIN ) {
 238                                 $arr[$ns] = $name;
 239                         }
 240                 }
 241                 return $arr;
 242         }
 243
 244         /**
 245          * Extract default namespaces to search from the given user's
 246          * settings, returning a list of index numbers.
 247          *
 248          * @param $user User
 249          * @return Array
 250          */
 251         public static function userNamespaces( $user ) {
 252                 global $wgSearchEverythingOnlyLoggedIn;
 253
 254                 // get search everything preference, that can be set to be read for logged-in users
 255                 $searcheverything = false;
 256                 if( ( $wgSearchEverythingOnlyLoggedIn && $user->isLoggedIn() )
 257                     || !$wgSearchEverythingOnlyLoggedIn )
 258                         $searcheverything = $user->getOption('searcheverything');
 259
 260                 // searcheverything overrides other options
 261                 if( $searcheverything )
 262                         return array_keys(SearchEngine::searchableNamespaces());
 263
 264                 $arr = Preferences::loadOldSearchNs( $user );
 265                 $searchableNamespaces = SearchEngine::searchableNamespaces();
 266
 267                 $arr = array_intersect( $arr, array_keys($searchableNamespaces) ); // Filter
 268
 269                 return $arr;
 270         }
 271
 272         /**
 273          * Find snippet highlight settings for a given user
 274          *
 275          * @param $user User
 276          * @return Array contextlines, contextchars
 277          */
 278         public static function userHighlightPrefs( &$user ){
 279                 //$contextlines = $user->getOption( 'contextlines',  5 );
 280                 //$contextchars = $user->getOption( 'contextchars', 50 );
 281                 $contextlines = 2; // Hardcode this. Old defaults sucked. :)
 282                 $contextchars = 75; // same as above.... :P
 283                 return array($contextlines, $contextchars);
 284         }
 285
 286         /**
 287          * An array of namespaces indexes to be searched by default
 288          *
 289          * @return Array
 290          */
 291         public static function defaultNamespaces(){
 292                 global $wgNamespacesToBeSearchedDefault;
 293
 294                 return array_keys($wgNamespacesToBeSearchedDefault, true);
 295         }
 296
 297         /**
 298          * Get a list of namespace names useful for showing in tooltips
 299          * and preferences
 300          *
 301          * @param $namespaces Array
 302          */
 303         public static function namespacesAsText( $namespaces ){
 304                 global $wgContLang;
 305
 306                 $formatted = array_map( array($wgContLang,'getFormattedNsText'), $namespaces );
 307                 foreach( $formatted as $key => $ns ){
 308                         if ( empty($ns) )
 309                                 $formatted[$key] = wfMsg( 'blanknamespace' );
 310                 }
 311                 return $formatted;
 312         }
 313
 314         /**
 315          * Return the help namespaces to be shown on Special:Search
 316          *
 317          * @return Array
 318          */
 319         public static function helpNamespaces() {
 320                 global $wgNamespacesToBeSearchedHelp;
 321
 322                 return array_keys( $wgNamespacesToBeSearchedHelp, true );
 323         }
 324
 325         /**
 326          * Return a 'cleaned up' search string
 327          *
 328          * @param $text String
 329          * @return String
 330          */
 331         function filter( $text ) {
 332                 $lc = $this->legalSearchChars();
 333                 return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
 334         }
 335         /**
 336          * Load up the appropriate search engine class for the currently
 337          * active database backend, and return a configured instance.
 338          *
 339          * @return SearchEngine
 340          */
 341         public static function create() {
 342                 global $wgSearchType;
 343                 $dbr = wfGetDB( DB_SLAVE );
 344                 if( $wgSearchType ) {
 345                         $class = $wgSearchType;
 346                 } else {
 347                         $class = $dbr->getSearchEngine();
 348                 }
 349                 $search = new $class( $dbr );
 350                 $search->setLimitOffset(0,0);
 351                 return $search;
 352         }
 353
 354         /**
 355          * Create or update the search index record for the given page.
 356          * Title and text should be pre-processed.
 357          * STUB
 358          *
 359          * @param $id Integer
 360          * @param $title String
 361          * @param $text String
 362          */
 363         function update( $id, $title, $text ) {
 364                 // no-op
 365         }
 366
 367         /**
 368          * Update a search index record's title only.
 369          * Title should be pre-processed.
 370          * STUB
 371          *
 372          * @param $id Integer
 373          * @param $title String
 374          */
 375         function updateTitle( $id, $title ) {
 376                 // no-op
 377         }
 378
 379         /**
 380          * Get OpenSearch suggestion template
 381          *
 382          * @return String
 383          */
 384         public static function getOpenSearchTemplate() {
 385                 global $wgOpenSearchTemplate, $wgServer, $wgScriptPath;
 386                 if( $wgOpenSearchTemplate )     {
 387                         return $wgOpenSearchTemplate;
 388                 } else {
 389                         $ns = implode( '|', SearchEngine::defaultNamespaces() );
 390                         if( !$ns ) $ns = "0";
 391                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace='.$ns;
 392                 }
 393         }
 394
 395         /**
 396          * Get internal MediaWiki Suggest template
 397          *
 398          * @return String
 399          */
 400         public static function getMWSuggestTemplate() {
 401                 global $wgMWSuggestTemplate, $wgServer, $wgScriptPath;
 402                 if($wgMWSuggestTemplate)
 403                         return $wgMWSuggestTemplate;
 404                 else
 405                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}&suggest';
 406         }
 407 }
 408
 409 /**
 410  * @ingroup Search
 411  */
 412 class SearchResultSet {
 413         /**
 414          * Fetch an array of regular expression fragments for matching
 415          * the search terms as parsed by this engine in a text extract.
 416          * STUB
 417          *
 418          * @return Array
 419          */
 420         function termMatches() {
 421                 return array();
 422         }
 423
 424         function numRows() {
 425                 return 0;
 426         }
 427
 428         /**
 429          * Return true if results are included in this result set.
 430          * STUB
 431          *
 432          * @return Boolean
 433          */
 434         function hasResults() {
 435                 return false;
 436         }
 437
 438         /**
 439          * Some search modes return a total hit count for the query
 440          * in the entire article database. This may include pages
 441          * in namespaces that would not be matched on the given
 442          * settings.
 443          *
 444          * Return null if no total hits number is supported.
 445          *
 446          * @return Integer
 447          */
 448         function getTotalHits() {
 449                 return null;
 450         }
 451
 452         /**
 453          * Some search modes return a suggested alternate term if there are
 454          * no exact hits. Returns true if there is one on this set.
 455          *
 456          * @return Boolean
 457          */
 458         function hasSuggestion() {
 459                 return false;
 460         }
 461
 462         /**
 463          * @return String: suggested query, null if none
 464          */
 465         function getSuggestionQuery(){
 466                 return null;
 467         }
 468
 469         /**
 470          * @return String: HTML highlighted suggested query, '' if none
 471          */
 472         function getSuggestionSnippet(){
 473                 return '';
 474         }
 475
 476         /**
 477          * Return information about how and from where the results were fetched,
 478          * should be useful for diagnostics and debugging
 479          *
 480          * @return String
 481          */
 482         function getInfo() {
 483                 return null;
 484         }
 485
 486         /**
 487          * Return a result set of hits on other (multiple) wikis associated with this one
 488          *
 489          * @return SearchResultSet
 490          */
 491         function getInterwikiResults() {
 492                 return null;
 493         }
 494
 495         /**
 496          * Check if there are results on other wikis
 497          *
 498          * @return Boolean
 499          */
 500         function hasInterwikiResults() {
 501                 return $this->getInterwikiResults() != null;
 502         }
 503
 504
 505         /**
 506          * Fetches next search result, or false.
 507          * STUB
 508          *
 509          * @return SearchResult
 510          */
 511         function next() {
 512                 return false;
 513         }
 514
 515         /**
 516          * Frees the result set, if applicable.
 517          */
 518         function free() {
 519                 // ...
 520         }
 521 }
 522
 523
 524 /**
 525  * @ingroup Search
 526  */
 527 class SearchResultTooMany {
 528         ## Some search engines may bail out if too many matches are found
 529 }
 530
 531
 532 /**
 533  * @todo Fixme: This class is horribly factored. It would probably be better to
 534  * have a useful base class to which you pass some standard information, then
 535  * let the fancy self-highlighters extend that.
 536  * @ingroup Search
 537  */
 538 class SearchResult {
 539         var $mRevision = null;
 540         var $mImage = null;
 541
 542         function __construct( $row ) {
 543                 $this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
 544                 if( !is_null($this->mTitle) ){
 545                         $this->mRevision = Revision::newFromTitle( $this->mTitle );
 546                         if( $this->mTitle->getNamespace() === NS_FILE )
 547                                 $this->mImage = wfFindFile( $this->mTitle );
 548                 }
 549         }
 550
 551         /**
 552          * Check if this is result points to an invalid title
 553          *
 554          * @return Boolean
 555          */
 556         function isBrokenTitle(){
 557                 if( is_null($this->mTitle) )
 558                         return true;
 559                 return false;
 560         }
 561
 562         /**
 563          * Check if target page is missing, happens when index is out of date
 564          *
 565          * @return Boolean
 566          */
 567         function isMissingRevision(){
 568                 return !$this->mRevision && !$this->mImage;
 569         }
 570
 571         /**
 572          * @return Title
 573          */
 574         function getTitle() {
 575                 return $this->mTitle;
 576         }
 577
 578         /**
 579          * @return Double or null if not supported
 580          */
 581         function getScore() {
 582                 return null;
 583         }
 584
 585         /**
 586          * Lazy initialization of article text from DB
 587          */
 588         protected function initText(){
 589                 if( !isset($this->mText) ){
 590                         if($this->mRevision != null)
 591                                 $this->mText = $this->mRevision->getText();
 592                         else // TODO: can we fetch raw wikitext for commons images?
 593                                 $this->mText = '';
 594
 595                 }
 596         }
 597
 598         /**
 599          * @param $terms Array: terms to highlight
 600          * @return String: highlighted text snippet, null (and not '') if not supported
 601          */
 602         function getTextSnippet($terms){
 603                 global $wgUser, $wgAdvancedSearchHighlighting;
 604                 $this->initText();
 605                 list($contextlines,$contextchars) = SearchEngine::userHighlightPrefs($wgUser);
 606                 $h = new SearchHighlighter();
 607                 if( $wgAdvancedSearchHighlighting )
 608                         return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
 609                 else
 610                         return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
 611         }
 612
 613         /**
 614          * @param $terms Array: terms to highlight
 615          * @return String: highlighted title, '' if not supported
 616          */
 617         function getTitleSnippet($terms){
 618                 return '';
 619         }
 620
 621         /**
 622          * @param $terms Array: terms to highlight
 623          * @return String: highlighted redirect name (redirect to this page), '' if none or not supported
 624          */
 625         function getRedirectSnippet($terms){
 626                 return '';
 627         }
 628
 629         /**
 630          * @return Title object for the redirect to this page, null if none or not supported
 631          */
 632         function getRedirectTitle(){
 633                 return null;
 634         }
 635
 636         /**
 637          * @return string highlighted relevant section name, null if none or not supported
 638          */
 639         function getSectionSnippet(){
 640                 return '';
 641         }
 642
 643         /**
 644          * @return Title object (pagename+fragment) for the section, null if none or not supported
 645          */
 646         function getSectionTitle(){
 647                 return null;
 648         }
 649
 650         /**
 651          * @return String: timestamp
 652          */
 653         function getTimestamp(){
 654                 if( $this->mRevision )
 655                         return $this->mRevision->getTimestamp();
 656                 else if( $this->mImage )
 657                         return $this->mImage->getTimestamp();
 658                 return '';
 659         }
 660
 661         /**
 662          * @return Integer: number of words
 663          */
 664         function getWordCount(){
 665                 $this->initText();
 666                 return str_word_count( $this->mText );
 667         }
 668
 669         /**
 670          * @return Integer: size in bytes
 671          */
 672         function getByteSize(){
 673                 $this->initText();
 674                 return strlen( $this->mText );
 675         }
 676
 677         /**
 678          * @return Boolean if hit has related articles
 679          */
 680         function hasRelated(){
 681                 return false;
 682         }
 683
 684         /**
 685          * @return String: interwiki prefix of the title (return iw even if title is broken)
 686          */
 687         function getInterwikiPrefix(){
 688                 return '';
 689         }
 690 }
 691
 692 /**
 693  * Highlight bits of wikitext
 694  *
 695  * @ingroup Search
 696  */
 697 class SearchHighlighter {
 698         var $mCleanWikitext = true;
 699
 700         function SearchHighlighter($cleanupWikitext = true){
 701                 $this->mCleanWikitext = $cleanupWikitext;
 702         }
 703
 704         /**
 705          * Default implementation of wikitext highlighting
 706          *
 707          * @param $text String
 708          * @param $terms Array: terms to highlight (unescaped)
 709          * @param $contextlines Integer
 710          * @param $contextchars Integer
 711          * @return String
 712          */
 713         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
 714                 global $wgLang, $wgContLang;
 715                 global $wgSearchHighlightBoundaries;
 716                 $fname = __METHOD__;
 717
 718                 if($text == '')
 719                         return '';
 720
 721                 // spli text into text + templates/links/tables
 722                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
 723                 // first capture group is for detecting nested templates/links/tables/references
 724                 $endPatterns = array(
 725                         1 => '/(\{\{)|(\}\})/', // template
 726                         2 => '/(\[\[)|(\]\])/', // image
 727                         3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table
 728
 729                 // FIXME: this should prolly be a hook or something
 730                 if(function_exists('wfCite')){
 731                         $spat .= '|(<ref>)'; // references via cite extension
 732                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
 733                 }
 734                 $spat .= '/';
 735                 $textExt = array(); // text extracts
 736                 $otherExt = array();  // other extracts
 737                 wfProfileIn( "$fname-split" );
 738                 $start = 0;
 739                 $textLen = strlen($text);
 740                 $count = 0; // sequence number to maintain ordering
 741                 while( $start < $textLen ){
 742                         // find start of template/image/table
 743                         if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){
 744                                 $epat = '';
 745                                 foreach($matches as $key => $val){
 746                                         if($key > 0 && $val[1] != -1){
 747                                                 if($key == 2){
 748                                                         // see if this is an image link
 749                                                         $ns = substr($val[0],2,-1);
 750                                                         if( $wgContLang->getNsIndex($ns) != NS_FILE )
 751                                                                 break;
 752
 753                                                 }
 754                                                 $epat = $endPatterns[$key];
 755                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
 756                                                 $start = $val[1];
 757                                                 break;
 758                                         }
 759                                 }
 760                                 if( $epat ){
 761                                         // find end (and detect any nested elements)
 762                                         $level = 0;
 763                                         $offset = $start + 1;
 764                                         $found = false;
 765                                         while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){
 766                                                 if( array_key_exists(2,$endMatches) ){
 767                                                         // found end
 768                                                         if($level == 0){
 769                                                                 $len = strlen($endMatches[2][0]);
 770                                                                 $off = $endMatches[2][1];
 771                                                                 $this->splitAndAdd( $otherExt, $count,
 772                                                                         substr( $text, $start, $off + $len  - $start ) );
 773                                                                 $start = $off + $len;
 774                                                                 $found = true;
 775                                                                 break;
 776                                                         } else{
 777                                                                 // end of nested element
 778                                                                 $level -= 1;
 779                                                         }
 780                                                 } else{
 781                                                         // nested
 782                                                         $level += 1;
 783                                                 }
 784                                                 $offset = $endMatches[0][1] + strlen($endMatches[0][0]);
 785                                         }
 786                                         if( ! $found ){
 787                                                 // couldn't find appropriate closing tag, skip
 788                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) );
 789                                                 $start += strlen($matches[0][0]);
 790                                         }
 791                                         continue;
 792                                 }
 793                         }
 794                         // else: add as text extract
 795                         $this->splitAndAdd( $textExt, $count, substr($text,$start) );
 796                         break;
 797                 }
 798
 799                 $all = $textExt + $otherExt; // these have disjunct key sets
 800
 801                 wfProfileOut( "$fname-split" );
 802
 803                 // prepare regexps
 804                 foreach( $terms as $index => $term ) {
 805                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
 806                         if(preg_match('/[\x80-\xff]/', $term) ){
 807                                 $terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]);
 808                         } else {
 809                                 $terms[$index] = $term;
 810                         }
 811                 }
 812                 $anyterm = implode( '|', $terms );
 813                 $phrase = implode("$wgSearchHighlightBoundaries+", $terms );
 814
 815                 // FIXME: a hack to scale contextchars, a correct solution
 816                 // would be to have contextchars actually be char and not byte
 817                 // length, and do proper utf-8 substrings and lengths everywhere,
 818                 // but PHP is making that very hard and unclean to implement :(
 819                 $scale = strlen($anyterm) / mb_strlen($anyterm);
 820                 $contextchars = intval( $contextchars * $scale );
 821
 822                 $patPre = "(^|$wgSearchHighlightBoundaries)";
 823                 $patPost = "($wgSearchHighlightBoundaries|$)";
 824
 825                 $pat1 = "/(".$phrase.")/ui";
 826                 $pat2 = "/$patPre(".$anyterm.")$patPost/ui";
 827
 828                 wfProfileIn( "$fname-extract" );
 829
 830                 $left = $contextlines;
 831
 832                 $snippets = array();
 833                 $offsets = array();
 834
 835                 // show beginning only if it contains all words
 836                 $first = 0;
 837                 $firstText = '';
 838                 foreach($textExt as $index => $line){
 839                         if(strlen($line)>0 && $line[0] != ';' && $line[0] != ':'){
 840                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
 841                                 $first = $index;
 842                                 break;
 843                         }
 844                 }
 845                 if( $firstText ){
 846                         $succ = true;
 847                         // check if first text contains all terms
 848                         foreach($terms as $term){
 849                                 if( ! preg_match("/$patPre".$term."$patPost/ui", $firstText) ){
 850                                         $succ = false;
 851                                         break;
 852                                 }
 853                         }
 854                         if( $succ ){
 855                                 $snippets[$first] = $firstText;
 856                                 $offsets[$first] = 0;
 857                         }
 858                 }
 859                 if( ! $snippets ) {
 860                         // match whole query on text
 861                         $this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets);
 862                         // match whole query on templates/tables/images
 863                         $this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets);
 864                         // match any words on text
 865                         $this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets);
 866                         // match any words on templates/tables/images
 867                         $this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets);
 868
 869                         ksort($snippets);
 870                 }
 871
 872                 // add extra chars to each snippet to make snippets constant size
 873                 $extended = array();
 874                 if( count( $snippets ) == 0){
 875                         // couldn't find the target words, just show beginning of article
 876                         $targetchars = $contextchars * $contextlines;
 877                         $snippets[$first] = '';
 878                         $offsets[$first] = 0;
 879                 } else{
 880                         // if begin of the article contains the whole phrase, show only that !!
 881                         if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first])
 882                             && $offsets[$first] < $contextchars * 2 ){
 883                                 $snippets = array ($first => $snippets[$first]);
 884                         }
 885
 886                         // calc by how much to extend existing snippets
 887                         $targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) );
 888                 }
 889
 890                 foreach($snippets as $index => $line){
 891                         $extended[$index] = $line;
 892                         $len = strlen($line);
 893                         if( $len < $targetchars - 20 ){
 894                                 // complete this line
 895                                 if($len < strlen( $all[$index] )){
 896                                         $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]);
 897                                         $len = strlen( $extended[$index] );
 898                                 }
 899
 900                                 // add more lines
 901                                 $add = $index + 1;
 902                                 while( $len < $targetchars - 20
 903                                        && array_key_exists($add,$all)
 904                                        && !array_key_exists($add,$snippets) ){
 905                                     $offsets[$add] = 0;
 906                                     $tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
 907                                         $extended[$add] = $tt;
 908                                         $len += strlen( $tt );
 909                                         $add++;
 910                                 }
 911                         }
 912                 }
 913
 914                 //$snippets = array_map('htmlspecialchars', $extended);
 915                 $snippets = $extended;
 916                 $last = -1;
 917                 $extract = '';
 918                 foreach($snippets as $index => $line){
 919                         if($last == -1)
 920                                 $extract .= $line; // first line
 921                         elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last]))
 922                                 $extract .= " ".$line; // continous lines
 923                         else
 924                                 $extract .= '<b> ... </b>' . $line;
 925
 926                         $last = $index;
 927                 }
 928                 if( $extract )
 929                         $extract .= '<b> ... </b>';
 930
 931                 $processed = array();
 932                 foreach($terms as $term){
 933                         if( ! isset($processed[$term]) ){
 934                                 $pat3 = "/$patPre(".$term.")$patPost/ui"; // highlight word
 935                                 $extract = preg_replace( $pat3,
 936                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
 937                                 $processed[$term] = true;
 938                         }
 939                 }
 940
 941                 wfProfileOut( "$fname-extract" );
 942
 943                 return $extract;
 944         }
 945
 946         /**
 947          * Split text into lines and add it to extracts array
 948          *
 949          * @param $extracts Array: index -> $line
 950          * @param $count Integer
 951          * @param $text String
 952          */
 953         function splitAndAdd(&$extracts, &$count, $text){
 954                 $split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text );
 955                 foreach($split as $line){
 956                         $tt = trim($line);
 957                         if( $tt )
 958                                 $extracts[$count++] = $tt;
 959                 }
 960         }
 961
 962         /**
 963          * Do manual case conversion for non-ascii chars
 964          *
 965          * @param $matches Array
 966          */
 967         function caseCallback($matches){
 968                 global $wgContLang;
 969                 if( strlen($matches[0]) > 1 ){
 970                         return '['.$wgContLang->lc($matches[0]).$wgContLang->uc($matches[0]).']';
 971                 } else
 972                         return $matches[0];
 973         }
 974
 975         /**
 976          * Extract part of the text from start to end, but by
 977          * not chopping up words
 978          * @param $text String
 979          * @param $start Integer
 980          * @param $end Integer
 981          * @param $posStart Integer: (out) actual start position
 982          * @param $posEnd Integer: (out) actual end position
 983          * @return String
 984          */
 985         function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){
 986                 global $wgContLang;
 987
 988                 if( $start != 0)
 989                         $start = $this->position( $text, $start, 1 );
 990                 if( $end >= strlen($text) )
 991                         $end = strlen($text);
 992                 else
 993                         $end = $this->position( $text, $end );
 994
 995                 if(!is_null($posStart))
 996                         $posStart = $start;
 997                 if(!is_null($posEnd))
 998                         $posEnd = $end;
 999
1000                 if($end > $start)
1001                         return substr($text, $start, $end-$start);
1002                 else
1003                         return '';
1004         }
1005
1006         /**
1007          * Find a nonletter near a point (index) in the text
1008          *
1009          * @param $text String
1010          * @param $point Integer
1011          * @param $offset Integer: offset to found index
1012          * @return Integer: nearest nonletter index, or beginning of utf8 char if none
1013          */
1014         function position($text, $point, $offset=0 ){
1015                 $tolerance = 10;
1016                 $s = max( 0, $point - $tolerance );
1017                 $l = min( strlen($text), $point + $tolerance ) - $s;
1018                 $m = array();
1019                 if( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){
1020                         return $m[0][1] + $s + $offset;
1021                 } else{
1022                         // check if point is on a valid first UTF8 char
1023                         $char = ord( $text[$point] );
1024                         while( $char >= 0x80 && $char < 0xc0 ) {
1025                                 // skip trailing bytes
1026                                 $point++;
1027                                 if($point >= strlen($text))
1028                                         return strlen($text);
1029                                 $char = ord( $text[$point] );
1030                         }
1031                         return $point;
1032
1033                 }
1034         }
1035
1036         /**
1037          * Search extracts for a pattern, and return snippets
1038          *
1039          * @param $pattern String: regexp for matching lines
1040          * @param $extracts Array: extracts to search
1041          * @param $linesleft Integer: number of extracts to make
1042          * @param $contextchars Integer: length of snippet
1043          * @param $out Array: map for highlighted snippets
1044          * @param $offsets Array: map of starting points of snippets
1045          * @protected
1046          */
1047         function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){
1048                 if($linesleft == 0)
1049                         return; // nothing to do
1050                 foreach($extracts as $index => $line){
1051                         if( array_key_exists($index,$out) )
1052                                 continue; // this line already highlighted
1053
1054                         $m = array();
1055                         if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) )
1056                                 continue;
1057
1058                         $offset = $m[0][1];
1059                         $len = strlen($m[0][0]);
1060                         if($offset + $len < $contextchars)
1061                                 $begin = 0;
1062                         elseif( $len > $contextchars)
1063                                 $begin = $offset;
1064                         else
1065                                 $begin = $offset + intval( ($len - $contextchars) / 2 );
1066
1067                         $end = $begin + $contextchars;
1068
1069                         $posBegin = $begin;
1070                         // basic snippet from this line
1071                         $out[$index] = $this->extract($line,$begin,$end,$posBegin);
1072                         $offsets[$index] = $posBegin;
1073                         $linesleft--;
1074                         if($linesleft == 0)
1075                                 return;
1076                 }
1077         }
1078
1079         /**
1080          * Basic wikitext removal
1081          * @protected
1082          */
1083         function removeWiki($text) {
1084                 $fname = __METHOD__;
1085                 wfProfileIn( $fname );
1086
1087                 //$text = preg_replace("/'{2,5}/", "", $text);
1088                 //$text = preg_replace("/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text);
1089                 //$text = preg_replace("/\[\[([^]|]+)\]\]/", "\\1", $text);
1090                 //$text = preg_replace("/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text);
1091                 //$text = preg_replace("/\\{\\|(.*?)\\|\\}/", "", $text);
1092                 //$text = preg_replace("/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text);
1093                 $text = preg_replace("/\\{\\{([^|]+?)\\}\\}/", "", $text);
1094                 $text = preg_replace("/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text);
1095                 $text = preg_replace("/\\[\\[([^|]+?)\\]\\]/", "\\1", $text);
1096                 $text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text);
1097                 //$text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text);
1098                 $text = preg_replace("/<\/?[^>]+>/", "", $text);
1099                 $text = preg_replace("/'''''/", "", $text);
1100                 $text = preg_replace("/('''|<\/?[iIuUbB]>)/", "", $text);
1101                 $text = preg_replace("/''/", "", $text);
1102
1103                 wfProfileOut( $fname );
1104                 return $text;
1105         }
1106
1107         /**
1108          * callback to replace [[target|caption]] kind of links, if
1109          * the target is category or image, leave it
1110          *
1111          * @param $matches Array
1112          */
1113         function linkReplace($matches){
1114                 $colon = strpos( $matches[1], ':' );
1115                 if( $colon === false )
1116                         return $matches[2]; // replace with caption
1117                 global $wgContLang;
1118                 $ns = substr( $matches[1], 0, $colon );
1119                 $index = $wgContLang->getNsIndex($ns);
1120                 if( $index !== false && ($index == NS_FILE || $index == NS_CATEGORY) )
1121                         return $matches[0]; // return the whole thing
1122                 else
1123                         return $matches[2];
1124
1125         }
1126
1127         /**
1128      * Simple & fast snippet extraction, but gives completely irrelevant
1129      * snippets
1130      *
1131      * @param $text String
1132      * @param $terms Array
1133      * @param $contextlines Integer
1134      * @param $contextchars Integer
1135      * @return String
1136      */
1137     public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
1138         global $wgLang, $wgContLang;
1139         $fname = __METHOD__;
1140
1141         $lines = explode( "\n", $text );
1142
1143         $terms = implode( '|', $terms );
1144         $max = intval( $contextchars ) + 1;
1145         $pat1 = "/(.*)($terms)(.{0,$max})/i";
1146
1147         $lineno = 0;
1148
1149         $extract = "";
1150         wfProfileIn( "$fname-extract" );
1151         foreach ( $lines as $line ) {
1152             if ( 0 == $contextlines ) {
1153                 break;
1154             }
1155             ++$lineno;
1156             $m = array();
1157             if ( ! preg_match( $pat1, $line, $m ) ) {
1158                 continue;
1159             }
1160             --$contextlines;
1161             $pre = $wgContLang->truncate( $m[1], -$contextchars );
1162
1163             if ( count( $m ) < 3 ) {
1164                 $post = '';
1165             } else {
1166                 $post = $wgContLang->truncate( $m[3], $contextchars );
1167             }
1168
1169             $found = $m[2];
1170
1171             $line = htmlspecialchars( $pre . $found . $post );
1172             $pat2 = '/(' . $terms . ")/i";
1173             $line = preg_replace( $pat2,
1174               "<span class='searchmatch'>\\1</span>", $line );
1175
1176             $extract .= "${line}\n";
1177         }
1178         wfProfileOut( "$fname-extract" );
1179
1180         return $extract;
1181     }
1182
1183 }
1184
1185 /**
1186  * Dummy class to be used when non-supported Database engine is present.
1187  * @todo Fixme: dummy class should probably try something at least mildly useful,
1188  * such as a LIKE search through titles.
1189  * @ingroup Search
1190  */
class SearchEngineDummy extends SearchEngine {
        // no-op: this class adds nothing of its own, so every method is
        // inherited unchanged from SearchEngine. In particular the stub
        // searchText() and searchTitle() of the base class return null.
}