includes/search/SearchEngine.php

   1 <?php
   2 /**
   3  * @defgroup Search Search
   4  *
   5  * @file
   6  * @ingroup Search
   7  */
   8
   9 /**
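 * Base class for search backends: holds the shared search settings and
 * near-match logic; the actual search and index-update methods are stubs.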
  11  * @ingroup Search
  12  */
  13 class SearchEngine {
  14         var $limit = 10;
  15         var $offset = 0;
  16         var $prefix = '';
  17         var $searchTerms = array();
  18         var $namespaces = array( NS_MAIN );
  19         var $showRedirects = false;
  20
  21         /**
  22          * Perform a full text search query and return a result set.
  23          * If title searches are not supported or disabled, return null.
  24          * STUB
  25          *
  26          * @param $term String: raw search term
  27          * @return SearchResultSet
  28          */
  29         function searchText( $term ) {
  30                 return null;
  31         }
  32
  33         /**
  34          * Perform a title-only search query and return a result set.
  35          * If title searches are not supported or disabled, return null.
  36          * STUB
  37          *
  38          * @param $term String: raw search term
  39          * @return SearchResultSet
  40          */
  41         function searchTitle( $term ) {
  42                 return null;
  43         }
  44
  45         /** If this search backend can list/unlist redirects */
  46         function acceptListRedirects() {
  47                 return true;
  48         }
  49
  50         /**
  51          * Transform search term in cases when parts of the query came as different GET params (when supported)
  52          * e.g. for prefix queries: search=test&prefix=Main_Page/Archive -> test prefix:Main Page/Archive
  53          */
  54         function transformSearchTerm( $term ) {
  55                 return $term;
  56         }
  57
  58         /**
  59          * If an exact title match can be found, or a very slightly close match,
  60          * return the title. If no match, returns NULL.
  61          *
  62          * @param $searchterm String
  63          * @return Title
  64          */
  65         public static function getNearMatch( $searchterm ) {
  66                 $title = self::getNearMatchInternal( $searchterm );
  67
  68                 wfRunHooks( 'SearchGetNearMatchComplete', array( $searchterm, &$title ) );
  69                 return $title;
  70         }
  71
  72         /**
  73          * Really find the title match.
  74          */
  75         private static function getNearMatchInternal( $searchterm ) {
  76                 global $wgContLang;
  77
  78                 $allSearchTerms = array($searchterm);
  79
  80                 if ( $wgContLang->hasVariants() ) {
  81                         $allSearchTerms = array_merge($allSearchTerms,$wgContLang->convertLinkToAllVariants($searchterm));
  82                 }
  83
  84                 if( !wfRunHooks( 'SearchGetNearMatchBefore', array( $allSearchTerms, &$titleResult ) ) ) {
  85                         return $titleResult;
  86                 }
  87
  88                 foreach($allSearchTerms as $term) {
  89
  90                         # Exact match? No need to look further.
  91                         $title = Title::newFromText( $term );
  92                         if (is_null($title))
  93                                 return null;
  94
  95                         if ( $title->getNamespace() == NS_SPECIAL || $title->isExternal() || $title->exists() ) {
  96                                 return $title;
  97                         }
  98
  99                         # See if it still otherwise has content is some sane sense
 100                         $article = MediaWiki::articleFromTitle( $title );
 101                         if( $article->hasViewableContent() ) {
 102                                 return $title;
 103                         }
 104
 105                         # Now try all lower case (i.e. first letter capitalized)
 106                         #
 107                         $title = Title::newFromText( $wgContLang->lc( $term ) );
 108                         if ( $title && $title->exists() ) {
 109                                 return $title;
 110                         }
 111
 112                         # Now try capitalized string
 113                         #
 114                         $title = Title::newFromText( $wgContLang->ucwords( $term ) );
 115                         if ( $title && $title->exists() ) {
 116                                 return $title;
 117                         }
 118
 119                         # Now try all upper case
 120                         #
 121                         $title = Title::newFromText( $wgContLang->uc( $term ) );
 122                         if ( $title && $title->exists() ) {
 123                                 return $title;
 124                         }
 125
 126                         # Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
 127                         $title = Title::newFromText( $wgContLang->ucwordbreaks($term) );
 128                         if ( $title && $title->exists() ) {
 129                                 return $title;
 130                         }
 131
 132                         // Give hooks a chance at better match variants
 133                         $title = null;
 134                         if( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
 135                                 return $title;
 136                         }
 137                 }
 138
 139                 $title = Title::newFromText( $searchterm );
 140
 141                 # Entering an IP address goes to the contributions page
 142                 if ( ( $title->getNamespace() == NS_USER && User::isIP($title->getText() ) )
 143                         || User::isIP( trim( $searchterm ) ) ) {
 144                         return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
 145                 }
 146
 147
 148                 # Entering a user goes to the user page whether it's there or not
 149                 if ( $title->getNamespace() == NS_USER ) {
 150                         return $title;
 151                 }
 152
 153                 # Go to images that exist even if there's no local page.
 154                 # There may have been a funny upload, or it may be on a shared
 155                 # file repository such as Wikimedia Commons.
 156                 if( $title->getNamespace() == NS_FILE ) {
 157                         $image = wfFindFile( $title );
 158                         if( $image ) {
 159                                 return $title;
 160                         }
 161                 }
 162
 163                 # MediaWiki namespace? Page may be "implied" if not customized.
 164                 # Just return it, with caps forced as the message system likes it.
 165                 if( $title->getNamespace() == NS_MEDIAWIKI ) {
 166                         return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
 167                 }
 168
 169                 # Quoted term? Try without the quotes...
 170                 $matches = array();
 171                 if( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
 172                         return SearchEngine::getNearMatch( $matches[1] );
 173                 }
 174
 175                 return null;
 176         }
 177
 178         public static function legalSearchChars() {
 179                 return "A-Za-z_'.0-9\\x80-\\xFF\\-";
 180         }
 181
 182         /**
 183          * Set the maximum number of results to return
 184          * and how many to skip before returning the first.
 185          *
 186          * @param $limit Integer
 187          * @param $offset Integer
 188          */
 189         function setLimitOffset( $limit, $offset = 0 ) {
 190                 $this->limit = intval( $limit );
 191                 $this->offset = intval( $offset );
 192         }
 193
 194         /**
 195          * Set which namespaces the search should include.
 196          * Give an array of namespace index numbers.
 197          *
 198          * @param $namespaces Array
 199          */
 200         function setNamespaces( $namespaces ) {
 201                 $this->namespaces = $namespaces;
 202         }
 203
 204         /**
 205          * Parse some common prefixes: all (search everything)
 206          * or namespace names
 207          *
 208          * @param $query String
 209          */
 210         function replacePrefixes( $query ){
 211                 global $wgContLang;
 212
 213                 $parsed = $query;
 214                 if( strpos($query,':') === false ) { // nothing to do
 215                         wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );
 216                         return $parsed;
 217                 }
 218
 219                 $allkeyword = wfMsgForContent('searchall').":";
 220                 if( strncmp($query, $allkeyword, strlen($allkeyword)) == 0 ){
 221                         $this->namespaces = null;
 222                         $parsed = substr($query,strlen($allkeyword));
 223                 } else if( strpos($query,':') !== false ) {
 224                         $prefix = substr($query,0,strpos($query,':'));
 225                         $index = $wgContLang->getNsIndex($prefix);
 226                         if($index !== false){
 227                                 $this->namespaces = array($index);
 228                                 $parsed = substr($query,strlen($prefix)+1);
 229                         }
 230                 }
 231                 if(trim($parsed) == '')
 232                         $parsed = $query; // prefix was the whole query
 233
 234                 wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );
 235
 236                 return $parsed;
 237         }
 238
 239         /**
 240          * Make a list of searchable namespaces and their canonical names.
 241          * @return Array
 242          */
 243         public static function searchableNamespaces() {
 244                 global $wgContLang;
 245                 $arr = array();
 246                 foreach( $wgContLang->getNamespaces() as $ns => $name ) {
 247                         if( $ns >= NS_MAIN ) {
 248                                 $arr[$ns] = $name;
 249                         }
 250                 }
 251
 252                 wfRunHooks( 'SearchableNamespaces', array( &$arr ) );
 253                 return $arr;
 254         }
 255
 256         /**
 257          * Extract default namespaces to search from the given user's
 258          * settings, returning a list of index numbers.
 259          *
 260          * @param $user User
 261          * @return Array
 262          */
 263         public static function userNamespaces( $user ) {
 264                 global $wgSearchEverythingOnlyLoggedIn;
 265
 266                 // get search everything preference, that can be set to be read for logged-in users
 267                 $searcheverything = false;
 268                 if( ( $wgSearchEverythingOnlyLoggedIn && $user->isLoggedIn() )
 269                     || !$wgSearchEverythingOnlyLoggedIn )
 270                         $searcheverything = $user->getOption('searcheverything');
 271
 272                 // searcheverything overrides other options
 273                 if( $searcheverything )
 274                         return array_keys(SearchEngine::searchableNamespaces());
 275
 276                 $arr = Preferences::loadOldSearchNs( $user );
 277                 $searchableNamespaces = SearchEngine::searchableNamespaces();
 278
 279                 $arr = array_intersect( $arr, array_keys($searchableNamespaces) ); // Filter
 280
 281                 return $arr;
 282         }
 283
 284         /**
 285          * Find snippet highlight settings for a given user
 286          *
 287          * @param $user User
 288          * @return Array contextlines, contextchars
 289          */
 290         public static function userHighlightPrefs( &$user ){
 291                 //$contextlines = $user->getOption( 'contextlines',  5 );
 292                 //$contextchars = $user->getOption( 'contextchars', 50 );
 293                 $contextlines = 2; // Hardcode this. Old defaults sucked. :)
 294                 $contextchars = 75; // same as above.... :P
 295                 return array($contextlines, $contextchars);
 296         }
 297
 298         /**
 299          * An array of namespaces indexes to be searched by default
 300          *
 301          * @return Array
 302          */
 303         public static function defaultNamespaces(){
 304                 global $wgNamespacesToBeSearchedDefault;
 305
 306                 return array_keys($wgNamespacesToBeSearchedDefault, true);
 307         }
 308
 309         /**
 310          * Get a list of namespace names useful for showing in tooltips
 311          * and preferences
 312          *
 313          * @param $namespaces Array
 314          */
 315         public static function namespacesAsText( $namespaces ){
 316                 global $wgContLang;
 317
 318                 $formatted = array_map( array($wgContLang,'getFormattedNsText'), $namespaces );
 319                 foreach( $formatted as $key => $ns ){
 320                         if ( empty($ns) )
 321                                 $formatted[$key] = wfMsg( 'blanknamespace' );
 322                 }
 323                 return $formatted;
 324         }
 325
 326         /**
 327          * Return the help namespaces to be shown on Special:Search
 328          *
 329          * @return Array
 330          */
 331         public static function helpNamespaces() {
 332                 global $wgNamespacesToBeSearchedHelp;
 333
 334                 return array_keys( $wgNamespacesToBeSearchedHelp, true );
 335         }
 336
 337         /**
 338          * Return a 'cleaned up' search string
 339          *
 340          * @param $text String
 341          * @return String
 342          */
 343         function filter( $text ) {
 344                 $lc = $this->legalSearchChars();
 345                 return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
 346         }
 347         /**
 348          * Load up the appropriate search engine class for the currently
 349          * active database backend, and return a configured instance.
 350          *
 351          * @return SearchEngine
 352          */
 353         public static function create() {
 354                 global $wgSearchType;
 355                 $dbr = wfGetDB( DB_SLAVE );
 356                 if( $wgSearchType ) {
 357                         $class = $wgSearchType;
 358                 } else {
 359                         $class = $dbr->getSearchEngine();
 360                 }
 361                 $search = new $class( $dbr );
 362                 $search->setLimitOffset(0,0);
 363                 return $search;
 364         }
 365
 366         /**
 367          * Create or update the search index record for the given page.
 368          * Title and text should be pre-processed.
 369          * STUB
 370          *
 371          * @param $id Integer
 372          * @param $title String
 373          * @param $text String
 374          */
 375         function update( $id, $title, $text ) {
 376                 // no-op
 377         }
 378
 379         /**
 380          * Update a search index record's title only.
 381          * Title should be pre-processed.
 382          * STUB
 383          *
 384          * @param $id Integer
 385          * @param $title String
 386          */
 387         function updateTitle( $id, $title ) {
 388                 // no-op
 389         }
 390
 391         /**
 392          * Get OpenSearch suggestion template
 393          *
 394          * @return String
 395          */
 396         public static function getOpenSearchTemplate() {
 397                 global $wgOpenSearchTemplate, $wgServer, $wgScriptPath;
 398                 if( $wgOpenSearchTemplate )     {
 399                         return $wgOpenSearchTemplate;
 400                 } else {
 401                         $ns = implode( '|', SearchEngine::defaultNamespaces() );
 402                         if( !$ns ) $ns = "0";
 403                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace='.$ns;
 404                 }
 405         }
 406
 407         /**
 408          * Get internal MediaWiki Suggest template
 409          *
 410          * @return String
 411          */
 412         public static function getMWSuggestTemplate() {
 413                 global $wgMWSuggestTemplate, $wgServer, $wgScriptPath;
 414                 if($wgMWSuggestTemplate)
 415                         return $wgMWSuggestTemplate;
 416                 else
 417                         return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}&suggest';
 418         }
 419 }
 420
 421 /**
 422  * @ingroup Search
 423  */
 424 class SearchResultSet {
 425         /**
 426          * Fetch an array of regular expression fragments for matching
 427          * the search terms as parsed by this engine in a text extract.
 428          * STUB
 429          *
 430          * @return Array
 431          */
 432         function termMatches() {
 433                 return array();
 434         }
 435
 436         function numRows() {
 437                 return 0;
 438         }
 439
 440         /**
 441          * Return true if results are included in this result set.
 442          * STUB
 443          *
 444          * @return Boolean
 445          */
 446         function hasResults() {
 447                 return false;
 448         }
 449
 450         /**
 451          * Some search modes return a total hit count for the query
 452          * in the entire article database. This may include pages
 453          * in namespaces that would not be matched on the given
 454          * settings.
 455          *
 456          * Return null if no total hits number is supported.
 457          *
 458          * @return Integer
 459          */
 460         function getTotalHits() {
 461                 return null;
 462         }
 463
 464         /**
 465          * Some search modes return a suggested alternate term if there are
 466          * no exact hits. Returns true if there is one on this set.
 467          *
 468          * @return Boolean
 469          */
 470         function hasSuggestion() {
 471                 return false;
 472         }
 473
 474         /**
 475          * @return String: suggested query, null if none
 476          */
 477         function getSuggestionQuery(){
 478                 return null;
 479         }
 480
 481         /**
 482          * @return String: HTML highlighted suggested query, '' if none
 483          */
 484         function getSuggestionSnippet(){
 485                 return '';
 486         }
 487
 488         /**
 489          * Return information about how and from where the results were fetched,
 490          * should be useful for diagnostics and debugging
 491          *
 492          * @return String
 493          */
 494         function getInfo() {
 495                 return null;
 496         }
 497
 498         /**
 499          * Return a result set of hits on other (multiple) wikis associated with this one
 500          *
 501          * @return SearchResultSet
 502          */
 503         function getInterwikiResults() {
 504                 return null;
 505         }
 506
 507         /**
 508          * Check if there are results on other wikis
 509          *
 510          * @return Boolean
 511          */
 512         function hasInterwikiResults() {
 513                 return $this->getInterwikiResults() != null;
 514         }
 515
 516
 517         /**
 518          * Fetches next search result, or false.
 519          * STUB
 520          *
 521          * @return SearchResult
 522          */
 523         function next() {
 524                 return false;
 525         }
 526
 527         /**
 528          * Frees the result set, if applicable.
 529          */
 530         function free() {
 531                 // ...
 532         }
 533 }
 534
 535 /**
 536  * This class is used for different SQL-based search engines shipped with MediaWiki
 537  */
 538 class SqlSearchResultSet extends SearchResultSet {
 539         function __construct( $resultSet, $terms ) {
 540                 $this->mResultSet = $resultSet;
 541                 $this->mTerms = $terms;
 542         }
 543
 544         function termMatches() {
 545                 return $this->mTerms;
 546         }
 547
 548         function numRows() {
 549                 return $this->mResultSet->numRows();
 550         }
 551
 552         function next() {
 553                 if ($this->mResultSet === false )
 554                         return false;
 555
 556                 $row = $this->mResultSet->fetchObject();
 557                 if ($row === false)
 558                         return false;
 559                 return new SearchResult($row);
 560         }
 561
 562         function free() {
 563                 $this->mResultSet->free();
 564         }
 565 }
 566
 567 /**
 568  * @ingroup Search
 569  */
 570 class SearchResultTooMany {
 571         ## Some search engines may bail out if too many matches are found
 572 }
 573
 574
 575 /**
 576  * @todo Fixme: This class is horribly factored. It would probably be better to
 577  * have a useful base class to which you pass some standard information, then
 578  * let the fancy self-highlighters extend that.
 579  * @ingroup Search
 580  */
 581 class SearchResult {
 582         var $mRevision = null;
 583         var $mImage = null;
 584
 585         function __construct( $row ) {
 586                 $this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
 587                 if( !is_null($this->mTitle) ){
 588                         $this->mRevision = Revision::newFromTitle( $this->mTitle );
 589                         if( $this->mTitle->getNamespace() === NS_FILE )
 590                                 $this->mImage = wfFindFile( $this->mTitle );
 591                 }
 592         }
 593
 594         /**
 595          * Check if this is result points to an invalid title
 596          *
 597          * @return Boolean
 598          */
 599         function isBrokenTitle(){
 600                 if( is_null($this->mTitle) )
 601                         return true;
 602                 return false;
 603         }
 604
 605         /**
 606          * Check if target page is missing, happens when index is out of date
 607          *
 608          * @return Boolean
 609          */
 610         function isMissingRevision(){
 611                 return !$this->mRevision && !$this->mImage;
 612         }
 613
 614         /**
 615          * @return Title
 616          */
 617         function getTitle() {
 618                 return $this->mTitle;
 619         }
 620
 621         /**
 622          * @return Double or null if not supported
 623          */
 624         function getScore() {
 625                 return null;
 626         }
 627
 628         /**
 629          * Lazy initialization of article text from DB
 630          */
 631         protected function initText(){
 632                 if( !isset($this->mText) ){
 633                         if($this->mRevision != null)
 634                                 $this->mText = $this->mRevision->getText();
 635                         else // TODO: can we fetch raw wikitext for commons images?
 636                                 $this->mText = '';
 637
 638                 }
 639         }
 640
 641         /**
 642          * @param $terms Array: terms to highlight
 643          * @return String: highlighted text snippet, null (and not '') if not supported
 644          */
 645         function getTextSnippet($terms){
 646                 global $wgUser, $wgAdvancedSearchHighlighting;
 647                 $this->initText();
 648                 list($contextlines,$contextchars) = SearchEngine::userHighlightPrefs($wgUser);
 649                 $h = new SearchHighlighter();
 650                 if( $wgAdvancedSearchHighlighting )
 651                         return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
 652                 else
 653                         return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
 654         }
 655
 656         /**
 657          * @param $terms Array: terms to highlight
 658          * @return String: highlighted title, '' if not supported
 659          */
 660         function getTitleSnippet($terms){
 661                 return '';
 662         }
 663
 664         /**
 665          * @param $terms Array: terms to highlight
 666          * @return String: highlighted redirect name (redirect to this page), '' if none or not supported
 667          */
 668         function getRedirectSnippet($terms){
 669                 return '';
 670         }
 671
 672         /**
 673          * @return Title object for the redirect to this page, null if none or not supported
 674          */
 675         function getRedirectTitle(){
 676                 return null;
 677         }
 678
 679         /**
 680          * @return string highlighted relevant section name, null if none or not supported
 681          */
 682         function getSectionSnippet(){
 683                 return '';
 684         }
 685
 686         /**
 687          * @return Title object (pagename+fragment) for the section, null if none or not supported
 688          */
 689         function getSectionTitle(){
 690                 return null;
 691         }
 692
 693         /**
 694          * @return String: timestamp
 695          */
 696         function getTimestamp(){
 697                 if( $this->mRevision )
 698                         return $this->mRevision->getTimestamp();
 699                 else if( $this->mImage )
 700                         return $this->mImage->getTimestamp();
 701                 return '';
 702         }
 703
 704         /**
 705          * @return Integer: number of words
 706          */
 707         function getWordCount(){
 708                 $this->initText();
 709                 return str_word_count( $this->mText );
 710         }
 711
 712         /**
 713          * @return Integer: size in bytes
 714          */
 715         function getByteSize(){
 716                 $this->initText();
 717                 return strlen( $this->mText );
 718         }
 719
 720         /**
 721          * @return Boolean if hit has related articles
 722          */
 723         function hasRelated(){
 724                 return false;
 725         }
 726
 727         /**
 728          * @return String: interwiki prefix of the title (return iw even if title is broken)
 729          */
 730         function getInterwikiPrefix(){
 731                 return '';
 732         }
 733 }
 734
 735 /**
 736  * Highlight bits of wikitext
 737  *
 738  * @ingroup Search
 739  */
 740 class SearchHighlighter {
 741         var $mCleanWikitext = true;
 742
 743         function SearchHighlighter($cleanupWikitext = true){
 744                 $this->mCleanWikitext = $cleanupWikitext;
 745         }
 746
 747         /**
 748          * Default implementation of wikitext highlighting
 749          *
 750          * @param $text String
 751          * @param $terms Array: terms to highlight (unescaped)
 752          * @param $contextlines Integer
 753          * @param $contextchars Integer
 754          * @return String
 755          */
 756         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
 757                 global $wgLang, $wgContLang;
 758                 global $wgSearchHighlightBoundaries;
 759                 $fname = __METHOD__;
 760
 761                 if($text == '')
 762                         return '';
 763
 764                 // spli text into text + templates/links/tables
 765                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
 766                 // first capture group is for detecting nested templates/links/tables/references
 767                 $endPatterns = array(
 768                         1 => '/(\{\{)|(\}\})/', // template
 769                         2 => '/(\[\[)|(\]\])/', // image
 770                         3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table
 771
 772                 // FIXME: this should prolly be a hook or something
 773                 if(function_exists('wfCite')){
 774                         $spat .= '|(<ref>)'; // references via cite extension
 775                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
 776                 }
 777                 $spat .= '/';
 778                 $textExt = array(); // text extracts
 779                 $otherExt = array();  // other extracts
 780                 wfProfileIn( "$fname-split" );
 781                 $start = 0;
 782                 $textLen = strlen($text);
 783                 $count = 0; // sequence number to maintain ordering
 784                 while( $start < $textLen ){
 785                         // find start of template/image/table
 786                         if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){
 787                                 $epat = '';
 788                                 foreach($matches as $key => $val){
 789                                         if($key > 0 && $val[1] != -1){
 790                                                 if($key == 2){
 791                                                         // see if this is an image link
 792                                                         $ns = substr($val[0],2,-1);
 793                                                         if( $wgContLang->getNsIndex($ns) != NS_FILE )
 794                                                                 break;
 795
 796                                                 }
 797                                                 $epat = $endPatterns[$key];
 798                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
 799                                                 $start = $val[1];
 800                                                 break;
 801                                         }
 802                                 }
 803                                 if( $epat ){
 804                                         // find end (and detect any nested elements)
 805                                         $level = 0;
 806                                         $offset = $start + 1;
 807                                         $found = false;
 808                                         while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){
 809                                                 if( array_key_exists(2,$endMatches) ){
 810                                                         // found end
 811                                                         if($level == 0){
 812                                                                 $len = strlen($endMatches[2][0]);
 813                                                                 $off = $endMatches[2][1];
 814                                                                 $this->splitAndAdd( $otherExt, $count,
 815                                                                         substr( $text, $start, $off + $len  - $start ) );
 816                                                                 $start = $off + $len;
 817                                                                 $found = true;
 818                                                                 break;
 819                                                         } else{
 820                                                                 // end of nested element
 821                                                                 $level -= 1;
 822                                                         }
 823                                                 } else{
 824                                                         // nested
 825                                                         $level += 1;
 826                                                 }
 827                                                 $offset = $endMatches[0][1] + strlen($endMatches[0][0]);
 828                                         }
 829                                         if( ! $found ){
 830                                                 // couldn't find appropriate closing tag, skip
 831                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) );
 832                                                 $start += strlen($matches[0][0]);
 833                                         }
 834                                         continue;
 835                                 }
 836                         }
 837                         // else: add as text extract
 838                         $this->splitAndAdd( $textExt, $count, substr($text,$start) );
 839                         break;
 840                 }
 841
 842                 $all = $textExt + $otherExt; // these have disjunct key sets
 843
 844                 wfProfileOut( "$fname-split" );
 845
 846                 // prepare regexps
 847                 foreach( $terms as $index => $term ) {
 848                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
 849                         if(preg_match('/[\x80-\xff]/', $term) ){
 850                                 $terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]);
 851                         } else {
 852                                 $terms[$index] = $term;
 853                         }
 854                 }
 855                 $anyterm = implode( '|', $terms );
 856                 $phrase = implode("$wgSearchHighlightBoundaries+", $terms );
 857
 858                 // FIXME: a hack to scale contextchars, a correct solution
 859                 // would be to have contextchars actually be char and not byte
 860                 // length, and do proper utf-8 substrings and lengths everywhere,
 861                 // but PHP is making that very hard and unclean to implement :(
 862                 $scale = strlen($anyterm) / mb_strlen($anyterm);
 863                 $contextchars = intval( $contextchars * $scale );
 864
 865                 $patPre = "(^|$wgSearchHighlightBoundaries)";
 866                 $patPost = "($wgSearchHighlightBoundaries|$)";
 867
 868                 $pat1 = "/(".$phrase.")/ui";
 869                 $pat2 = "/$patPre(".$anyterm.")$patPost/ui";
 870
 871                 wfProfileIn( "$fname-extract" );
 872
 873                 $left = $contextlines;
 874
 875                 $snippets = array();
 876                 $offsets = array();
 877
 878                 // show beginning only if it contains all words
 879                 $first = 0;
 880                 $firstText = '';
 881                 foreach($textExt as $index => $line){
 882                         if(strlen($line)>0 && $line[0] != ';' && $line[0] != ':'){
 883                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
 884                                 $first = $index;
 885                                 break;
 886                         }
 887                 }
 888                 if( $firstText ){
 889                         $succ = true;
 890                         // check if first text contains all terms
 891                         foreach($terms as $term){
 892                                 if( ! preg_match("/$patPre".$term."$patPost/ui", $firstText) ){
 893                                         $succ = false;
 894                                         break;
 895                                 }
 896                         }
 897                         if( $succ ){
 898                                 $snippets[$first] = $firstText;
 899                                 $offsets[$first] = 0;
 900                         }
 901                 }
 902                 if( ! $snippets ) {
 903                         // match whole query on text
 904                         $this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets);
 905                         // match whole query on templates/tables/images
 906                         $this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets);
 907                         // match any words on text
 908                         $this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets);
 909                         // match any words on templates/tables/images
 910                         $this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets);
 911
 912                         ksort($snippets);
 913                 }
 914
 915                 // add extra chars to each snippet to make snippets constant size
 916                 $extended = array();
 917                 if( count( $snippets ) == 0){
 918                         // couldn't find the target words, just show beginning of article
 919                         $targetchars = $contextchars * $contextlines;
 920                         $snippets[$first] = '';
 921                         $offsets[$first] = 0;
 922                 } else{
 923                         // if begin of the article contains the whole phrase, show only that !!
 924                         if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first])
 925                             && $offsets[$first] < $contextchars * 2 ){
 926                                 $snippets = array ($first => $snippets[$first]);
 927                         }
 928
 929                         // calc by how much to extend existing snippets
 930                         $targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) );
 931                 }
 932
 933                 foreach($snippets as $index => $line){
 934                         $extended[$index] = $line;
 935                         $len = strlen($line);
 936                         if( $len < $targetchars - 20 ){
 937                                 // complete this line
 938                                 if($len < strlen( $all[$index] )){
 939                                         $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]);
 940                                         $len = strlen( $extended[$index] );
 941                                 }
 942
 943                                 // add more lines
 944                                 $add = $index + 1;
 945                                 while( $len < $targetchars - 20
 946                                        && array_key_exists($add,$all)
 947                                        && !array_key_exists($add,$snippets) ){
 948                                     $offsets[$add] = 0;
 949                                     $tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
 950                                         $extended[$add] = $tt;
 951                                         $len += strlen( $tt );
 952                                         $add++;
 953                                 }
 954                         }
 955                 }
 956
 957                 //$snippets = array_map('htmlspecialchars', $extended);
 958                 $snippets = $extended;
 959                 $last = -1;
 960                 $extract = '';
 961                 foreach($snippets as $index => $line){
 962                         if($last == -1)
 963                                 $extract .= $line; // first line
 964                         elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last]))
 965                                 $extract .= " ".$line; // continous lines
 966                         else
 967                                 $extract .= '<b> ... </b>' . $line;
 968
 969                         $last = $index;
 970                 }
 971                 if( $extract )
 972                         $extract .= '<b> ... </b>';
 973
 974                 $processed = array();
 975                 foreach($terms as $term){
 976                         if( ! isset($processed[$term]) ){
 977                                 $pat3 = "/$patPre(".$term.")$patPost/ui"; // highlight word
 978                                 $extract = preg_replace( $pat3,
 979                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
 980                                 $processed[$term] = true;
 981                         }
 982                 }
 983
 984                 wfProfileOut( "$fname-extract" );
 985
 986                 return $extract;
 987         }
 988
 /**
  * Split text into lines and append every non-empty line to an extracts array.
  *
  * When $this->mCleanWikitext is truthy the text is first passed through
  * removeWiki(). The result is split on "\n" and each line is trimmed; lines
  * that are empty after trimming are skipped.
  *
  * @param $extracts Array: (in/out) map of index -> line that receives the lines
  * @param $count Integer: (in/out) index used for the next stored line; it is
  *        incremented once for every line that is stored
  * @param $text String: text to split
  */
 function splitAndAdd( &$extracts, &$count, $text ) {
     $source = $this->mCleanWikitext ? $this->removeWiki( $text ) : $text;

     foreach ( explode( "\n", $source ) as $line ) {
         $trimmed = trim( $line );
         // Compare against '' explicitly: a plain truthiness test would also
         // discard a line that consists solely of "0".
         if ( $trimmed !== '' ) {
             $extracts[$count++] = $trimmed;
         }
     }
 }
1004
/**
 * Do manual case conversion for non-ascii chars.
 *
 * Used as a preg_replace_callback() handler: a match that is longer than one
 * byte is turned into a regex character class containing its lower-case form
 * followed by its upper-case form (both obtained from $wgContLang). A match
 * of at most one byte is returned unchanged.
 *
 * @param $matches Array: match data; only element 0 is used
 * @return String
 */
function caseCallback( $matches ) {
    global $wgContLang;

    $char = $matches[0];
    if ( strlen( $char ) <= 1 ) {
        // single byte: nothing to convert
        return $char;
    }

    $lower = $wgContLang->lc( $char );
    $upper = $wgContLang->uc( $char );
    return '[' . $lower . $upper . ']';
}
1017
/**
 * Extract part of the text from start to end, but by
 * not chopping up words.
 *
 * Offsets are byte offsets. A start of 0 is used as is; any other start is
 * adjusted with position( $text, $start, 1 ). An end at or beyond the end of
 * the text is clamped to the text length; any other end is adjusted with
 * position( $text, $end ).
 *
 * @param $text String
 * @param $start Integer: requested start offset
 * @param $end Integer: requested end offset
 * @param $posStart Integer: (out) actual start position
 * @param $posEnd Integer: (out) actual end position
 * @return String: the extracted text, or '' when the adjusted end does not
 *         lie after the adjusted start
 */
function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
    $length = strlen( $text );

    if ( $start != 0 ) {
        $start = $this->position( $text, $start, 1 );
    }
    if ( $end >= $length ) {
        $end = $length;
    } else {
        $end = $this->position( $text, $end );
    }

    // Always report the adjusted boundaries. Writing them only when the
    // caller's variable was already non-null left an unset variable unfilled;
    // when the argument is omitted the assignment only touches a local.
    $posStart = $start;
    $posEnd = $end;

    if ( $end > $start ) {
        return substr( $text, $start, $end - $start );
    }
    return '';
}
1048
/**
 * Find a nonletter near a point (index) in the text.
 *
 * A window reaching $tolerance (10) bytes to either side of $point, clipped
 * to the text, is scanned for a space or one of a fixed set of punctuation
 * characters. If one is found, the index of the FIRST such character in the
 * window (not necessarily the one closest to $point) plus $offset is
 * returned. Otherwise $point itself is returned, moved forward past any UTF-8
 * continuation bytes (0x80-0xBF) so that it does not land inside a character.
 *
 * @param $text String
 * @param $point Integer: byte index into $text
 * @param $offset Integer: offset to found index (only applied when a
 *        nonletter was found)
 * @return Integer: nonletter index plus $offset, or beginning of utf8 char if
 *         none; never more than strlen( $text ) in the latter case
 */
function position( $text, $point, $offset = 0 ) {
    $tolerance = 10;
    $length = strlen( $text );

    $windowStart = max( 0, $point - $tolerance );
    $windowLength = min( $length, $point + $tolerance ) - $windowStart;
    $m = array();
    // An empty or inverted window (point far outside the text) cannot match.
    if ( $windowLength > 0
        && preg_match( '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
            substr( $text, $windowStart, $windowLength ), $m, PREG_OFFSET_CAPTURE )
    ) {
        return $m[0][1] + $windowStart + $offset;
    }

    // No nonletter nearby. Guard the byte access below: indexing at or past
    // the end of the string raises a warning, and a negative index would
    // read from the end of the string.
    if ( $point >= $length ) {
        return $length;
    }
    $point = max( 0, $point );

    // make sure the point is on a valid first UTF8 byte
    while ( $point < $length ) {
        $byte = ord( $text[$point] );
        if ( $byte < 0x80 || $byte >= 0xc0 ) {
            return $point;
        }
        // skip trailing bytes
        $point++;
    }
    return $length;
}
1078
/**
 * Search extracts for a pattern, and return snippets.
 *
 * Lines are visited in array order. A line that already has an entry in $out
 * or that does not match $pattern is skipped. For a matching line a window of
 * $contextchars bytes is placed around the first match and handed to
 * extract(); the resulting snippet and its actual start offset are stored
 * under the line's index. Processing stops as soon as $linesleft reaches 0.
 *
 * @param $pattern String: regexp for matching lines
 * @param $extracts Array: extracts to search
 * @param $linesleft Integer: (in/out) number of extracts to make; decremented
 *        for every snippet produced
 * @param $contextchars Integer: length of snippet (taken by reference but not
 *        modified here)
 * @param $out Array: (in/out) map for highlighted snippets
 * @param $offsets Array: (in/out) map of starting points of snippets
 * @protected
 */
function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
    if ( $linesleft == 0 ) {
        return; // nothing to do
    }

    foreach ( $extracts as $index => $line ) {
        $hit = array();
        // skip lines that are already highlighted or do not match
        if ( array_key_exists( $index, $out )
            || !preg_match( $pattern, $line, $hit, PREG_OFFSET_CAPTURE )
        ) {
            continue;
        }

        $matchAt = $hit[0][1];
        $matchLen = strlen( $hit[0][0] );

        if ( $matchAt + $matchLen < $contextchars ) {
            // the match ends within the first window: start at the beginning
            $begin = 0;
        } elseif ( $matchLen > $contextchars ) {
            // the match alone is longer than the window: start at the match
            $begin = $matchAt;
        } else {
            // centre the window on the match
            $begin = $matchAt + intval( ( $matchLen - $contextchars ) / 2 );
        }

        // basic snippet from this line
        $actualBegin = $begin;
        $out[$index] = $this->extract( $line, $begin, $begin + $contextchars, $actualBegin );
        $offsets[$index] = $actualBegin;

        if ( --$linesleft == 0 ) {
            return;
        }
    }
}
1121
/**
 * Basic wikitext removal.
 *
 * Applies a fixed sequence of regex replacements, in this order:
 *  - {{...}} without a pipe is removed
 *  - {{...|rest}} is replaced by the text after the first pipe
 *  - [[target]] without a pipe is replaced by the target
 *  - [[target|caption]] is handed to linkReplace()
 *  - anything that looks like a <tag> is removed
 *  - runs of five, three and two apostrophes (and i/u/b tags) are removed
 *
 * @param $text String
 * @return String
 * @protected
 */
function removeWiki( $text ) {
    $fname = __METHOD__;
    wfProfileIn( $fname );

    // templates and pipe-less links: pattern, replacement
    $rules = array(
        array( "/\\{\\{([^|]+?)\\}\\}/", "" ),
        array( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2" ),
        array( "/\\[\\[([^|]+?)\\]\\]/", "\\1" ),
    );
    foreach ( $rules as $rule ) {
        $text = preg_replace( $rule[0], $rule[1], $text );
    }

    // piped links need a decision per link, see linkReplace()
    $text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );

    // tags and quote markup are simply dropped; longest quote run first
    $markup = array(
        "/<\/?[^>]+>/",
        "/'''''/",
        "/('''|<\/?[iIuUbB]>)/",
        "/''/",
    );
    foreach ( $markup as $pattern ) {
        $text = preg_replace( $pattern, "", $text );
    }

    wfProfileOut( $fname );
    return $text;
}
1149
/**
 * Callback to replace [[target|caption]] kind of links: if the target is in
 * the file or category namespace the link is left as it is, otherwise it is
 * replaced with its caption.
 *
 * The namespace is the part of element 1 before the first colon, looked up
 * with $wgContLang->getNsIndex(). Without a colon the caption is returned.
 *
 * @param $matches Array: element 0 is the whole link, element 1 is searched
 *        for the namespace prefix, element 2 is the caption
 * @return String
 */
function linkReplace( $matches ) {
    global $wgContLang;

    $colon = strpos( $matches[1], ':' );
    if ( $colon !== false ) {
        $index = $wgContLang->getNsIndex( substr( $matches[1], 0, $colon ) );
        $keepLink = $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY );
        if ( $keepLink ) {
            return $matches[0]; // return the whole thing
        }
    }

    return $matches[2]; // replace with caption
}
1169
/**
 * Simple & fast snippet extraction, but gives completely irrelevant
 * snippets.
 *
 * The text is processed line by line until $contextlines matching lines have
 * been used. For a line that matches one of the terms, the output is built
 * from the text before the match, the match itself and up to
 * $contextchars + 1 bytes after it; the parts before and after are shortened
 * with $wgContLang->truncate(). Because the leading (.*) of the pattern is
 * greedy, the match used is the last one on the line.
 *
 * The returned string is HTML: all text is escaped with htmlspecialchars()
 * and every occurrence of a term is wrapped in <span class='searchmatch'>.
 *
 * @param $text String
 * @param $terms Array: regular expression fragments; they are joined with
 *        '|' and embedded in the patterns without further quoting
 * @param $contextlines Integer: maximum number of matching lines to output
 * @param $contextchars Integer: amount of context kept around the match
 * @return String: one "\n"-terminated line of HTML per matching line
 */
public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
    global $wgContLang;
    $fname = __METHOD__;

    $lines = explode( "\n", $text );

    $terms = implode( '|', $terms );
    $max = intval( $contextchars ) + 1;
    $pat1 = "/(.*)($terms)(.{0,$max})/i";
    $pat2 = '/(' . $terms . ')/i'; // highlight pattern, loop-invariant

    $extract = '';
    wfProfileIn( "$fname-extract" );
    foreach ( $lines as $line ) {
        if ( 0 == $contextlines ) {
            break;
        }
        $m = array();
        if ( !preg_match( $pat1, $line, $m ) ) {
            continue;
        }
        --$contextlines;

        $pre = $wgContLang->truncate( $m[1], -$contextchars );
        $found = $m[2];
        // test the index that is actually read
        $post = isset( $m[3] ) ? $wgContLang->truncate( $m[3], $contextchars ) : '';
        $snippet = $pre . $found . $post;

        // Locate the terms in the raw text and escape piece by piece.
        // Matching on already-escaped HTML let a term hit the inside of an
        // entity such as &amp; (corrupting it) and made terms containing
        // &, <, > or quotes impossible to highlight.
        $html = '';
        $cursor = 0;
        $hits = array();
        if ( preg_match_all( $pat2, $snippet, $hits, PREG_OFFSET_CAPTURE ) ) {
            foreach ( $hits[0] as $hit ) {
                $matched = $hit[0];
                $at = $hit[1];
                if ( $matched === '' ) {
                    continue; // an empty match has nothing to highlight
                }
                $html .= htmlspecialchars( substr( $snippet, $cursor, $at - $cursor ) )
                    . "<span class='searchmatch'>" . htmlspecialchars( $matched ) . '</span>';
                $cursor = $at + strlen( $matched );
            }
        }
        $html .= htmlspecialchars( substr( $snippet, $cursor ) );

        $extract .= $html . "\n";
    }
    wfProfileOut( "$fname-extract" );

    return $extract;
}
1225
1226 }
1227
/**
 * Dummy class to be used when a non-supported database engine is present.
 * It adds and overrides nothing, so every call falls through to the
 * implementations inherited from SearchEngine.
 *
 * @todo FIXME: the dummy class should probably try something at least mildly
 * useful, such as a LIKE search through titles.
 * @ingroup Search
 */
class SearchEngineDummy extends SearchEngine {
    // intentionally empty
}