Revert for now:
[lhc/web/wiklou.git] / includes / SearchEngine.php
1 <?php
2 /**
3 * Contains the SearchEngine base class and the search result classes
4 * @addtogroup Search
5 */
class SearchEngine {
	/** Maximum number of results to return; see setLimitOffset(). */
	public $limit = 10;
	/** Number of results to skip before the first returned one; see setLimitOffset(). */
	public $offset = 0;
	/** Not written by this base class; presumably filled in by backend subclasses. */
	public $searchTerms = array();
	/** Namespace index numbers to search, or null after an "all:" prefix; see setNamespaces(). */
	public $namespaces = array( NS_MAIN );
	/** Not read by this base class; presumably honoured by backend subclasses. */
	public $showRedirects = false;

	/**
	 * Perform a full text search query and return a result set.
	 * If text searches are not supported or disabled, return null.
	 *
	 * @param string $term - Raw search term
	 * @return SearchResultSet
	 * @abstract
	 */
	public function searchText( $term ) {
		return null;
	}

	/**
	 * Perform a title-only search query and return a result set.
	 * If title searches are not supported or disabled, return null.
	 *
	 * @param string $term - Raw search term
	 * @return SearchResultSet
	 * @abstract
	 */
	public function searchTitle( $term ) {
		return null;
	}

	/**
	 * If an exact title match can be found, or a very slightly close match,
	 * return the title. If no match, returns null.
	 *
	 * The search term and each of its language variants are tried in turn:
	 * first verbatim, then in several case foldings, and finally the
	 * 'SearchGetNearMatch' hook is given a chance. If nothing matched, a few
	 * special cases (IP addresses, user pages, files, MediaWiki messages and
	 * quoted terms) are handled for the original search term.
	 *
	 * @param string $searchterm
	 * @return Title
	 */
	public static function getNearMatch( $searchterm ) {
		global $wgContLang, $wgCapitalLinks;

		$allSearchTerms = array( $searchterm );

		if ( $wgContLang->hasVariants() ) {
			$allSearchTerms = array_merge(
				$allSearchTerms,
				$wgContLang->convertLinkToAllVariants( $searchterm )
			);
		}

		# Language methods used to derive case variants, in the order in
		# which they are tried: all lower case (i.e. only the first letter
		# capitalized), capitalized words, all upper case, and
		# Word-Caps-Breaking-At-Word-Breaks for hyphenated names etc.
		$caseFolds = array( 'lc', 'ucwords', 'uc', 'ucwordbreaks' );
		if ( !$wgCapitalLinks ) {
			# Catch differs-by-first-letter-case-only
			$caseFolds[] = 'ucfirst';
			$caseFolds[] = 'lcfirst';
		}

		foreach ( $allSearchTerms as $term ) {
			# Exact match? No need to look further.
			$title = Title::newFromText( $term );
			if ( is_null( $title ) ) {
				return null;
			}

			if ( $title->getNamespace() == NS_SPECIAL || $title->exists() ) {
				return $title;
			}

			foreach ( $caseFolds as $fold ) {
				$title = Title::newFromText( $wgContLang->$fold( $term ) );
				# Title::newFromText() yields null for an invalid title;
				# skip such a variant instead of calling a method on null.
				if ( $title && $title->exists() ) {
					return $title;
				}
			}

			# Give hooks a chance at better match variants
			$title = null;
			if ( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
				return $title;
			}
		}

		$title = Title::newFromText( $searchterm );
		if ( is_null( $title ) ) {
			# Cannot normally happen (the loop above returns for an invalid
			# search term), but never dereference null below.
			return null;
		}

		# Entering an IP address goes to the contributions page
		if ( ( $title->getNamespace() == NS_USER && User::isIP( $title->getText() ) )
			|| User::isIP( trim( $searchterm ) ) ) {
			return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
		}

		# Entering a user goes to the user page whether it's there or not
		if ( $title->getNamespace() == NS_USER ) {
			return $title;
		}

		# Go to images that exist even if there's no local page.
		# There may have been a funny upload, or it may be on a shared
		# file repository such as Wikimedia Commons.
		if ( $title->getNamespace() == NS_IMAGE ) {
			$image = wfFindFile( $title );
			if ( $image ) {
				return $title;
			}
		}

		# MediaWiki namespace? Page may be "implied" if not customized.
		# Just return it, with caps forced as the message system likes it.
		if ( $title->getNamespace() == NS_MEDIAWIKI ) {
			return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
		}

		# Quoted term? Try without the quotes...
		$matches = array();
		if ( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
			return self::getNearMatch( $matches[1] );
		}

		return null;
	}

	/**
	 * Characters allowed in a search string, written as the inside of a
	 * regular expression character class (see filter()).
	 *
	 * @return string
	 */
	public static function legalSearchChars() {
		return "A-Za-z_'0-9\\x80-\\xFF\\-";
	}

	/**
	 * Set the maximum number of results to return
	 * and how many to skip before returning the first.
	 *
	 * @param int $limit
	 * @param int $offset
	 */
	public function setLimitOffset( $limit, $offset = 0 ) {
		$this->limit = intval( $limit );
		$this->offset = intval( $offset );
	}

	/**
	 * Set which namespaces the search should include.
	 * Give an array of namespace index numbers.
	 *
	 * @param array $namespaces
	 */
	public function setNamespaces( $namespaces ) {
		$this->namespaces = $namespaces;
	}

	/**
	 * Parse some common prefixes: all (search everything)
	 * or namespace names.
	 *
	 * As a side effect $this->namespaces is set to null for the "all"
	 * keyword, or to the single namespace named by the prefix.
	 *
	 * @param string $query
	 * @return string the query with a recognized prefix removed, or the
	 *   unchanged query if there was no such prefix or nothing but the prefix
	 */
	public function replacePrefixes( $query ) {
		global $wgContLang;

		$colon = strpos( $query, ':' );
		if ( $colon === false ) {
			return $query; // nothing to do
		}

		$parsed = $query;
		$allkeyword = wfMsgForContent( 'searchall' ) . ':';
		if ( strncmp( $query, $allkeyword, strlen( $allkeyword ) ) == 0 ) {
			$this->namespaces = null;
			$parsed = substr( $query, strlen( $allkeyword ) );
		} else {
			$prefix = substr( $query, 0, $colon );
			$index = $wgContLang->getNsIndex( $prefix );
			if ( $index !== false ) {
				$this->namespaces = array( $index );
				$parsed = substr( $query, $colon + 1 );
			}
		}

		if ( trim( $parsed ) == '' ) {
			return $query; // prefix was the whole query
		}

		return $parsed;
	}

	/**
	 * Make a list of searchable namespaces and their canonical names.
	 * Only namespaces with an index of NS_MAIN or above are included.
	 *
	 * @return array namespace index => name
	 */
	public static function searchableNamespaces() {
		global $wgContLang;
		$arr = array();
		foreach ( $wgContLang->getNamespaces() as $ns => $name ) {
			if ( $ns >= NS_MAIN ) {
				$arr[$ns] = $name;
			}
		}
		return $arr;
	}

	/**
	 * Extract default namespaces to search from the given user's
	 * settings (the 'searchNs<index>' options), returning a list of
	 * index numbers.
	 *
	 * @param User $user
	 * @return array
	 */
	public static function userNamespaces( &$user ) {
		$arr = array();
		foreach ( self::searchableNamespaces() as $ns => $name ) {
			if ( $user->getOption( 'searchNs' . $ns ) ) {
				$arr[] = $ns;
			}
		}
		return $arr;
	}

	/**
	 * Find snippet highlight settings for a given user
	 *
	 * @param User $user
	 * @return array contextlines, contextchars
	 */
	public static function userHighlightPrefs( &$user ) {
		// The user's 'contextlines' option is deliberately ignored and the
		// value hardcoded. Old defaults sucked. :)
		$contextlines = 2;
		$contextchars = $user->getOption( 'contextchars', 50 );
		return array( $contextlines, $contextchars );
	}

	/**
	 * An array of namespaces indexes to be searched by default, i.e. the
	 * keys of $wgNamespacesToBeSearchedDefault whose value is true.
	 *
	 * @return array
	 */
	public static function defaultNamespaces() {
		global $wgNamespacesToBeSearchedDefault;

		return array_keys( $wgNamespacesToBeSearchedDefault, true );
	}

	/**
	 * Return a 'cleaned up' search string: every character outside
	 * legalSearchChars() is replaced by a space and the result is trimmed.
	 *
	 * @param string $text
	 * @return string
	 */
	public function filter( $text ) {
		// Called through $this so that a subclass overriding
		// legalSearchChars() is honoured.
		$lc = $this->legalSearchChars();
		return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
	}

	/**
	 * Load up the appropriate search engine class for the currently
	 * active database backend, and return a configured instance.
	 *
	 * $wgSearchType, when set, takes precedence over the backend chosen
	 * from $wgDBtype.
	 *
	 * @return SearchEngine
	 */
	public static function create() {
		global $wgDBtype, $wgSearchType;
		if ( $wgSearchType ) {
			$class = $wgSearchType;
		} elseif ( $wgDBtype == 'mysql' ) {
			$class = 'SearchMySQL';
		} elseif ( $wgDBtype == 'postgres' ) {
			$class = 'SearchPostgres';
		} elseif ( $wgDBtype == 'oracle' ) {
			$class = 'SearchOracle';
		} else {
			$class = 'SearchEngineDummy';
		}
		$search = new $class( wfGetDB( DB_SLAVE ) );
		$search->setLimitOffset( 0, 0 );
		return $search;
	}

	/**
	 * Create or update the search index record for the given page.
	 * Title and text should be pre-processed.
	 *
	 * @param int $id
	 * @param string $title
	 * @param string $text
	 * @abstract
	 */
	public function update( $id, $title, $text ) {
		// no-op
	}

	/**
	 * Update a search index record's title only.
	 * Title should be pre-processed.
	 *
	 * @param int $id
	 * @param string $title
	 * @abstract
	 */
	public function updateTitle( $id, $title ) {
		// no-op
	}

	/**
	 * Get OpenSearch suggestion template: $wgOpenSearchTemplate if set,
	 * otherwise an api.php URL restricted to the default namespaces.
	 *
	 * @return string
	 */
	public static function getOpenSearchTemplate() {
		global $wgOpenSearchTemplate, $wgServer, $wgScriptPath;
		if ( $wgOpenSearchTemplate ) {
			return $wgOpenSearchTemplate;
		}

		$ns = implode( ',', self::defaultNamespaces() );
		if ( !$ns ) {
			// No default namespaces: fall back to namespace 0
			$ns = "0";
		}
		return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace=' . $ns;
	}

	/**
	 * Get internal MediaWiki Suggest template: $wgMWSuggestTemplate if set,
	 * otherwise an api.php URL with a {namespaces} placeholder.
	 *
	 * @return string
	 */
	public static function getMWSuggestTemplate() {
		global $wgMWSuggestTemplate, $wgServer, $wgScriptPath;
		if ( $wgMWSuggestTemplate ) {
			return $wgMWSuggestTemplate;
		}
		return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}';
	}
}
360
361
362 /**
363 * @addtogroup Search
364 */
class SearchResultSet {
	/**
	 * Regular expression fragments that match the search terms, as this
	 * engine parsed them, inside a text extract.
	 *
	 * The base implementation knows no terms.
	 *
	 * @return array
	 * @abstract
	 */
	public function termMatches() {
		return array();
	}

	/**
	 * Number of rows in this result set; always 0 in the base class.
	 *
	 * @return int
	 */
	public function numRows() {
		return 0;
	}

	/**
	 * Tell whether this result set actually contains results.
	 *
	 * @return bool
	 * @abstract
	 */
	public function hasResults() {
		return false;
	}

	/**
	 * Total hit count for the query over the entire article database,
	 * for search modes that provide one. The number may include pages in
	 * namespaces that the current settings would not match.
	 *
	 * @return int null when no total hits number is supported
	 */
	public function getTotalHits() {
		return null;
	}

	/**
	 * Tell whether the search mode came up with a suggested alternate
	 * term, which some modes do when there are no exact hits.
	 *
	 * @return bool
	 */
	public function hasSuggestion() {
		return false;
	}

	/**
	 * @return string the suggested query, null if there is none
	 */
	public function getSuggestionQuery() {
		return null;
	}

	/**
	 * @return string the suggested query with highlighting, '' if there is none
	 */
	public function getSuggestionSnippet() {
		return '';
	}

	/**
	 * Describe how and from where the results were fetched; meant for
	 * diagnostics and debugging.
	 *
	 * @return string
	 */
	public function getInfo() {
		return null;
	}

	/**
	 * Result set of the hits found on the other (multiple) wikis
	 * associated with this one.
	 *
	 * @return SearchResultSet
	 */
	public function getInterwikiResults() {
		return null;
	}

	/**
	 * Tell whether getInterwikiResults() has anything to offer.
	 *
	 * @return boolean
	 */
	public function hasInterwikiResults() {
		$interwiki = $this->getInterwikiResults();
		return $interwiki != null;
	}

	/**
	 * Fetch the next search result.
	 *
	 * @return SearchResult false once there are no more results
	 * @abstract
	 */
	public function next() {
		return false;
	}

	/**
	 * Release the result set, where that applies. Nothing to do here.
	 */
	public function free() {
		// ...
	}
}
478
479
480 /**
481 * @addtogroup Search
482 */
class SearchResultTooMany {
	/*
	 * Intentionally empty marker class: some search engines may bail out
	 * if too many matches are found.
	 */
}
486
487
488 /**
489 * @addtogroup Search
490 */
class SearchResult {
	/** Revision fetched for the title by the constructor; stays null if none was found. */
	public $mRevision = null;
	/** Title built by the constructor from the result row. */
	public $mTitle = null;
	/** Revision text, loaded on first use by initText(). */
	public $mText = null;

	/**
	 * Build a result from a database row.
	 *
	 * Declared as __construct() because a method merely named after the
	 * class is no longer treated as a constructor by PHP 8.
	 *
	 * @param object $row row providing page_namespace and page_title
	 */
	public function __construct( $row ) {
		$this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
		if ( !is_null( $this->mTitle ) ) {
			$this->mRevision = Revision::newFromTitle( $this->mTitle );
		}
	}

	/**
	 * Check if this result points to an invalid title
	 *
	 * @return boolean
	 */
	public function isBrokenTitle() {
		return is_null( $this->mTitle );
	}

	/**
	 * Check if target page is missing, happens when index is out of date
	 *
	 * @return boolean
	 */
	public function isMissingRevision() {
		return !$this->mRevision;
	}

	/**
	 * @return Title
	 */
	public function getTitle() {
		return $this->mTitle;
	}

	/**
	 * @return double or null if not supported
	 */
	public function getScore() {
		return null;
	}

	/**
	 * Lazy initialization of article text from DB.
	 * A result without a revision (see isMissingRevision()) gets an empty
	 * text instead of triggering a method call on null.
	 */
	protected function initText() {
		if ( !isset( $this->mText ) ) {
			$this->mText = $this->mRevision ? $this->mRevision->getText() : '';
		}
	}

	/**
	 * @param array $terms terms to highlight
	 * @return string highlighted text snippet, null (and not '') if not supported
	 */
	public function getTextSnippet( $terms ) {
		global $wgUser;
		$this->initText();
		list( $contextlines, $contextchars ) = SearchEngine::userHighlightPrefs( $wgUser );
		return $this->extractText( $this->mText, $terms, $contextlines, $contextchars );
	}

	/**
	 * Default implementation of snippet extraction.
	 *
	 * Scans the text line by line and, for up to $contextlines lines that
	 * contain one of the terms, emits the match with its surrounding
	 * context as HTML, one snippet per output line.
	 *
	 * @param string $text
	 * @param array $terms regular expression fragments to look for
	 * @param int $contextlines maximum number of matching lines to include
	 * @param int $contextchars length passed to the content language's
	 *   truncate() for the context on either side of the match
	 * @return string HTML
	 */
	protected function extractText( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;
		$fname = __METHOD__;

		$lines = explode( "\n", $text );

		$terms = implode( '|', $terms );
		// The terms end up between '/' delimiters, so escape that character
		$terms = str_replace( '/', "\\/", $terms );
		$max = intval( $contextchars ) + 1;
		$pat1 = "/(.*)($terms)(.{0,$max})/i";
		$pat2 = "/($terms)/i";

		$extract = "";
		wfProfileIn( "$fname-extract" );
		foreach ( $lines as $line ) {
			if ( 0 == $contextlines ) {
				break;
			}
			$m = array();
			if ( !preg_match( $pat1, $line, $m ) ) {
				continue;
			}
			--$contextlines;
			$pre = $wgContLang->truncate( $m[1], -$contextchars, ' ... ' );
			$post = isset( $m[3] )
				? $wgContLang->truncate( $m[3], $contextchars, ' ... ' )
				: '';
			$found = $m[2];

			$extract .= $this->highlightTerms( $pre . $found . $post, $pat2 ) . "\n";
		}
		wfProfileOut( "$fname-extract" );

		return $extract;
	}

	/**
	 * HTML-escape a raw snippet and wrap every match of the pattern in a
	 * 'searchmatch' span.
	 *
	 * Matching is done on the raw text and each piece is escaped on its
	 * own, so a term can never match inside an entity produced by the
	 * escaping (e.g. "amp" inside "&amp;"). Empty matches are ignored, so
	 * an empty term list does not produce empty spans.
	 *
	 * @param string $text raw (unescaped) snippet
	 * @param string $pattern complete regular expression matching the terms
	 * @return string HTML
	 */
	private function highlightTerms( $text, $pattern ) {
		$hits = array();
		if ( !preg_match_all( $pattern, $text, $hits, PREG_OFFSET_CAPTURE ) ) {
			return htmlspecialchars( $text );
		}

		$html = '';
		$pos = 0;
		foreach ( $hits[0] as $hit ) {
			list( $matched, $start ) = $hit;
			if ( $matched === '' ) {
				continue;
			}
			$html .= htmlspecialchars( substr( $text, $pos, $start - $pos ) );
			$html .= "<span class='searchmatch'>" . htmlspecialchars( $matched ) . "</span>";
			$pos = $start + strlen( $matched );
		}

		return $html . htmlspecialchars( substr( $text, $pos ) );
	}

	/**
	 * @param array $terms terms to highlight
	 * @return string highlighted title, '' if not supported
	 */
	public function getTitleSnippet( $terms ) {
		return '';
	}

	/**
	 * @param array $terms terms to highlight
	 * @return string highlighted redirect name (redirect to this page), '' if none or not supported
	 */
	public function getRedirectSnippet( $terms ) {
		return '';
	}

	/**
	 * @return Title object for the redirect to this page, null if none or not supported
	 */
	public function getRedirectTitle() {
		return null;
	}

	/**
	 * @return string highlighted relevant section name, '' if none or not supported
	 */
	public function getSectionSnippet() {
		return '';
	}

	/**
	 * @return Title object (pagename+fragment) for the section, null if none or not supported
	 */
	public function getSectionTitle() {
		return null;
	}

	/**
	 * @return string timestamp of the revision, '' if the revision is missing
	 */
	public function getTimestamp() {
		return $this->mRevision ? $this->mRevision->getTimestamp() : '';
	}

	/**
	 * @return int number of words
	 */
	public function getWordCount() {
		$this->initText();
		return str_word_count( $this->mText );
	}

	/**
	 * @return int size in bytes
	 */
	public function getByteSize() {
		$this->initText();
		return strlen( $this->mText );
	}

	/**
	 * @return boolean if hit has related articles
	 */
	public function hasRelated() {
		return false;
	}

	/**
	 * @return interwiki prefix of the title (return iw even if title is broken)
	 */
	public function getInterwikiPrefix() {
		return '';
	}
}
689
690 /**
691 * @addtogroup Search
692 */
/*
 * No-op search backend, used by SearchEngine::create() when no real
 * backend matches the configuration.
 *
 * It does not extend SearchEngine, so it has to provide every method that
 * users of a search engine instance may call. All methods accept and
 * ignore whatever arguments they are given.
 */
class SearchEngineDummy {
	/** Searching is not supported: always null. */
	function search( $term ) {
		return null;
	}

	/** Ignored; the offset is optional, as in SearchEngine::setLimitOffset(). */
	function setLimitOffset( $l, $o = 0 ) {}

	/** No legal characters are defined: returns null. */
	function legalSearchChars() {}

	/** Index updates are silently dropped. */
	function update() {}

	/** Title-only index updates are silently dropped. */
	function updateTitle() {}

	/** Ignored. */
	function setNamespaces() {}

	/** Title searches are not supported: returns null. */
	function searchTitle() {}

	/** Text searches are not supported: returns null. */
	function searchText() {}

	/** No prefixes are recognized: the query is returned unchanged. */
	function replacePrefixes( $query ) {
		return $query;
	}

	/** Nothing to filter against: the text is only trimmed. */
	function filter( $text ) {
		return trim( $text );
	}
}