5f93ba8a54b7e6d492fd0385bbae8694cb151d40
[lhc/web/wiklou.git] / includes / SearchEngine.php
1 <?php
/**
 * Contains the site search engine class.
 * See search.doc
 */
6
/**
 * Status values returned by the SearchEngine::parseQuery*() methods:
 * MW_SEARCH_OK when search conditions were built, MW_SEARCH_BAD_QUERY
 * when the query contained nothing that can be searched for.
 */
define( "MW_SEARCH_BAD_QUERY", false );
define( "MW_SEARCH_OK", true );
12
/**
 * Search front end: keeps the user's query, turns it into SQL conditions
 * for the searchindex table and renders the result page together with the
 * "power search" form.
 */
class SearchEngine {
	# rawText: the query as typed (trimmed); filteredText: the query with
	# characters outside the legal set replaced by blanks; searchTerms:
	# regular expression fragments used to highlight hits.
	/* private */ var $rawText, $filteredText, $searchTerms;
	# SQL conditions for the title and the text search, set by parseQuery*().
	/* private */ var $titleCond, $textCond;

	# Both are assigned in the constructor and used by the other methods;
	# declared here so they are not created as undeclared properties.
	/* private */ var $db;
	/* private */ var $strictMatching;

	# When false, redirects are excluded from the results.
	var $doSearchRedirects = true;
	# Extra query-string parameters (name => value) for the prev/next links.
	var $addToQuery = array();
	# Namespace indexes the search is restricted to.
	var $namespacesToSearch = array();
	# Not referenced anywhere in this file.
	var $alternateTitle;
	# Per-request cache used by getTitlesByLength(): false until loaded,
	# afterwards an array indexed [length][namespace] => list of titles.
	var $allTitles = false;
25
/**
 * Constructor: store the query and a filtered copy that is safe to echo
 * back to the user and to parse.
 *
 * @param string $text  the search query as entered
 */
function SearchEngine( $text ) {
	global $wgDBmysql4;

	# We display the query, so everything outside the legal set is blanked.
	# The boolean-mode operators are only let through when MySQL 4 is used.
	$allowed = SearchEngine::legalSearchChars() . '()';
	if( $wgDBmysql4 ) {
		$allowed .= '"~<>*+-';
	}
	$illegal = '/[^' . $allowed . ']/';

	$this->rawText = trim( $text );
	$this->filteredText = trim( preg_replace( $illegal, ' ', $text ) );
	$this->searchTerms = array();

	# Google-style, add '+' on all terms
	$this->strictMatching = true;

	$this->db =& wfGetDB( DB_SLAVE );
}
42
/**
 * Build the partial WHERE clause that limits the search to the namespaces
 * listed in $this->namespacesToSearch. Namespace 0 is used when the list
 * is empty.
 *
 * @return string  SQL fragment starting with "AND"
 */
function queryNamespaces() {
	$list = implode( ',', $this->namespacesToSearch );
	return 'AND cur_namespace IN (' . ( $list == '' ? '0' : $list ) . ')';
}
53
/**
 * Build the partial WHERE clause that excludes redirects from the results.
 *
 * @return string  empty when redirects are to be searched, otherwise an
 *                 SQL fragment starting with "AND"
 */
function searchRedirects() {
	if ( !$this->doSearchRedirects ) {
		return 'AND cur_is_redirect=0 ';
	}
	return '';
}
64
/**
 * Initial state of the power search checkbox for one namespace.
 *
 * @param int $i  namespace index
 * @return mixed  the user's 'searchNs<i>' option for logged-in users,
 *                otherwise whether the site default enables the namespace
 * @access private
 */
function initNamespaceCheckbox( $i ) {
	global $wgUser, $wgNamespacesToBeSearchedDefault;

	if ( !$wgUser->getID() ) {
		// Not logged in: fall back to the global default namespaces
		return !empty( $wgNamespacesToBeSearchedDefault[ $i ] );
	}
	// Logged in: use the namespaces stored in the user's options
	return $wgUser->getOption( 'searchNs' . $i );
}
78
/**
 * Build the "power search" footer form. Does not actually perform the
 * search, that is done by showResults().
 *
 * Side effects: appends the ticked namespaces to $this->namespacesToSearch,
 * records the form state in $this->addToQuery and clears
 * $this->doSearchRedirects when the form was submitted without the
 * "list redirects" box ticked.
 *
 * @return string  HTML of the form
 */
function powersearch() {
	global $wgUser, $wgLang, $wgRequest;
	$sk =& $wgUser->getSkin();

	$searchx = $wgRequest->getVal( 'searchx' );
	$listredirs = $wgRequest->getVal( 'redirs' );

	if( isset( $_REQUEST['searchx'] ) ) {
		$this->addToQuery['searchx'] = '1';
	}

	# Namespace checkboxes
	$nsBoxes = '';
	$namespaces = $wgLang->getNamespaces();
	foreach ( $namespaces as $i => $namespace ) {
		# Skip virtual namespaces
		if ( $i < 0 ) {
			continue;
		}

		# Initialise the checkbox either from the defaults or from
		# a previous invocation of the form
		if ( !isset( $searchx ) ) {
			$checkboxValue = $this->initNamespaceCheckbox( $i );
		} else {
			$checkboxValue = $wgRequest->getVal( 'ns' . $i );
		}

		$checked = '';
		if ( $checkboxValue == 1 ) {
			$checked = ' checked="checked"';
			$this->addToQuery['ns'.$i] = 1;
			array_push( $this->namespacesToSearch, $i );
		}
		$name = str_replace( '_', ' ', $namespaces[$i] );
		if ( '' == $name ) {
			$name = wfMsg( 'blanknamespace' );
		}

		if ( $nsBoxes !== '' ) {
			$nsBoxes .= ' ';
		}
		$nsBoxes .= "<input type='checkbox' value=\"1\" name=\"" .
			"ns{$i}\"{$checked} />{$name}\n";
	}

	# List redirects checkbox
	$checked = '';
	if ( $listredirs == 1 ) {
		$this->addToQuery['redirs'] = 1;
		$checked = ' checked="checked"';
	}
	$redirBox = "<input type='checkbox' value='1' name=\"redirs\"{$checked} />\n";

	# Search field
	$searchField = "<input type='text' name=\"search\" value=\"" .
		htmlspecialchars( $this->rawText ) ."\" width=\"80\" />\n";

	# Searchx button
	$submit = '<input type="submit" name="searchx" value="' .
		wfMsg('powersearch') . "\" />\n";

	# Substitute all placeholders in a single pass. Replacing them one
	# after another would rescan text that was just inserted, so a query
	# such as "$9" typed by the user would itself be replaced by the
	# submit button markup inside the value attribute.
	$ret = strtr( wfMsg( 'powersearchtext' ), array(
		'$1' => $nsBoxes,
		'$2' => $redirBox,
		'$3' => $searchField,
		'$9' => $submit
	) );

	$action = $sk->escapeSearchLink();
	$ret = "<br /><br />\n<form id=\"powersearch\" method=\"get\" " .
		"action=\"$action\">\n{$ret}\n</form>\n";

	# A submitted form without the redirects box means "hide redirects"
	if ( isset ( $searchx ) ) {
		if ( ! $listredirs ) {
			$this->doSearchRedirects = false;
		}
	}
	return $ret;
}
168
/**
 * Set title, subtitle and robot policy of the output page for a search
 * result listing. The subtitle shows the HTML-escaped raw query.
 */
function setupPage() {
	global $wgOut;

	$escapedQuery = htmlspecialchars( $this->rawText );

	$wgOut->setPageTitle( wfMsg( 'searchresults' ) );
	$wgOut->setSubtitle( wfMsg( 'searchquery', $escapedQuery ) );
	$wgOut->setArticleRelated( false );
	$wgOut->setRobotpolicy( 'noindex,nofollow' );
}
176
/**
 * Perform the search and construct the results page.
 *
 * Output order: intro text, "showing results" line, prev/next links,
 * title matches, text matches, prev/next links again and finally the
 * power search form. A query that cannot be parsed, or a disabled text
 * search, ends the page early with the corresponding message.
 */
function showResults() {
	global $wgOut, $wgDisableTextSearch, $wgInputEncoding;

	# Build the power search form first: besides returning the HTML it
	# fills in namespacesToSearch, addToQuery and doSearchRedirects, which
	# are read when the queries below are assembled.
	$powersearch = $this->powersearch();

	$this->setupPage();
	$wgOut->addWikiText( wfMsg( 'searchresulttext' ) );

	if ( !$this->parseQuery() ) {
		$wgOut->addWikiText(
			'==' . wfMsg( 'badquery' ) . "==\n" .
			wfMsg( 'badquerytext' ) );
		return;
	}
	list( $limit, $offset ) = wfCheckLimits( 20, 'searchlimit' );

	if ( $wgDisableTextSearch ) {
		$wgOut->addHTML( wfMsg( 'searchdisabled' ) );
		$wgOut->addHTML( wfMsg( 'googlesearch',
			htmlspecialchars( $this->rawText ),
			htmlspecialchars( $wgInputEncoding ) ) );
		return;
	}

	$titleMatches = $this->getMatches( $this->titleCond, $limit, $offset );
	$textMatches = $this->getMatches( $this->textCond, $limit, $offset );

	$num = count( $titleMatches ) + count( $textMatches );
	if ( $num >= $limit ) {
		$top = wfShowingResults( $offset, $limit );
	} else {
		$top = wfShowingResultsNum( $offset, $limit, $num );
	}
	$wgOut->addHTML( "<p>{$top}</p>\n" );

	# Carry the power search settings over to the prev/next links
	$a2l = '';
	foreach ( $this->addToQuery as $key => $value ) {
		$a2l .= "&{$key}={$value}";
	}

	$prevnext = wfViewPrevNext( $offset, $limit, '',
		'search=' . wfUrlencode( $this->filteredText ) . $a2l );
	$wgOut->addHTML( "<br />{$prevnext}\n" );

	# Both sections must always be rendered. Combining the two calls with
	# '||' would skip the text matches whenever title matches exist.
	$foundTitles = $this->showMatches( $titleMatches, $offset, 'notitlematches', 'titlematches' );
	$foundText = $this->showMatches( $textMatches, $offset, 'notextmatches', 'textmatches' );

	if ( !$foundTitles && !$foundText ) {
		$wgOut->addWikiText( wfMsg( 'nonefound' ) );
	}
	$wgOut->addHTML( "<p>{$prevnext}</p>\n" );
	$wgOut->addHTML( $powersearch );
}
243
/**
 * Characters that may appear in a search word, written as the inside of
 * a PCRE character class (without the surrounding brackets): ASCII
 * letters and digits, underscore, apostrophe, hyphen and the bytes
 * 0x80-0xFF.
 *
 * @return string
 */
function legalSearchChars() {
	return "A-Za-z_'0-9\\x80-\\xFF\\-";
}
248
/**
 * Parse the filtered query into SQL conditions, choosing the parser that
 * fits the configured database version.
 *
 * @return bool  MW_SEARCH_OK or MW_SEARCH_BAD_QUERY
 */
function parseQuery() {
	global $wgDBmysql4;

	# MySQL 4 gets the cleaner boolean search; anything else falls back
	# to the ugly hack with multiple search clauses.
	return $wgDBmysql4 ? $this->parseQuery4() : $this->parseQuery3();
}
259
/**
 * Query parser for databases without boolean full-text mode.
 *
 * Every usable word becomes its own MATCH ... AGAINST clause; the words
 * "and", "or", "not" and parentheses are passed through as SQL operators.
 * Words shorter than $wgDBminWordLen and words on the full-text stop list
 * are dropped. Fills $this->searchTerms, $this->titleCond and
 * $this->textCond.
 *
 * @return bool  MW_SEARCH_BAD_QUERY when no searchable word is left,
 *               MW_SEARCH_OK otherwise
 */
function parseQuery3() {
	global $wgDBminWordLen, $wgLang;

	# on non mysql4 database: get list of words we don't want to search for
	require_once( 'FulltextStoplist.php' );

	# Make parentheses stand-alone tokens and squeeze runs of whitespace
	$query = preg_replace( "/([()])/", " \\1 ", $this->filteredText );
	$query = preg_replace( "/\\s+/", " ", $query );
	$tokens = explode( ' ', trim( $query ) );

	$operators = array( 'and', 'or', 'not', '(', ')' );
	$cond = '';
	$previous = '';
	foreach ( $tokens as $token ) {
		$token = $wgLang->stripForSearch( $token );

		if ( in_array( $token, $operators ) ) {
			$cond .= ' ' . strtoupper( $token );
			$previous = '';
			continue;
		}
		if ( strlen( $token ) < $wgDBminWordLen || FulltextStoplist::inList( $token ) ) {
			continue;
		}

		# Two words in a row are implicitly joined with AND
		if ( '' != $previous ) {
			$cond .= ' AND';
		}
		$cond .= " (MATCH (##field##) AGAINST ('" .
			$this->db->strencode( $token ). "'))";
		$previous = $token;
		$this->searchTerms[] = "\\b" . $token . "\\b";
	}

	if ( count( $this->searchTerms ) == 0 ) {
		return MW_SEARCH_BAD_QUERY;
	}

	# The same condition is used twice, once per indexed column
	$titleClause = str_replace( '##field##', 'si_title', $cond );
	$textClause = str_replace( '##field##', 'si_text', $cond );

	$this->titleCond = '(' . $titleClause . ' )';
	$this->textCond = '(' . $textClause . ' AND (cur_is_redirect=0) )';

	return MW_SEARCH_OK;
}
302
/**
 * Query parser for MySQL 4 boolean full-text mode.
 *
 * Recognises words (optionally followed by '*') and double-quoted
 * phrases, each optionally preceded by one of the operators - + < > ~.
 * With strictMatching enabled, terms without an operator get a '+'.
 * Fills $this->searchTerms with highlight patterns and sets
 * $this->titleCond and $this->textCond.
 *
 * @return bool  MW_SEARCH_BAD_QUERY when nothing in the query could be
 *               parsed (same contract as parseQuery3()), MW_SEARCH_OK
 *               otherwise
 */
function parseQuery4() {
	global $wgLang;
	$lc = SearchEngine::legalSearchChars();
	$searchon = '';
	$this->searchTerms = array();

	# FIXME: This doesn't handle parenthetical expressions.
	if( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/',
		  $this->filteredText, $m, PREG_SET_ORDER ) ) {
		foreach( $m as $terms ) {
			if( $searchon !== '' ) $searchon .= ' ';

			$modifier = $terms[1];
			if( $this->strictMatching && ( $modifier == '' ) ) {
				$modifier = '+';
			}
			$searchon .= $modifier . $wgLang->stripForSearch( $terms[2] );

			# Group 3 is only present for plain words. Compare against ''
			# explicitly: empty() would also be true for the word "0".
			if( isset( $terms[3] ) && $terms[3] !== '' ) {
				$regexp = preg_quote( $terms[3], '/' );
				# Trailing '*': also highlight the rest of the word
				if( !empty( $terms[4] ) ) $regexp .= "[0-9A-Za-z_]+";
			} else {
				# Quoted phrase: highlight it without the quotes
				$regexp = preg_quote( str_replace( '"', '', $terms[2] ), '/' );
			}
			$this->searchTerms[] = $regexp;
		}
		wfDebug( "Would search with '$searchon'\n" );
		wfDebug( "Match with /\b" . implode( '\b|\b', $this->searchTerms ) . "\b/\n" );
	} else {
		wfDebug( "Can't understand search query '{$this->filteredText}'\n" );
	}

	if( $searchon === '' ) {
		# Nothing searchable: report it instead of running an empty query
		return MW_SEARCH_BAD_QUERY;
	}

	$searchon = $this->db->strencode( $searchon );
	$this->titleCond = " MATCH(si_title) AGAINST('$searchon' IN BOOLEAN MODE)";
	$this->textCond = " (MATCH(si_text) AGAINST('$searchon' IN BOOLEAN MODE) AND cur_is_redirect=0)";
	return MW_SEARCH_OK;
}
337
/**
 * Run one search query and collect the resulting rows.
 *
 * @param string $cond    SQL condition, as built by parseQuery*()
 * @param int    $limit   maximum number of rows
 * @param int    $offset  number of rows to skip
 * @return array  row objects with cur_id, cur_namespace, cur_title and
 *                cur_text
 */
function &getMatches( $cond, $limit, $offset = 0 ) {
	$searchindex = $this->db->tableName( 'searchindex' );
	$cur = $this->db->tableName( 'cur' );

	$sql = "SELECT cur_id,cur_namespace,cur_title,cur_text " .
		"FROM {$cur},{$searchindex} " .
		"WHERE cur_id=si_page AND {$cond} " .
		$this->queryNamespaces() . ' ' .
		$this->searchRedirects() . ' ' .
		$this->db->limitResult( $limit, $offset );

	$res = $this->db->query( $sql, 'SearchEngine::getMatches' );

	$matches = array();
	for ( $row = $this->db->fetchObject( $res ); $row; $row = $this->db->fetchObject( $res ) ) {
		$matches[] = $row;
	}
	$this->db->freeResult( $res );

	return $matches;
}
359
/**
 * Output one section of the result page: a heading followed by a
 * numbered list of hits, or only the "nothing found" heading.
 *
 * @param array  $matches   rows as returned by getMatches()
 * @param int    $offset    offset of the first row; numbering starts at
 *                          $offset + 1
 * @param string $msgEmpty  message key for the heading without hits
 * @param string $msgFound  message key for the heading with hits
 * @return bool  true when at least one hit was listed
 */
function showMatches( &$matches, $offset, $msgEmpty, $msgFound ) {
	global $wgOut;

	if ( count( $matches ) == 0 ) {
		$wgOut->addHTML( '<h2>' . wfMsg( $msgEmpty ) . "</h2>\n" );
		return false;
	}

	$first = $offset + 1;
	$wgOut->addHTML( '<h2>' . wfMsg( $msgFound ) . "</h2>\n<ol start='{$first}'>" );
	foreach( $matches as $row ) {
		$this->showHit( $row );
	}
	$wgOut->addHTML( "</ol>\n" );

	return true;
}
378
/**
 * Output one search hit as a list item: a link to the page, its size and
 * up to 'contextlines' lines of text containing a search term, with the
 * terms highlighted.
 *
 * @param object $row  result row with cur_namespace, cur_title, cur_text
 */
function showHit( $row ) {
	global $wgUser, $wgOut, $wgLang;

	$t = Title::makeName( $row->cur_namespace, $row->cur_title );
	if( is_null( $t ) ) {
		$wgOut->addHTML( "<!-- Broken link in search result -->\n" );
		return;
	}
	$sk = $wgUser->getSkin();

	$contextlines = $wgUser->getOption( 'contextlines' );
	if ( '' == $contextlines ) { $contextlines = 5; }
	$contextchars = $wgUser->getOption( 'contextchars' );
	if ( '' == $contextchars ) { $contextchars = 50; }

	$link = $sk->makeKnownLink( $t, '' );
	$size = wfMsg( 'nbytes', strlen( $row->cur_text ) );
	$wgOut->addHTML( "<li>{$link} ({$size})" );

	$lines = explode( "\n", $row->cur_text );
	$terms = implode( '|', $this->searchTerms );
	$pat1 = "/(.*)(" . $terms . ")(.*)/i";
	$pat2 = '/(' . $terms . ")/i";
	$lineno = 0;

	foreach ( $lines as $line ) {
		if ( 0 == $contextlines ) {
			break;
		}
		++$lineno;
		if ( ! preg_match( $pat1, $line, $m ) ) {
			continue;
		}
		# Only lines that are actually shown count against the limit;
		# otherwise just the first few lines of the page would be scanned.
		--$contextlines;

		$pre = $wgLang->truncate( $m[1], -$contextchars, '...' );

		if ( count( $m ) < 3 ) {
			$post = '';
		} else {
			$post = $wgLang->truncate( $m[3], $contextchars, '...' );
		}

		$found = $m[2];

		# Split on the search terms before escaping and escape every piece
		# on its own. Highlighting the already escaped text would match
		# inside entities such as &amp; or &quot; and break them.
		# The pattern has a single capture group (the search terms are not
		# expected to contain groups of their own), so the odd-numbered
		# pieces are the matched terms.
		$context = $pre . $found . $post;
		$pieces = preg_split( $pat2, $context, -1, PREG_SPLIT_DELIM_CAPTURE );
		if ( !is_array( $pieces ) ) {
			# Pattern could not be applied: show the line without highlight
			$pieces = array( $context );
		}
		$line = '';
		foreach ( $pieces as $idx => $piece ) {
			$piece = htmlspecialchars( $piece );
			if ( $idx % 2 == 1 && $piece !== '' ) {
				$line .= "<span class='searchmatch'>{$piece}</span>";
			} else {
				$line .= $piece;
			}
		}

		$wgOut->addHTML( "<br /><small>{$lineno}: {$line}</small>\n" );
	}
	$wgOut->addHTML( "</li>\n" );
}
431
/**
 * Look for a page the "Go" button can jump to directly.
 *
 * Tries the query as entered, then all lower case, then with every word
 * capitalised, then all upper case. A title in the Special namespace is
 * accepted as entered without an existence check. A query that looks like
 * an IPv4 address leads to that address' contributions page.
 *
 * @return Title|null  the matching title, or NULL when nothing fits
 */
function getNearMatch() {
	$raw = $this->rawText;
	$candidates = array(
		$raw,                           # exact match
		strtolower( $raw ),             # all lower case
		ucwords( strtolower( $raw ) ),  # capitalized words
		strtoupper( $raw )              # all upper case
	);

	foreach ( $candidates as $i => $text ) {
		$title = Title::newFromText( $text );
		if ( is_null( $title ) ) {
			# Not usable as a title; calling methods on it would be fatal
			continue;
		}
		# Special pages have no article ID, accept them as entered
		if ( $i == 0 && $title->getNamespace() == NS_SPECIAL ) {
			return $title;
		}
		if ( 0 != $title->getArticleID() ) {
			return $title;
		}
	}

	# Entering an IP address goes to the contributions page
	if ( preg_match( '/^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/', $raw ) ) {
		return Title::makeTitle( NS_SPECIAL, "Contributions/" . $raw );
	}

	return NULL;
}
468
/**
 * Handle the "Go" button.
 *
 * - Query not usable as a title: show the normal search results.
 * - Exact or near match: redirect to that page.
 * - $wgGoToEdit enabled: redirect to the edit page of the entered title.
 * - Otherwise: show the "no such page" message with an edit link,
 *   followed by fuzzy title matches or, if there are none (or the fuzzy
 *   search is disabled), the normal search results.
 */
function goResult() {
	global $wgOut, $wgGoToEdit, $wgDisableFuzzySearch;

	# Try to go to page as entered.
	$t = Title::newFromText( $this->rawText );

	# If the string cannot be used to create a title
	if( is_null( $t ) ){
		$this->showResults();
		return;
	}

	# If there's an exact or very near match, jump right there.
	$match = $this->getNearMatch();
	if( !is_null( $match ) ) {
		$wgOut->redirect( $match->getFullURL() );
		return;
	}

	# No match. If the feature is enabled, go straight to the edit page
	if ( $wgGoToEdit ) {
		$wgOut->redirect( $t->getFullURL( 'action=edit' ) );
		return;
	}

	# $t is known to be a valid title here
	$editurl = $t->escapeLocalURL( 'action=edit' );
	$wgOut->addHTML( '<p>' . wfMsg('nogomatch', $editurl ) . "</p>\n" );

	# Try a fuzzy title search. The method works on $this, so it is
	# invoked on the object rather than through the class name.
	$anyhit = false;
	if( !$wgDisableFuzzySearch ){
		foreach( array(NS_MAIN, NS_PROJECT, NS_USER, NS_IMAGE, NS_MEDIAWIKI) as $namespace ){
			if( $this->doFuzzyTitleSearch( $this->rawText, $namespace ) ) {
				$anyhit = true;
			}
		}
	}

	if( !$anyhit ){
		$this->showResults();
	}
}
520
/**
 * Output up to ten titles of one namespace that closely resemble the
 * search string, each with a closeness percentage and star rating.
 *
 * Uses $this (page setup and the title cache), so it has to be called on
 * an object.
 *
 * @param string $search     the search string
 * @param int    $namespace  namespace index to look in
 * @return bool  true when at least one title was listed
 */
function doFuzzyTitleSearch( $search, $namespace ){
	global $wgLang, $wgOut;

	$this->setupPage();

	$slen = strlen( $search );
	if( $slen == 0 ) {
		# Nothing to compare against; also avoids dividing by zero below
		return false;
	}

	$sstr = str_replace( ' ', '_', ucfirst( $search ) );
	$fuzzymatches = array_slice( $this->fuzzyTitles( $sstr, $namespace ), 0, 10 );

	$nsText = $wgLang->getNsText( $namespace );
	$wikitext = '';
	foreach( $fuzzymatches as $res ){
		$distance = $res[0];
		$t = str_replace( '_', ' ', $res[1] );
		if( $namespace == NS_MAIN ) {
			$tfull = $t;
		} else {
			# The leading colon forces a plain link; without it a title in
			# the image namespace would be embedded instead of linked.
			$tfull = ":{$nsText}:{$t}|{$t}";
		}
		$closeness = ( $slen - $distance ) / $slen;
		$percent = intval( $closeness * 100 ) . '%';
		$stars = str_repeat( '*', ceil( 5 * $closeness ) );
		$wikitext .= "* [[$tfull]] $percent ($stars)\n";
	}

	if( $wikitext === '' ) {
		return false;
	}
	if( $namespace != NS_MAIN ) {
		$wikitext = '=== ' . $nsText . " ===\n" . $wikitext;
	}
	$wgOut->addWikiText( $wikitext );
	return true;
}
554
/**
 * Find titles within a small Levenshtein distance of the given string.
 *
 * Only titles whose length differs from the search string by at most
 * 10% (rounded up) are compared; a title is accepted when its distance
 * is below 35% (rounded up) of the search string's length.
 *
 * Reads the title lists through $this->getTitlesByLength(), so it has to
 * be called on an object.
 *
 * @param string $sstr       search string in title form
 * @param int    $namespace  namespace index to look in
 * @return array  list of array( distance, title ), closest first
 */
function fuzzyTitles( $sstr, $namespace = NS_MAIN ){
	$span = 0.10;       // weed on title length before doing levenshtein.
	$tolerance = 0.35;  // allowed percentage of erroneous characters
	$slen = strlen( $sstr );
	$tolerance_count = ceil( $tolerance * $slen );
	$spanabs = ceil( $slen * (1 + $span) ) - $slen;

	$result = array();
	for( $i = 0; $i <= $spanabs; $i++ ){
		$titles = $this->getTitlesByLength( $slen + $i, $namespace );
		# Shorter titles as well, as long as the length stays positive
		if( $i != 0 && $slen - $i > 0 ) {
			$titles = array_merge( $titles,
				$this->getTitlesByLength( $slen - $i, $namespace ) );
		}
		foreach( $titles as $t ){
			$d = levenshtein( $sstr, $t );
			# A negative value means levenshtein() refused the input
			# (over-long string); that must not count as a perfect hit.
			if( $d >= 0 && $d < $tolerance_count ) {
				$result[] = array( $d, $t );
			}
		}
	}
	usort( $result, 'SearchEngine_pcmp' );
	return $result;
}
582
/**
 * Return all titles of a namespace that have exactly the given length.
 *
 * Lookup order: the per-object cache $this->allTitles, then memcached.
 * Only when the memcached data is older than an hour (or missing) is the
 * whole cur table read; the result is then stored per length and
 * namespace in memcached for 24 hours and kept in $this->allTitles.
 *
 * Reads and writes $this->allTitles and uses $this->db, so it has to be
 * called on an object.
 *
 * @param int $aLength     title length in bytes
 * @param int $aNamespace  namespace index
 * @return array  list of titles, empty when there are none
 */
function getTitlesByLength( $aLength, $aNamespace = 0 ){
	global $wgMemc, $wgDBname;
	$fname = 'SearchEngine::getTitlesByLength';

	// to avoid multiple costly SELECTs in case of no memcached.
	// is_array() rather than truthiness: an empty table yields an empty
	// array, which must still count as "already loaded".
	if( is_array( $this->allTitles ) ){
		return isset( $this->allTitles[$aLength][$aNamespace] )
			? $this->allTitles[$aLength][$aNamespace]
			: array();
	}

	$mkey = "$wgDBname:titlesbylength:$aLength:$aNamespace";
	$mkeyts = "$wgDBname:titlesbylength:createtime";
	$ts = $wgMemc->get( $mkeyts );
	$result = $wgMemc->get( $mkey );

	if( time() - $ts < 3600 ){
		// note: in case of insufficient memcached space, we return
		// an empty list instead of starting to hit the DB.
		return is_array( $result ) ? $result : array();
	}

	$wgMemc->set( $mkeyts, time() );

	$res = $this->db->select( 'cur', array( 'cur_title', 'cur_namespace' ), false, $fname );
	$titles = array(); // length, ns, [titles]
	while( $obj = $this->db->fetchObject( $res ) ){
		$titles[ strlen( $obj->cur_title ) ][ $obj->cur_namespace ][] = $obj->cur_title;
	}
	$this->db->freeResult( $res );

	foreach( $titles as $length => $length_arr ){
		foreach( $length_arr as $ns => $title_arr ){
			$wgMemc->set( "$wgDBname:titlesbylength:$length:$ns", $title_arr, 3600 * 24 );
		}
	}
	$this->allTitles = $titles;

	return isset( $titles[$aLength][$aNamespace] )
		? $titles[$aLength][$aNamespace]
		: array();
}
632 }
633
/**
 * usort() callback for SearchEngine::fuzzyTitles(): orders the
 * array( distance, title ) pairs by ascending distance.
 *
 * @return int  negative, zero or positive, as usort() expects
 * @access private
 * @static
 */
function SearchEngine_pcmp( $a, $b ) {
	$left = $a[0];
	$right = $b[0];
	return $left - $right;
}
639
640 ?>