* Fixed magic quotes in $_REQUEST, in Setup.php
[lhc/web/wiklou.git] / includes / SearchEngine.php
index a95dd7f..ee38899 100644 (file)
@@ -1,4 +1,4 @@
-<?
+<?php
 # See search.doc
 
 class SearchEngine {
@@ -9,6 +9,7 @@ class SearchEngine {
        var $addtoquery = array();
        var $namespacesToSearch = array();
        var $alternateTitle;
+       var $all_titles = false;
 
        function SearchEngine( $text )
        {
@@ -19,6 +20,7 @@ class SearchEngine {
                if( $wgDBmysql4 ) $lc .= "\"~<>*+-";
                $this->mUsertext = trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
                $this->mSearchterms = array();
+               $this->mStrictMatching = true; # Google-style, add '+' on all terms
        }
 
        function queryNamespaces()
@@ -28,25 +30,25 @@ class SearchEngine {
                        $namespaces = "0";
                }
                return "AND cur_namespace IN (" . $namespaces . ")";
-               #return "1";
        }
 
        function searchRedirects()
        {
-               if ( $this->doSearchRedirects ) return "";
-               return "AND cur_is_redirect=0 ";
+               if ( $this->doSearchRedirects ) {
+                       return "";
+               } else {
+                       return "AND cur_is_redirect=0 ";
+               }
        }
 
        /* private */ function initNamespaceCheckbox( $i )
        {
                global $wgUser, $wgNamespacesToBeSearchedDefault;
                
-
                if ($wgUser->getID()) {
                        // User is logged in so we retrieve his default namespaces
                        return $wgUser->getOption( "searchNs".$i );
-               }
-               else {  
+               } else {
                        // User is not logged in so we give him the global default namespaces
                        return $wgNamespacesToBeSearchedDefault[ $i ];
                }
@@ -128,8 +130,10 @@ class SearchEngine {
                  wfMsg("powersearch") . "\">\n";
                $ret = str_replace( "$9", $tempText, $ret );
 
+               $titleObj = NULL; # this does tricky stuff
+               
                $ret = "<br><br>\n<form id=\"powersearch\" method=\"get\" " .
-                 "action=\"" . wfLocalUrl( "" ) . "\">\n{$ret}\n</form>\n";
+                 "action=\"" . $titleObj->getUrl() . "\">\n{$ret}\n</form>\n";
 
                if ( isset ( $searchx ) ) {
                        if ( ! $listredirs ) { 
@@ -146,20 +150,20 @@ class SearchEngine {
                global $wgInputEncoding;
                $fname = "SearchEngine::showResults";
 
-               $search         = $_REQUEST['search'];
+               $search = $_REQUEST['search'];
 
                $powersearch = $this->powersearch(); /* Need side-effects here? */
 
                $wgOut->setPageTitle( wfMsg( "searchresults" ) );
                $q = wfMsg( "searchquery", htmlspecialchars( $this->mUsertext ) );
                $wgOut->setSubtitle( $q );
-               $wgOut->setArticleFlag( false );
+               $wgOut->setArticleRelated( false );
                $wgOut->setRobotpolicy( "noindex,nofollow" );
 
                $sk = $wgUser->getSkin();
-               $text = wfMsg( "searchresulttext", $sk->makeKnownLink(
+               $header = wfMsg( "searchresulttext", $sk->makeKnownLink(
                  wfMsg( "searchhelppage" ), wfMsg( "searchingwikipedia" ) ) );
-               $wgOut->addHTML( $text );
+               $wgOut->addHTML( $header );
 
                $this->parseQuery();
                if ( "" == $this->mTitlecond || "" == $this->mTextcond ) {
@@ -173,7 +177,8 @@ class SearchEngine {
                $redircond = $this->searchRedirects();
 
                if ( $wgDisableTextSearch ) {
-                       $wgOut->addHTML( wfMsg( "searchdisabled", htmlspecialchars( $search ), $wgInputEncoding ) );
+                       $wgOut->addHTML( wfMsg( "searchdisabled" ) );
+                       $wgOut->addHTML( wfMsg( "googlesearch", htmlspecialchars( $search ), $GLOBALS['wgInputEncoding'] ) );
                } else {
                        $sql = "SELECT cur_id,cur_namespace,cur_title," .
                          "cur_text FROM cur,searchindex " .
@@ -184,10 +189,8 @@ class SearchEngine {
                        $num = wfNumRows($res1);
 
                        $sk = $wgUser->getSkin();
-                       $text = wfMsg( "searchresulttext", $sk->makeKnownLink(
-                         wfMsg( "searchhelppage" ), wfMsg( "searchingwikipedia" ) ) );
-                       $wgOut->addHTML( $text );
-       
+                       $text = "";
+
                        $this->parseQuery();
                        if ( "" == $this->mTitlecond || "" == $this->mTextcond ) {
                                $wgOut->addHTML( "<h2>" . wfMsg( "badquery" ) . "</h2>\n" .
@@ -310,7 +313,6 @@ class SearchEngine {
                                $cond .= " (MATCH (##field##) AGAINST ('" .
                                  wfStrencode( $word ). "'))";
                                $last = $word;
-                               $word = preg_quote( $word );
                                array_push( $this->mSearchterms, "\\b" . $word . "\\b" );
                        }
                }
@@ -325,21 +327,37 @@ class SearchEngine {
        
        function parseQuery4()
        {
-               # FIXME: not ready yet! Do not use.
-               
                global $wgLang;
                $lc = SearchEngine::legalSearchChars();
-               #$q = preg_replace( "/([+-]?)([$lc]+)/e",
-               #       "\"$1\" . \$wgLang->stripForSearch(\"$2\")",
-               #       $this->mUsertext );
-               
-               $q = $this->mUsertext;
-               $qq = wfStrencode( $wgLang->stripForSearch( $q ) );
-               $this->mSearchterms = preg_split( '/\s+/', $q );
-               $this->mSearchterms = array_map( "preg_quote", $this->mSearchterms );
+               $searchon = "";
+               $this->mSearchterms = array();
+
+               # FIXME: This doesn't handle parenthetical expressions.
+               if( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/',
+                         $this->mUsertext, $m, PREG_SET_ORDER ) ) {
+                       foreach( $m as $terms ) {
+                               if( $searchon !== "" ) $searchon .= " ";
+                               if( $this->mStrictMatching && ($terms[1] == "") ) {
+                                       $terms[1] = "+";
+                               }
+                               $searchon .= $terms[1] . $wgLang->stripForSearch( $terms[2] );
+                               if( $terms[3] ) {
+                                       $regexp = preg_quote( $terms[3] );
+                                       if( $terms[4] ) $regexp .= "[0-9A-Za-z_]+";
+                               } else {
+                                       $regexp = preg_quote( str_replace( '"', '', $terms[2] ) );
+                               }
+                               $this->mSearchterms[] = $regexp;
+                       }
+                       wfDebug( "Would search with '$searchon'\n" );
+                       wfDebug( "Match with /\b" . implode( '\b|\b', $this->mSearchterms ) . "\b/\n" );
+               } else {
+                       wfDebug( "Can't understand search query '$this->mUsertext'\n" );
+               }
                
-               $this->mTitlecond = " MATCH(si_title) AGAINST('$qq' IN BOOLEAN MODE)";
-               $this->mTextcond = " (MATCH(si_text) AGAINST('$qq' IN BOOLEAN MODE) AND cur_is_redirect=0)";
+               $searchon = wfStrencode( $searchon );
+               $this->mTitlecond = " MATCH(si_title) AGAINST('$searchon' IN BOOLEAN MODE)";
+               $this->mTextcond = " (MATCH(si_text) AGAINST('$searchon' IN BOOLEAN MODE) AND cur_is_redirect=0)";
        }
 
        function showHit( $row )
@@ -366,7 +384,6 @@ class SearchEngine {
                        if ( 0 == $contextlines ) { break; }
                        --$contextlines;
                        ++$lineno;
-                       wfDebug( "Search highlight pattern is '$pat1'\n" );
                        if ( ! preg_match( $pat1, $line, $m ) ) { continue; }
 
                        $pre = $m[1];
@@ -418,7 +435,7 @@ class SearchEngine {
                }
 
                if ( 0 != $t->getArticleID() ) {
-                       $wgOut->redirect( wfLocalUrl( $t->getPrefixedURL() ) );
+                       $wgOut->redirect( $t->getURL() );
                        return;
                }
 
@@ -426,7 +443,7 @@ class SearchEngine {
                #
                $t = Title::newFromText( strtolower( $search ) );
                if ( 0 != $t->getArticleID() ) {
-                       $wgOut->redirect( wfLocalUrl( $t->getPrefixedURL() ) );
+                       $wgOut->redirect( $t->getURL() );
                        return;
                }
 
@@ -434,7 +451,7 @@ class SearchEngine {
                #
                $t = Title::newFromText( ucwords( strtolower( $search ) ) );
                if ( 0 != $t->getArticleID() ) {
-                       $wgOut->redirect( wfLocalUrl( $t->getPrefixedURL() ) );
+                       $wgOut->redirect( $t->getURL() );
                        return;
                }
 
@@ -442,33 +459,126 @@ class SearchEngine {
                #
                $t = Title::newFromText( strtoupper( $search ) );
                if ( 0 != $t->getArticleID() ) {
-                       $wgOut->redirect( wfLocalUrl( $t->getPrefixedURL() ) );
+                       $wgOut->redirect( $t->getURL() );
                        return;
                }
 
-               # Try a near match
-               #
-               if( !$wgDisableTextSearch ) {
-                       $this->parseQuery();                                                                            
-                       $sql = "SELECT cur_id,cur_title,cur_namespace,si_page FROM cur,searchindex " .
-                         "WHERE cur_id=si_page AND {$this->mTitlecond} ORDER BY cur_namespace LIMIT 1";
-       
-                       if ( "" != $this->mTitlecond ) {
-                               $res = wfQuery( $sql, DB_READ, $fname );
-                       }                               
-                       if ( isset( $res ) && 0 != wfNumRows( $res ) ) {
-                               $s = wfFetchObject( $res );
-       
-                               $t = Title::makeTitle( $s->cur_namespace, $s->cur_title );
-                               $wgOut->redirect( wfLocalUrl( $t->getPrefixedURL() ) );
-                               return;
+               # No match, generate an edit URL
+               $t = Title::newFromText( $this->mUsertext );
+               $wgOut->addHTML( wfMsg("nogomatch", $t->getURL( "action=edit", true ) ) . "\n<p>" );
+
+               # Try a fuzzy title search
+               $anyhit = false;
+               global $wgDisableFuzzySearch;
+               if(! $wgDisableFuzzySearch ){
+                       foreach( array(NS_MAIN, NS_WP, NS_USER, NS_IMAGE, NS_MEDIAWIKI) as $namespace){
+                               $anyhit |= SearchEngine::doFuzzyTitleSearch( $search, $namespace );
                        }
                }
-               $wgOut->addHTML( wfMsg("nogomatch", 
-                 htmlspecialchars( wfLocalUrl( ucfirst($this->mUsertext), "action=edit") ) )
-                 . "\n<p>" );
-               $this->showResults();
+               
+               if( ! $anyhit ){
+                       return $this->showResults();
+               }
+       }
+
+       # Output up to 10 fuzzy title matches for $search in $namespace as a wikitext list; returns true if any were found.
+       /* static */ function doFuzzyTitleSearch( $search, $namespace ){
+               global $wgLang, $wgOut;
+               $sstr = str_replace(" ", "_", ucfirst($search));
+               $fuzzymatches = array_slice( SearchEngine::fuzzyTitles( $sstr, $namespace ), 0, 10 );
+               $slen = strlen( $search ); # loop-invariant, computed once and reused below
+               $wikitext = "";
+               foreach($fuzzymatches as $res){
+                       # each $res is array( distance, title ) as built by fuzzyTitles()
+                       $t = str_replace("_", " ", $res[1]);
+                       $tfull = $wgLang->getNsText( $namespace ) . ":$t|$t";
+                       if( $namespace == NS_MAIN )
+                               $tfull = "$t";
+                       $distance = $res[0];
+                       $closeness = ($slen - $distance) / $slen; # 1.0 when the distance is 0
+                       $percent = intval( $closeness * 100 ) . "%";
+                       $stars = str_repeat("*", ceil(5 * $closeness) );
+                       $wikitext .= "* [[$tfull]] $percent ($stars)\n";
+               }
+               if( $wikitext ){
+                       if( $namespace != NS_MAIN )
+                               $wikitext = "=== " . $wgLang->getNsText( $namespace ) . " ===\n" . $wikitext;
+                       $wgOut->addWikiText( $wikitext );
+                       return true;
+               }
+               return false;
+       }
+
+       # Return array( distance, title ) pairs for titles in $namespace within the Levenshtein tolerance of $sstr, nearest first.
+       /* static */ function fuzzyTitles( $sstr, $namespace = NS_MAIN ){
+               $span = 0.10; // only titles whose length is within about 10% of the search string's are compared
+               $tolerance = 0.35; // allowed fraction of erroneous characters
+               $slen = strlen($sstr);
+               $tolerance_count = ceil($tolerance * $slen);
+               $spanabs = ceil($slen * (1 + $span)) - $slen;
+               $result = array();
+               for( $i=0; $i <= $spanabs; $i++ ){
+                       $titles = SearchEngine::getTitlesByLength( $slen + $i, $namespace );
+                       if( $i != 0) // for $i == 0 both lengths coincide, so fetch only once
+                               $titles = array_merge($titles, SearchEngine::getTitlesByLength( $slen - $i, $namespace ) );
+                       foreach($titles as $t){
+                               $d = levenshtein($sstr, $t);
+                               if($d < $tolerance_count) {
+                                       $result[] = array($d, $t);
+                               }
+                       }
+               }
+               usort($result, "SearchEngine_pcmp");
+               return $result;
+       }
+
+       # Return the list of cur_title values whose byte length is $aLength in namespace $aNamespace.
+       # Consults a request-local cache, then memcached; reloads every title from `cur` when the memcached timestamp is missing or at least an hour old.
+       /* static */ function getTitlesByLength($aLength, $aNamespace = 0){
+               global $wgMemc, $wgDBname;
+               # A static local rather than $this->all_titles: callers invoke this method statically.
+               static $all_titles = false;
+               // to avoid multiple costly SELECTs in case of no memcached
+               if( $all_titles ){
+                       if( isset( $all_titles[$aLength][$aNamespace] ) ){
+                               return $all_titles[$aLength][$aNamespace];
+                       } else {
+                               return array();
+                       }
+               }
+
+               $mkey = "$wgDBname:titlesbylength:$aLength:$aNamespace";
+               $mkeyts = "$wgDBname:titlesbylength:createtime";
+               $ts = $wgMemc->get( $mkeyts );
+               $result = $wgMemc->get( $mkey );
+
+               if( time() - $ts < 3600 ){
+                       // note: in case of insufficient memcached space, we return
+                       // an empty list instead of starting to hit the DB.
+                       return is_array( $result ) ? $result : array();
+               }
+
+               $wgMemc->set( $mkeyts, time() );
+
+               $res = wfQuery("SELECT cur_title, cur_namespace FROM cur", DB_READ);
+               $titles = array(); // length, ns, [titles]
+               while( $obj = wfFetchObject( $res ) ){
+                       $title = $obj->cur_title;
+                       $ns = $obj->cur_namespace;
+                       $len = strlen( $title );
+                       $titles[$len][$ns][] = $title;
+               }
+               foreach($titles as $length => $length_arr){
+                       foreach($length_arr as $ns => $title_arr){
+                               $mkey = "$wgDBname:titlesbylength:$length:$ns";
+                               $wgMemc->set( $mkey, $title_arr, 3600 * 24 );
+                       }
+               }
+               $all_titles = $titles;
+               return isset( $titles[$aLength][$aNamespace] ) ? $titles[$aLength][$aNamespace] : array();
 }
 }
 
+/* private static */ function SearchEngine_pcmp($a, $b){ return $a[0] - $b[0]; } # usort() callback: orders arrays by ascending value of their element 0
+
 ?>