X-Git-Url: https://git.cyclocoop.org/?a=blobdiff_plain;f=includes%2FSearchEngine.php;h=94162cff3248baf021ee408128e66e779c99581f;hb=eebe7e3ef58635e18d08bc6ee9b2c53d57370112;hp=ffffb9bb853f8861e1b7aee5b1eb84c22f50f9b9;hpb=c5bbab5c6efab4d632203a9a14c9e3b39b53b0df;p=lhc%2Fweb%2Fwiklou.git
diff --git a/includes/SearchEngine.php b/includes/SearchEngine.php
index ffffb9bb85..94162cff32 100644
--- a/includes/SearchEngine.php
+++ b/includes/SearchEngine.php
@@ -1,72 +1,82 @@
-
+rawText = trim( $text );
- function SearchEngine( $text )
- {
# We display the query, so let's strip it for safety
#
global $wgDBmysql4;
$lc = SearchEngine::legalSearchChars() . "()";
- if( $wgDBmysql4 ) $lc .= "\"~<>*+-";
- $this->mUsertext = trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
- $this->mSearchterms = array();
+ if( $wgDBmysql4 ) {
+ $lc .= "\"~<>*+-";
+ }
+ $this->filteredText = trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
+ $this->searchTerms = array();
+ $this->strictMatching = true; # Google-style, add '+' on all terms
+
+ $this->db =& wfGetDB( DB_SLAVE );
}
- function queryNamespaces()
- {
+ # Return a partial WHERE clause (AND cur_namespace IN ...) limiting the search to $this->namespacesToSearch; falls back to namespace 0 when that list is empty
+ function queryNamespaces() {
$namespaces = implode( ",", $this->namespacesToSearch );
if ($namespaces == "") {
$namespaces = "0";
}
return "AND cur_namespace IN (" . $namespaces . ")";
- #return "1";
}
- function searchRedirects()
- {
- if ( $this->doSearchRedirects ) return "";
- return "AND cur_is_redirect=0 ";
+ # Return a partial WHERE clause for redirect handling, chosen by $this->doSearchRedirects
+ function searchRedirects() {
+ if ( $this->doSearchRedirects ) {
+ return ""; # no extra condition: redirects are left in the result set
+ } else {
+ return "AND cur_is_redirect=0 "; # only rows with cur_is_redirect=0 are kept
+ }
}
/* private */ function initNamespaceCheckbox( $i )
{
global $wgUser, $wgNamespacesToBeSearchedDefault;
-
if ($wgUser->getID()) {
// User is logged in so we retrieve his default namespaces
return $wgUser->getOption( "searchNs".$i );
- }
- else {
+ } else {
// User is not logged in so we give him the global default namespaces
- return $wgNamespacesToBeSearchedDefault[ $i ];
+ return !empty($wgNamespacesToBeSearchedDefault[ $i ]); # always a bool; empty() tolerates a namespace index that has no configured default
}
}
# Display the "power search" footer. Does not actually perform the search,
# that is done by showResults()
- function powersearch()
- {
- global $wgUser, $wgOut, $wgLang, $wgTitle;
-
- $search = $_REQUEST['search'];
- $searchx = $_REQUEST['searchx'];
- $listredirs = $_REQUEST['redirs'];
+ function powersearch() {
+ global $wgUser, $wgOut, $wgLang, $wgTitle, $wgRequest;
+ $sk =& $wgUser->getSkin();
+
+ $search = $this->rawText;
+ $searchx = $wgRequest->getVal( 'searchx' );
+ $listredirs = $wgRequest->getVal( 'redirs' );
$ret = wfMsg("powersearchtext"); # Text to be returned
$tempText = ""; # Temporary text, for substitution into $ret
if( isset( $_REQUEST["searchx"] ) ) {
- $this->addtoquery["searchx"] = "1";
+ $this->addToQuery["searchx"] = "1";
}
# Do namespace checkboxes
@@ -84,13 +94,13 @@ class SearchEngine {
if ( !isset( $searchx ) ) {
$checkboxValue = $this->initNamespaceCheckbox( $i );
} else {
- $checkboxValue = $_REQUEST[$formVar];
+ $checkboxValue = $wgRequest->getVal( $formVar );
}
$checked = "";
if ( $checkboxValue == 1 ) {
- $checked = " checked";
- $this->addtoquery["ns{$i}"] = 1;
+ $checked = " checked='checked'";
+ $this->addToQuery["ns{$i}"] = 1;
array_push( $this->namespacesToSearch, $i );
}
$name = str_replace( "_", " ", $namespaces[$i] );
@@ -101,8 +111,8 @@ class SearchEngine {
if ( $tempText !== "" ) {
$tempText .= " ";
}
- $tempText .= "{$name}\n";
+ $tempText .= "{$name}\n";
}
$ret = str_replace ( "$1", $tempText, $ret );
@@ -110,26 +120,27 @@ class SearchEngine {
$checked = "";
if ( $listredirs == 1 ) {
- $this->addtoquery["redirs"] = 1;
- $checked = " checked";
+ $this->addToQuery["redirs"] = 1;
+ $checked = " checked='checked'";
}
- $tempText = "\n";
+ $tempText = "\n";
$ret = str_replace( "$2", $tempText, $ret );
# Search field
- $tempText = "\n";
+ $tempText = "\n";
$ret = str_replace( "$3", $tempText, $ret );
# Searchx button
- $tempText = "\n";
+ $tempText = "\n";
$ret = str_replace( "$9", $tempText, $ret );
- $ret = "
\n
" . wfMsg( "badquerytext" ) ); + $wgOut->addWikiText( wfMsg( "searchresulttext" ) ); + + if ( !$this->parseQuery() ) { + $wgOut->addWikiText( + "==" . wfMsg( "badquery" ) . "==\n" . + wfMsg( "badquerytext" ) ); return; } list( $limit, $offset ) = wfCheckLimits( 20, "searchlimit" ); + + if ( $wgDisableTextSearch ) { + $wgOut->addHTML( wfMsg( "searchdisabled" ) ); + $wgOut->addHTML( wfMsg( "googlesearch", + htmlspecialchars( $this->rawText ), + htmlspecialchars( $wgInputEncoding ) ) ); + return; + } - $searchnamespaces = $this->queryNamespaces(); - $redircond = $this->searchRedirects(); + $titleMatches = $this->getMatches( $this->titleCond, $limit, $offset ); + $textMatches = $this->getMatches( $this->textCond, $limit, $offset ); - if ( $wgDisableTextSearch ) { - $wgOut->addHTML( wfMsg( "searchdisabled", htmlspecialchars( $search ), $wgInputEncoding ) ); + $sk = $wgUser->getSkin(); + + $num = count( $titleMatches ) + count( $textMatches ); + if ( $num >= $limit ) { + $top = wfShowingResults( $offset, $limit ); } else { - $sql = "SELECT cur_id,cur_namespace,cur_title," . - "cur_text FROM cur,searchindex " . - "WHERE cur_id=si_page AND {$this->mTitlecond} " . - "{$searchnamespaces} {$redircond}" . - "LIMIT {$offset}, {$limit}"; - $res1 = wfQuery( $sql, DB_READ, $fname ); - $num = wfNumRows($res1); - - $sk = $wgUser->getSkin(); - $text = wfMsg( "searchresulttext", $sk->makeKnownLink( - wfMsg( "searchhelppage" ), wfMsg( "searchingwikipedia" ) ) ); - $wgOut->addHTML( $text ); - - $this->parseQuery(); - if ( "" == $this->mTitlecond || "" == $this->mTextcond ) { - $wgOut->addHTML( "
" . wfMsg( "badquerytext" ) ); - return; - } - list( $limit, $offset ) = wfCheckLimits( 20, "searchlimit" ); - - $searchnamespaces = $this->queryNamespaces(); - $redircond = $this->searchRedirects(); - - $sql = "SELECT cur_id,cur_namespace,cur_title," . - "cur_text FROM cur,searchindex " . - "WHERE cur_id=si_page AND {$this->mTitlecond} " . - "{$searchnamespaces} {$redircond}" . - "LIMIT {$offset}, {$limit}"; - $res1 = wfQuery( $sql, DB_READ, $fname ); - $num = wfNumRows($res1); - - $sql = "SELECT cur_id,cur_namespace,cur_title," . - "cur_text FROM cur,searchindex " . - "WHERE cur_id=si_page AND {$this->mTextcond} " . - "{$searchnamespaces} {$redircond} " . - "LIMIT {$offset}, {$limit}"; - $res2 = wfQuery( $sql, DB_READ, $fname ); - $num = $num + wfNumRows($res2); - - if ( $num == $limit ) { - $top = wfShowingResults( $offset, $limit); - } else { - $top = wfShowingResultsNum( $offset, $limit, $num ); - } - $wgOut->addHTML( "
{$top}\n" );
-
- # For powersearch
-
- $a2l = "" ;
- $akk = array_keys( $this->addtoquery ) ;
- foreach ( $akk AS $ak ) {
- $a2l .= "&{$ak}={$this->addtoquery[$ak]}" ;
- }
-
- $sl = wfViewPrevNext( $offset, $limit, "",
- "search=" . wfUrlencode( $this->mUsertext ) . $a2l );
- $wgOut->addHTML( "
{$sl}\n" );
-
- $foundsome = false;
-
- if ( 0 == wfNumRows( $res1 ) ) {
- $wgOut->addHTML( "
{$top}
\n" ); - if ( 0 == wfNumRows( $res2 ) ) { - $wgOut->addHTML( "" . wfMsg( "nonefound" ) . "\n" ); - } - $wgOut->addHTML( "
{$sl}\n" );
- $wgOut->addHTML( $powersearch );
+ # For powersearch
+ $a2l = "";
+ $akk = array_keys( $this->addToQuery );
+ foreach ( $akk AS $ak ) {
+ $a2l .= "&{$ak}={$this->addToQuery[$ak]}" ;
}
+
+ $prevnext = wfViewPrevNext( $offset, $limit, "",
+ "search=" . wfUrlencode( $this->filteredText ) . $a2l );
+ $wgOut->addHTML( "
{$prevnext}\n" );
+
+ $foundsome = $this->showMatches( $titleMatches, $offset, "notitlematches", "titlematches" ); # render the title-match section first
+ $foundsome = $this->showMatches( $textMatches, $offset, "notextmatches", "textmatches" ) || $foundsome; # call sits left of '||' so the text-match section is always rendered; 'title || text' skipped it whenever the title section reported a hit
+
+ if ( !$foundsome ) {
+ $wgOut->addWikiText( wfMsg( "nonefound" ) );
+ }
+ $wgOut->addHTML( "
{$prevnext}
\n" ); + $wgOut->addHTML( $powersearch ); } - function legalSearchChars() - { + function legalSearchChars() { $lc = "A-Za-z_'0-9\\x80-\\xFF\\-"; return $lc; } - function parseQuery() - { - global $wgDBminWordLen, $wgLang, $wgDBmysql4; - + function parseQuery() { + global $wgDBmysql4; if( $wgDBmysql4 ) { # Use cleaner boolean search if available return $this->parseQuery4(); + } else { + # Fall back to ugly hack with multiple search clauses + return $this->parseQuery3(); } + } + + function parseQuery3() { + global $wgDBminWordLen, $wgLang; + + # on non mysql4 database: get list of words we don't want to search for + require_once( "FulltextStoplist.php" ); $lc = SearchEngine::legalSearchChars() . "()"; - $q = preg_replace( "/([()])/", " \\1 ", $this->mUsertext ); + $q = preg_replace( "/([()])/", " \\1 ", $this->filteredText ); $q = preg_replace( "/\\s+/", " ", $q ); - $w = explode( " ", strtolower( trim( $q ) ) ); + $w = explode( " ", trim( $q ) ); $last = $cond = ""; foreach ( $w as $word ) { @@ -308,42 +264,108 @@ class SearchEngine { } else { if ( "" != $last ) { $cond .= " AND"; } $cond .= " (MATCH (##field##) AGAINST ('" . - wfStrencode( $word ). "'))"; + $this->db->strencode( $word ). "'))"; $last = $word; - array_push( $this->mSearchterms, "\\b" . $word . "\\b" ); + array_push( $this->searchTerms, "\\b" . $word . "\\b" ); } } - if ( 0 == count( $this->mSearchterms ) ) { return; } + if ( 0 == count( $this->searchTerms ) ) { + return MW_SEARCH_BAD_QUERY; + } - $this->mTitlecond = "(" . str_replace( "##field##", + $this->titleCond = "(" . str_replace( "##field##", "si_title", $cond ) . " )"; - $this->mTextcond = "(" . str_replace( "##field##", + $this->textCond = "(" . str_replace( "##field##", "si_text", $cond ) . " AND (cur_is_redirect=0) )"; + + return MW_SEARCH_OK; } - function parseQuery4() - { - # FIXME: not ready yet! Do not use. 
- + function parseQuery4() { global $wgLang; $lc = SearchEngine::legalSearchChars(); - #$q = preg_replace( "/([+-]?)([$lc]+)/e", - # "\"$1\" . \$wgLang->stripForSearch(\"$2\")", - # $this->mUsertext ); + $searchon = ""; + $this->searchTerms = array(); + + # FIXME: This doesn't handle parenthetical expressions. + if( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/', + $this->filteredText, $m, PREG_SET_ORDER ) ) { + foreach( $m as $terms ) { + if( $searchon !== "" ) $searchon .= " "; + if( $this->strictMatching && ($terms[1] == "") ) { + $terms[1] = "+"; + } + $searchon .= $terms[1] . $wgLang->stripForSearch( $terms[2] ); + if( $terms[3] ) { + $regexp = preg_quote( $terms[3] ); + if( $terms[4] ) $regexp .= "[0-9A-Za-z_]+"; + } else { + $regexp = preg_quote( str_replace( '"', '', $terms[2] ) ); + } + $this->searchTerms[] = $regexp; + } + wfDebug( "Would search with '$searchon'\n" ); + wfDebug( "Match with /\b" . implode( '\b|\b', $this->searchTerms ) . "\b/\n" ); + } else { + wfDebug( "Can't understand search query '{$this->filteredText}'\n" ); + } - $q = $this->mUsertext; - $qq = wfStrencode( $wgLang->stripForSearch( $q ) ); - $this->mSearchterms = preg_split( '/\s+/', $q ); - $this->mTitlecond = " MATCH(si_title) AGAINST('$qq' IN BOOLEAN MODE)"; - $this->mTextcond = " (MATCH(si_text) AGAINST('$qq' IN BOOLEAN MODE) AND cur_is_redirect=0)"; + $searchon = $this->db->strencode( $searchon ); + $this->titleCond = " MATCH(si_title) AGAINST('$searchon' IN BOOLEAN MODE)"; + $this->textCond = " (MATCH(si_text) AGAINST('$searchon' IN BOOLEAN MODE) AND cur_is_redirect=0)"; + return MW_SEARCH_OK; } - function showHit( $row ) - { - global $wgUser, $wgOut; + function &getMatches( $cond, $limit, $offset = 0 ) { + $searchindex = $this->db->tableName( 'searchindex' ); + $cur = $this->db->tableName( 'cur' ); + $searchnamespaces = $this->queryNamespaces(); + $redircond = $this->searchRedirects(); + + $sql = "SELECT cur_id,cur_namespace,cur_title," . 
+ "cur_text FROM $cur,$searchindex " . + "WHERE cur_id=si_page AND {$cond} " . + "{$searchnamespaces} {$redircond} " . + $this->db->limitResult( $limit, $offset ); + + $res = $this->db->query( $sql, "SearchEngine::getMatches" ); + $matches = array(); + while ( $row = $this->db->fetchObject( $res ) ) { + $matches[] = $row; + } + $this->db->freeResult( $res ); + + return $matches; + } + + function showMatches( &$matches, $offset, $msgEmpty, $msgFound ) { + global $wgOut; + if ( 0 == count( $matches ) ) { + $wgOut->addHTML( "" . wfMsg("nogomatch", $editurl ) . "
\n" ); + + # Try a fuzzy title search + $anyhit = false; + global $wgDisableFuzzySearch; + if(! $wgDisableFuzzySearch ){ + foreach( array(NS_MAIN, NS_WP, NS_USER, NS_IMAGE, NS_MEDIAWIKI) as $namespace){ + $anyhit |= SearchEngine::doFuzzyTitleSearch( $this->rawText, $namespace ); + } + } + + if( ! $anyhit ){ + return $this->showResults(); + } + } - # Now try capitalized string - # - $t = Title::newFromText( ucwords( strtolower( $search ) ) ); - if ( 0 != $t->getArticleID() ) { - $wgOut->redirect( wfLocalUrl( $t->getPrefixedURL() ) ); - return; + /* static */ function doFuzzyTitleSearch( $search, $namespace ){ + global $wgLang, $wgOut; + + $this->setupPage(); + + $sstr = ucfirst($search); + $sstr = str_replace(" ", "_", $sstr); + $fuzzymatches = SearchEngine::fuzzyTitles( $sstr, $namespace ); + $fuzzymatches = array_slice($fuzzymatches, 0, 10); + $slen = strlen( $search ); + $wikitext = ""; + foreach($fuzzymatches as $res){ + $t = str_replace("_", " ", $res[1]); + $tfull = $wgLang->getNsText( $namespace ) . ":$t|$t"; + if( $namespace == NS_MAIN ) + $tfull = "$t"; + $distance = $res[0]; + $closeness = (strlen( $search ) - $distance) / strlen( $search ); + $percent = intval( $closeness * 100 ) . "%"; + $stars = str_repeat("*", ceil(5 * $closeness) ); + $wikitext .= "* [[$tfull]] $percent ($stars)\n"; } + if( $wikitext ){ + if( $namespace != NS_MAIN ) + $wikitext = "=== " . $wgLang->getNsText( $namespace ) . " ===\n" . $wikitext; + $wgOut->addWikiText( $wikitext ); + return true; + } + return false; + } - # Now try all upper case - # - $t = Title::newFromText( strtoupper( $search ) ); - if ( 0 != $t->getArticleID() ) { - $wgOut->redirect( wfLocalUrl( $t->getPrefixedURL() ) ); - return; + /* static */ function fuzzyTitles( $sstr, $namespace = NS_MAIN ){ + $span = 0.10; // weed on title length before doing levenshtein. 
+ $tolerance = 0.35; // allowed percentage of erronous characters + $slen = strlen($sstr); + $tolerance_count = ceil($tolerance * $slen); + $spanabs = ceil($slen * (1 + $span)) - $slen; + # print "Word: $sstr, len = $slen, range = [$min, $max], tolerance_count = $tolerance_count" ); - $this->showResults(); + + $mkey = "$wgDBname:titlesbylength:$aLength:$aNamespace"; + $mkeyts = "$wgDBname:titlesbylength:createtime"; + $ts = $wgMemc->get( $mkeyts ); + $result = $wgMemc->get( $mkey ); + + if( time() - $ts < 3600 ){ + // note: in case of insufficient memcached space, we return + // an empty list instead of starting to hit the DB. + return is_array( $result ) ? $result : array(); + } + + $wgMemc->set( $mkeyts, time() ); + + $res = $this->db->select( 'cur', array( 'cur_title', 'cur_namespace' ), false, $fname ); + $titles = array(); // length, ns, [titles] + while( $obj = $this->db->fetchObject( $res ) ){ + $title = $obj->cur_title; + $ns = $obj->cur_namespace; + $len = strlen( $title ); + $titles[$len][$ns][] = $title; + } + foreach($titles as $length => $length_arr){ + foreach($length_arr as $ns => $title_arr){ + $mkey = "$wgDBname:titlesbylength:$length:$ns"; + $wgMemc->set( $mkey, $title_arr, 3600 * 24 ); + } + } + $this->allTitles = $titles; + if( isset( $titles[$aLength][$aNamespace] ) ) + return $titles[$aLength][$aNamespace]; + else + return array(); } } +/* private static */ function SearchEngine_pcmp($a, $b){ return $a[0] - $b[0]; } + ?>