From 7ebf0e431b0689b220d42b258e5ced37323bd564 Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Tue, 25 Nov 2008 02:39:06 +0000 Subject: [PATCH] * (bug 5477) Searches for words less than 4 characters now work without requiring customization of MySQL server settings Short words are padded so they now get indexed. Yay! Adapted part of Werdna's patch, with some additional cleanup: * Using 'U800' to pad instead of 'SMALL' to reduce false positives (eg search for "small*" could match "Smallville" and "SMALLc") * Checking server's ft_min_word_len variable to see if we need to do anything. This preserves index compatibility with existing installations which have customized their index length. * Some further cleanup on redundant code -- just toss everything through lc() and be done with it :D * Cleaned out some more evals in zh and yue classes :P * Fixed yue class to call the parent adjustor properly --- RELEASE-NOTES | 2 ++ languages/Language.php | 60 ++++++++++++++++++++++++------- languages/classes/LanguageYue.php | 28 +++++++-------- languages/classes/LanguageZh.php | 10 +++--- 4 files changed, 68 insertions(+), 32 deletions(-) diff --git a/RELEASE-NOTES b/RELEASE-NOTES index 926e04c86e..c5323f2258 100644 --- a/RELEASE-NOTES +++ b/RELEASE-NOTES @@ -363,6 +363,8 @@ The following extensions are migrated into MediaWiki 1.14: * Improved scripting safety heuristics on SVG uploads. 
* (bug 11728) Unify layout of enhanced watchlist/recent changes * (bug 8702) Properly update stats when running nukePage maintenance script +* (bug 5477) Searches for words less than 4 characters now work without + requiring customization of MySQL server settings === API changes in 1.14 === diff --git a/languages/Language.php b/languages/Language.php index 64b31241ae..8e4c5760bd 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -1523,25 +1523,61 @@ class Language { return $string; } - # MySQL fulltext index doesn't grok utf-8, so we - # need to fold cases and convert to hex wfProfileIn( __METHOD__ ); - if( function_exists( 'mb_strtolower' ) ) { - $out = preg_replace( - "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", - "'U8' . bin2hex( \"$1\" )", - mb_strtolower( $string ) ); - } else { - list( , $wikiLowerChars ) = self::getCaseMaps(); + + // MySQL fulltext index doesn't grok utf-8, so we + // need to fold cases and convert to hex + $out = preg_replace_callback( + "/([\\xc0-\\xff][\\x80-\\xbf]*)/", + array( $this, 'stripForSearchCallback' ), + $this->lc( $string ) ); + + // And to add insult to injury, the default indexing + // ignores short words... Pad them so we can pass them + // through without reconfiguring the server... + $minLength = $this->minSearchLength(); + if( $minLength > 1 ) { + $n = $minLength-1; $out = preg_replace( - "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", - "'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )", - $string ); + "/\b(\w{1,$n})\b/", + "$1U800", + $out ); } + wfProfileOut( __METHOD__ ); return $out; } + + /** + * Armor a case-folded UTF-8 string to get through MySQL's + * fulltext search without being mucked up by funny charset + * settings or anything else of the sort. + */ + protected function stripForSearchCallback( $matches ) { + return 'U8' . bin2hex( $matches[1] ); + } + + /** + * Check MySQL server's ft_min_word_len setting so we know + * if we need to pad short words... 
+ */ + protected function minSearchLength() { + if( !isset( $this->minSearchLength ) ) { + $sql = "show global variables like 'ft\\_min\\_word\\_len'"; + $dbr = wfGetDB( DB_SLAVE ); + $result = $dbr->query( $sql ); + $row = $result->fetchObject(); + $result->free(); + + if( $row && $row->Variable_name == 'ft_min_word_len' ) { + $this->minSearchLength = intval( $row->Value ); + } else { + $this->minSearchLength = 0; + } + } + return $this->minSearchLength; + } function convertForSearchResult( $termsArray ) { # some languages, e.g. Chinese, need to do a conversion diff --git a/languages/classes/LanguageYue.php b/languages/classes/LanguageYue.php index fdc227b3e8..fc7f233c4e 100644 --- a/languages/classes/LanguageYue.php +++ b/languages/classes/LanguageYue.php @@ -4,20 +4,18 @@ */ class LanguageYue extends Language { function stripForSearch( $string ) { - # MySQL fulltext index doesn't grok utf-8, so we - # need to fold cases and convert to hex - # we also separate characters as "words" - if( function_exists( 'mb_strtolower' ) ) { - return preg_replace( - "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", - "' U8' . bin2hex( \"$1\" )", - mb_strtolower( $string ) ); - } else { - list( , $wikiLowerChars ) = Language::getCaseMaps(); - return preg_replace( - "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", - "' U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )", - $string ); - } + wfProfileIn( __METHOD__ ); + + // eventually this should be a word segmentation + // for now just treat each character as a word + // @fixme only do this for Han characters... 
+ $t = preg_replace( + "/([\\xc0-\\xff][\\x80-\\xbf]*)/", + " $1", $string); + + // Do general case folding and UTF-8 armoring + $t = parent::stripForSearch( $t ); + wfProfileOut( __METHOD__ ); + return $t; } } diff --git a/languages/classes/LanguageZh.php b/languages/classes/LanguageZh.php index 3d162a8e51..093626909b 100644 --- a/languages/classes/LanguageZh.php +++ b/languages/classes/LanguageZh.php @@ -126,14 +126,14 @@ class LanguageZh extends LanguageZh_hans { // word segmentation function stripForSearch( $string ) { - $fname="LanguageZh::stripForSearch"; - wfProfileIn( $fname ); + wfProfileIn( __METHOD__ ); // eventually this should be a word segmentation // for now just treat each character as a word + // @fixme only do this for Han characters... $t = preg_replace( - "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", - "' ' .\"$1\"", $string); + "/([\\xc0-\\xff][\\x80-\\xbf]*)/", + " $1", $string); //always convert to zh-hans before indexing. it should be //better to use zh-hans for search, since conversion from @@ -142,7 +142,7 @@ class LanguageZh extends LanguageZh_hans { $t = $this->mConverter->autoConvert($t, 'zh-hans'); $t = parent::stripForSearch( $t ); - wfProfileOut( $fname ); + wfProfileOut( __METHOD__ ); return $t; } -- 2.20.1