return $string;
}
- # MySQL fulltext index doesn't grok utf-8, so we
- # need to fold cases and convert to hex
wfProfileIn( __METHOD__ );
- if( function_exists( 'mb_strtolower' ) ) {
- $out = preg_replace(
- "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
- "'U8' . bin2hex( \"$1\" )",
- mb_strtolower( $string ) );
- } else {
- list( , $wikiLowerChars ) = self::getCaseMaps();
+
+ // MySQL's fulltext index does not handle UTF-8 reliably, so
+ // fold case and hex-encode every non-ASCII sequence
+ $out = preg_replace_callback(
+ "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+ array( $this, 'stripForSearchCallback' ),
+ $this->lc( $string ) );
+
+ // MySQL's default fulltext indexing also ignores short words.
+ // Pad words shorter than the minimum length with "U800" so they
+ // get through without reconfiguring the server.
+ $minLength = $this->minSearchLength();
+ if( $minLength > 1 ) {
+ $n = $minLength-1;
$out = preg_replace(
- "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
- "'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
- $string );
+ "/\b(\w{1,$n})\b/",
+ "$1U800",
+ $out );
}
+
wfProfileOut( __METHOD__ );
return $out;
}
+
+ /**
+ * Armor one matched non-ASCII sequence of a case-folded UTF-8
+ * string as 'U8' plus its hex bytes, so it gets through MySQL's
+ * fulltext search unaffected by the server's charset settings.
+ */
+ protected function stripForSearchCallback( $matches ) {
+ return 'U8' . bin2hex( $matches[1] );
+ }
+
+ /**
+ * Look up the MySQL server's ft_min_word_len setting (cached after the
+ * first query; 0 if unavailable) to see if short words need padding.
+ */
+ protected function minSearchLength() {
+ if( !isset( $this->minSearchLength ) ) {
+ $sql = "show global variables like 'ft\\_min\\_word\\_len'";
+ $dbr = wfGetDB( DB_SLAVE );
+ $result = $dbr->query( $sql );
+ $row = $result->fetchObject();
+ $result->free();
+
+ if( $row && $row->Variable_name == 'ft_min_word_len' ) {
+ $this->minSearchLength = intval( $row->Value );
+ } else {
+ $this->minSearchLength = 0;
+ }
+ }
+ return $this->minSearchLength;
+ }
function convertForSearchResult( $termsArray ) {
# some languages, e.g. Chinese, need to do a conversion
*/
class LanguageYue extends Language {
function stripForSearch( $string ) {
- # MySQL fulltext index doesn't grok utf-8, so we
- # need to fold cases and convert to hex
- # we also separate characters as "words"
- if( function_exists( 'mb_strtolower' ) ) {
- return preg_replace(
- "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
- "' U8' . bin2hex( \"$1\" )",
- mb_strtolower( $string ) );
- } else {
- list( , $wikiLowerChars ) = Language::getCaseMaps();
- return preg_replace(
- "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
- "' U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
- $string );
- }
+ wfProfileIn( __METHOD__ );
+
+ // Eventually this should do real word segmentation;
+ // for now, just treat each character as a word.
+ // @fixme only do this for Han characters...
+ $t = preg_replace(
+ "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+ " $1", $string);
+
+ // Do general case folding and UTF-8 armoring
+ $t = parent::stripForSearch( $t );
+ wfProfileOut( __METHOD__ );
+ return $t;
}
}
// word segmentation
function stripForSearch( $string ) {
- $fname="LanguageZh::stripForSearch";
- wfProfileIn( $fname );
+ wfProfileIn( __METHOD__ );
// eventually this should be a word segmentation
// for now just treat each character as a word
+ // @fixme only do this for Han characters...
$t = preg_replace(
- "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
- "' ' .\"$1\"", $string);
+ "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+ " $1", $string);
//always convert to zh-hans before indexing. it should be
//better to use zh-hans for search, since conversion from
$t = $this->mConverter->autoConvert($t, 'zh-hans');
$t = parent::stripForSearch( $t );
- wfProfileOut( $fname );
+ wfProfileOut( __METHOD__ );
return $t;
}