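1. Generalized the stripping condition: it is now controlled by a new $doStrip parameter instead of a hard-coded check for LuceneSearch, so it applies to other search backends as well.
2. Reduced code duplication by moving the double-width conversion and the word segmentation into shared helpers (convertDoubleWidth, wordSegmentation).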
* @param $string String
* @return String
*/
- function stripForSearch( $string ) {
- global $wgDBtype, $wgSearchType;
- if ( $wgDBtype != 'mysql' || $wgSearchType == 'LuceneSearch' ) {
+ function stripForSearch( $string, $doStrip = true ) {
+ global $wgDBtype;
+ if ( $wgDBtype != 'mysql' || $doStrip == false ) {
return $string;
}
return $this->minSearchLength;
}
+ /**
+ * Convert double-width roman characters to single-width.
+ * Range: U+FF00-U+FF5F ~= U+0020-U+007F
+ */
+ protected static function convertDoubleWidth( $string ) {
+ $string = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $string );
+ $string = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $string );
+ return $string;
+ }
+
+ protected static function wordSegmentation( $string, $pattern ) {
+ $string = preg_replace( $pattern, " $1 ", $string );
+ $string = preg_replace( '/ +/', ' ', $string );
+ return $string;
+ }
+
function convertForSearchResult( $termsArray ) {
# some languages, e.g. Chinese, need to do a conversion
# in order for search results to be displayed correctly
}
// word segmentation
- function stripForSearch( $string ) {
- wfProfileIn( __METHOD__ );
- global $wgSearchType;
-
- // always convert to gan-hans before indexing. it should be
- // better to use gan-hans for search, since conversion from
- // Traditional to Simplified is less ambiguous than the
- // other way around
- $s = $this->mConverter->autoConvert($string, 'gan-hans');
-
- // Double-width roman characters: ff00-ff5f ~= 0020-007f
- $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
- $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
-
- if ( $wgSearchType != 'LuceneSearch' ) {
- // eventually this should be a word segmentation;
- // for now just treat each character as a word.
- // Not for LuceneSearch, because LSearch will
- // split the text to words itself.
- // @todo Fixme: only do this for Han characters...
- $s = preg_replace(
- "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
- " $1 ", $s);
- $s = preg_replace( '/ +/', ' ', $s );
- }
-
- $s = trim( $s );
-
- // Do general case folding and UTF-8 armoring
- $s = parent::stripForSearch( $s );
- wfProfileOut( __METHOD__ );
- return $s;
-
+ function stripForSearch( $string, $doStrip = true ) {
+ // LanguageZh::stripForSearch
+ return parent::stripForSearch( $string, $doStrip, 'gan-hans' );
}
function convertForSearchResult( $termsArray ) {
$terms = implode( '|', $termsArray );
+ $terms = self::convertDoubleWidth( $terms );
$terms = implode( '|', $this->mConverter->autoConvertToAllVariants( $terms ) );
$ret = array_unique( explode('|', $terms) );
return $ret;
* @ingroup Language
*/
class LanguageJa extends Language {
- function stripForSearch( $string ) {
- # MySQL fulltext index doesn't grok utf-8, so we
- # need to fold cases and convert to hex
+ function stripForSearch( $string, $doStrip = true ) {
+
$s = $string;
- # not for LuceneSearch, because LSearch
- # will split the text to words itself
- if ( $wgSearchType != 'LuceneSearch' ) {
- # Strip known punctuation ?
- #$s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
+ if ( $doStrip == true ) {
+ // Strip known punctuation ?
+ // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
- # Space strings of like hiragana/katakana/kanji
+ // Put spaces around runs of the same script (hiragana/katakana/kanji)
$hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
$katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
$kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
. '|\xe9[\x80-\xa5][\x80-\xbf]'
. '|\xe9\xa6[\x80-\x99])';
# U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
- $s = preg_replace( "/({$hiragana}+|{$katakana}+|{$kanji}+)/", ' $1 ', $s );
+ $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/";
+ $s = self::wordSegmentation( $s, $reg );
}
- # Double-width roman characters: ff00-ff5f ~= 0020-007f
- $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
- $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
+ // Double-width roman characters
+ $s = self::convertDoubleWidth( $s );
# Do general case folding and UTF-8 armoring
- return parent::stripForSearch( $s );
+ return parent::stripForSearch( $s, $doStrip );
}
# Italic is not appropriate for Japanese script
* @ingroup Language
*/
class LanguageYue extends Language {
- function stripForSearch( $string ) {
+ function stripForSearch( $string, $doStrip = true ) {
wfProfileIn( __METHOD__ );
- global $wgSearchType;
- $s = $string;
+ // Double-width roman characters
+ $s = self::convertDoubleWidth( $string );
- // Double-width roman characters: ff00-ff5f ~= 0020-007f
- $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
- $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
-
- if ( $wgSearchType != 'LuceneSearch' ) {
+ if ( $doStrip == true ) {
// eventually this should be a word segmentation;
// for now just treat each character as a word.
- // Not for LuceneSearch, because LSearch will
- // split the text to words itself.
// @todo Fixme: only do this for Han characters...
- $s = preg_replace(
- "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
- " $1 ", $s);
- $s = preg_replace( '/ +/', ' ', $s );
+ $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
+ $s = self::wordSegmentation( $s, $reg );
}
$s = trim( $s );
// Do general case folding and UTF-8 armoring
- $s = parent::stripForSearch( $s );
+ $s = parent::stripForSearch( $s, $doStrip );
wfProfileOut( __METHOD__ );
return $s;
}
}
// word segmentation
- function stripForSearch( $string ) {
+ function stripForSearch( $string, $doStrip = true, $autoVariant = 'zh-hans' ) {
wfProfileIn( __METHOD__ );
// always convert to zh-hans before indexing. it should be
// better to use zh-hans for search, since conversion from
// Traditional to Simplified is less ambiguous than the
// other way around
- $s = $this->mConverter->autoConvert( $string, 'zh-hans' );
- $s = parent::stripForSearch( $s );
+ $s = $this->mConverter->autoConvert( $string, $autoVariant );
+ // LanguageZh_hans::stripForSearch
+ $s = parent::stripForSearch( $s, $doStrip );
wfProfileOut( __METHOD__ );
return $s;
function convertForSearchResult( $termsArray ) {
$terms = implode( '|', $termsArray );
+ $terms = self::convertDoubleWidth( $terms );
$terms = implode( '|', $this->mConverter->autoConvertToAllVariants( $terms ) );
$ret = array_unique( explode('|', $terms) );
return $ret;
return false;
}
- function stripForSearch( $string ) {
+ function stripForSearch( $string, $doStrip = true ) {
wfProfileIn( __METHOD__ );
- global $wgSearchType;
- $s = $string;
+ // Double-width roman characters
+ $s = self::convertDoubleWidth( $string );
- // Double-width roman characters: ff00-ff5f ~= 0020-007f
- $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
- $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
-
- if ( $wgSearchType != 'LuceneSearch' ) {
+ if ( $doStrip == true ) {
// Eventually this should be a word segmentation;
// for now just treat each character as a word.
- // Not for LuceneSearch, because LSearch will
- // split the text to words itself.
// @todo Fixme: only do this for Han characters...
- $s = preg_replace(
- "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
- " $1 ", $s);
- $s = preg_replace( '/ +/', ' ', $s );
+ $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
+ $s = self::wordSegmentation( $s, $reg );
}
$s = trim( $s );
// Do general case folding and UTF-8 armoring
- $s = parent::stripForSearch( $s );
+ $s = parent::stripForSearch( $s, $doStrip );
wfProfileOut( __METHOD__ );
return $s;
}
-}
+}
\ No newline at end of file