global $wgContLang;
$lc = SearchEngine::legalSearchChars() . '&#;';
- $t = $wgContLang->stripForSearch( $title );
+ $t = $wgContLang->normalizeForSearch( $title );
$t = preg_replace( "/[^{$lc}]+/", ' ', $t );
$t = $wgContLang->lc( $t );
if( is_array( $temp_terms )) {
$temp_terms = array_unique( array_values( $temp_terms ));
foreach( $temp_terms as $t )
- $q[] = $terms[1] . $wgContLang->stripForSearch( $t );
+ $q[] = $terms[1] . $wgContLang->normalizeForSearch( $t );
}
else
- $q[] = $terms[1] . $wgContLang->stripForSearch( $terms[2] );
+ $q[] = $terms[1] . $wgContLang->normalizeForSearch( $terms[2] );
if (!empty($terms[3])) {
$regexp = preg_quote( $terms[3], '/' );
// fulltext engine.
// For Chinese this also inserts spaces between adjacent Han characters.
$strippedVariants = array_map(
- array( $wgContLang, 'stripForSearch' ),
+ array( $wgContLang, 'normalizeForSearch' ),
$variants );
// Some languages such as Chinese force all variants to a canonical
$stripped = $this->normalizeText( $stripped );
if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) {
// Hack for Chinese: we need to toss in quotes for
- // multiple-character phrases since stripForSearch()
+ // multiple-character phrases since word segmentation
// added spaces between them to make word breaks.
$stripped = '"' . trim( $stripped ) . '"';
}
global $wgContLang;
wfProfileIn( __METHOD__ );
+
+ // Some languages such as Chinese require word segmentation
+ $out = $wgContLang->wordSegmentation( $string );
// MySQL fulltext index doesn't grok utf-8, so we
// need to fold cases and convert to hex
$out = preg_replace_callback(
"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
array( $this, 'stripForSearchCallback' ),
- $wgContLang->lc( $string ) );
+ $wgContLang->lc( $out ) );
// And to add insult to injury, the default indexing
// ignores short words... Pad them so we can pass them
private function escapeTerm($t) {
global $wgContLang;
- $t = $wgContLang->stripForSearch($t);
+ $t = $wgContLang->normalizeForSearch($t);
$t = isset($this->reservedWords[strtoupper($t)]) ? '{'.$t.'}' : $t;
$t = preg_replace('/^"(.*)"$/', '($1)', $t);
$t = preg_replace('/([-&|])/', '\\\\$1', $t);
// fulltext engine.
// For Chinese this also inserts spaces between adjacent Han characters.
$strippedVariants = array_map(
- array( $wgContLang, 'stripForSearch' ),
+ array( $wgContLang, 'normalizeForSearch' ),
$variants );
// Some languages such as Chinese force all variants to a canonical
foreach( $strippedVariants as $stripped ) {
if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) {
// Hack for Chinese: we need to toss in quotes for
- // multiple-character phrases since stripForSearch()
+ // multiple-character phrases since normalizeForSearch()
// added spaces between them to make word breaks.
$stripped = '"' . trim( $stripped ) . '"';
}
}
# Language-specific strip/conversion
- $text = $wgContLang->stripForSearch( $this->mText );
+ $text = $wgContLang->normalizeForSearch( $this->mText );
wfProfileIn( $fname.'-regexps' );
$text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
function hasWordBreaks() {
return true;
}
+
+ /**
+ * Some languages such as Chinese require word segmentation.
+ * Derived classes override this method to supply that segmentation.
+ *
+ * @param $string String
+ * @return String
+ */
+ function wordSegmentation( $string ) {
+ return $string;
+ }
/**
- * Some languages have special punctuation to strip out.
+ * Some languages have special punctuation that needs to be normalized.
* Make such changes here.
*
* @param $string String
* @return String
*/
- function stripForSearch( $string, $doStrip = true ) {
+ function normalizeForSearch( $string ) {
return $string;
}
return $string;
}
- protected static function wordSegmentation( $string, $pattern ) {
+ protected static function insertSpace( $string, $pattern ) {
$string = preg_replace( $pattern, " $1 ", $string );
$string = preg_replace( '/ +/', ' ', $string );
return $string;
}
// search normalization (delegates to the parent implementation)
- function stripForSearch( $string, $doStrip = true, $autoVariant = 'gan-hans' ) {
- // LanguageZh::stripForSearch
- return parent::stripForSearch( $string, $doStrip, $autoVariant );
+ function normalizeForSearch( $string, $autoVariant = 'gan-hans' ) {
+ // LanguageZh::normalizeForSearch
+ return parent::normalizeForSearch( $string, $autoVariant );
}
function convertForSearchResult( $termsArray ) {
* @ingroup Language
*/
class LanguageJa extends Language {
- function stripForSearch( $string, $doStrip = true ) {
+ function wordSegmentation( $string ) {
+ // Strip known punctuation ?
+ // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
- $s = $string;
-
- if ( $doStrip == true ) {
- // Strip known punctuation ?
- // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
+ // Space strings of like hiragana/katakana/kanji
+ $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
+ $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
+ $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
+ . '|[\xe4-\xe8][\x80-\xbf]{2}'
+ . '|\xe9[\x80-\xa5][\x80-\xbf]'
+ . '|\xe9\xa6[\x80-\x99])';
+ # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
+ $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/";
+ $s = self::insertSpace( $string, $reg );
+ return $s;
+ }
- // Space strings of like hiragana/katakana/kanji
- $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
- $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
- $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
- . '|[\xe4-\xe8][\x80-\xbf]{2}'
- . '|\xe9[\x80-\xa5][\x80-\xbf]'
- . '|\xe9\xa6[\x80-\x99])';
- # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
- $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/";
- $s = self::wordSegmentation( $s, $reg );
- }
+ function normalizeForSearch( $string ) {
// Double-width roman characters
- $s = self::convertDoubleWidth( $s );
+ $s = self::convertDoubleWidth( $string );
# Do general case folding and UTF-8 armoring
- return parent::stripForSearch( $s, $doStrip );
+ return parent::normalizeForSearch( $s );
}
# Italic is not appropriate for Japanese script
* @ingroup Language
*/
class LanguageYue extends Language {
- function stripForSearch( $string, $doStrip = true ) {
+ function hasWordBreaks() {
+ return false;
+ }
+
+ /**
+ * Eventually this should be a word segmentation;
+ * for now just treat each character as a word.
+ * @todo Fixme: only do this for Han characters...
+ */
+ function wordSegmentation( $string ) {
+ $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
+ $s = self::insertSpace( $string, $reg );
+ return $s;
+ }
+
+ function normalizeForSearch( $string ) {
wfProfileIn( __METHOD__ );
// Double-width roman characters
$s = self::convertDoubleWidth( $string );
-
- if ( $doStrip == true ) {
- // eventually this should be a word segmentation;
- // for now just treat each character as a word.
- // @todo Fixme: only do this for Han characters...
- $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
- $s = self::wordSegmentation( $s, $reg );
- }
-
$s = trim( $s );
+ $s = parent::normalizeForSearch( $s );
- // Do general case folding and UTF-8 armoring
- $s = parent::stripForSearch( $s, $doStrip );
wfProfileOut( __METHOD__ );
return $s;
}
"\"$1\"", $text);
}
- // word segmentation
- function stripForSearch( $string, $doStrip = true, $autoVariant = 'zh-hans' ) {
+ /**
+ * Word segmentation; currently defers to the parent implementation.
+ */
+ function wordSegmentation( $string ) {
+ // LanguageZh_hans::wordSegmentation
+ $s = parent::wordSegmentation( $string );
+ return $s;
+ }
+
+ /**
+ * Automatically convert to zh-hans and normalize special characters.
+ *
+ * @param $string String
+ * @param $autoVariant String, defaults to 'zh-hans'
+ * @return String
+ */
+ function normalizeForSearch( $string, $autoVariant = 'zh-hans' ) {
wfProfileIn( __METHOD__ );
// Always convert to zh-hans before indexing, because converting
// Traditional to Simplified is less ambiguous than the
// other way around.
$s = $this->mConverter->autoConvert( $string, $autoVariant );
- // LanguageZh_hans::stripForSearch
- $s = parent::stripForSearch( $s, $doStrip );
+ // LanguageZh_hans::normalizeForSearch
+ $s = parent::normalizeForSearch( $s );
wfProfileOut( __METHOD__ );
return $s;
function hasWordBreaks() {
return false;
}
-
- function stripForSearch( $string, $doStrip = true ) {
+
+ /**
+ * Eventually this should be a word segmentation;
+ * for now just treat each character as a word.
+ * @todo Fixme: only do this for Han characters...
+ */
+ function wordSegmentation( $string ) {
+ $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
+ $s = self::insertSpace( $string, $reg );
+ return $s;
+ }
+
+ function normalizeForSearch( $string ) {
wfProfileIn( __METHOD__ );
// Double-width roman characters
$s = self::convertDoubleWidth( $string );
-
- if ( $doStrip == true ) {
- // Eventually this should be a word segmentation;
- // for now just treat each character as a word.
- // @todo Fixme: only do this for Han characters...
- $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
- $s = self::wordSegmentation( $s, $reg );
- }
-
$s = trim( $s );
+ $s = parent::normalizeForSearch( $s );
- // Do general case folding and UTF-8 armoring
- $s = parent::stripForSearch( $s, $doStrip );
wfProfileOut( __METHOD__ );
return $s;
}