From: Philip Tzou Date: Thu, 7 Jan 2010 04:50:32 +0000 (+0000) Subject: follow-up r60743. X-Git-Tag: 1.31.0-rc.0~38374 X-Git-Url: https://git.cyclocoop.org/%7B%24www_url%7Dadmin/compta/banques/ajouter.php?a=commitdiff_plain;h=8bbfbf562874f50faa24ef523c3a507b30584dfc;p=lhc%2Fweb%2Fwiklou.git follow-up r60743. 1. Changed the conditions, not only for LuceneSearch, but also more commonly to others. 2. Reduced code duplication. --- diff --git a/languages/Language.php b/languages/Language.php index 0ee04847fc..6aa9c9b2af 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -1695,9 +1695,9 @@ class Language { * @param $string String * @return String */ - function stripForSearch( $string ) { - global $wgDBtype, $wgSearchType; - if ( $wgDBtype != 'mysql' || $wgSearchType == 'LuceneSearch' ) { + function stripForSearch( $string, $doStrip = true ) { + global $wgDBtype; + if ( $wgDBtype != 'mysql' || $doStrip == false ) { return $string; } @@ -1767,6 +1767,22 @@ class Language { return $this->minSearchLength; } + /** + * convert double-width roman characters to single-width. + * range: ff00-ff5f ~= 0020-007f + */ + protected static function convertDoubleWidth( $string ) { + $string = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $string ); + $string = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $string ); + return $string; + } + + protected static function wordSegmentation( $string, $pattern ) { + $string = preg_replace( $pattern, " $1 ", $string ); + $string = preg_replace( '/ +/', ' ', $string ); + return $string; + } + function convertForSearchResult( $termsArray ) { # some languages, e.g. Chinese, need to do a conversion # in order for search results to be displayed correctly diff --git a/languages/classes/LanguageGan.php b/languages/classes/LanguageGan.php index 9bca7d6794..b10d720a9b 100644 --- a/languages/classes/LanguageGan.php +++ b/languages/classes/LanguageGan.php @@ -137,43 +137,14 @@ class LanguageGan extends LanguageZh { } // word segmentation - function stripForSearch( $string ) { - wfProfileIn( __METHOD__ ); - global $wgSearchType; - - // always convert to gan-hans before indexing. it should be - // better to use gan-hans for search, since conversion from - // Traditional to Simplified is less ambiguous than the - // other way around - $s = $this->mConverter->autoConvert($string, 'gan-hans'); - - // Double-width roman characters: ff00-ff5f ~= 0020-007f - $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s ); - $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s ); - - if ( $wgSearchType != 'LuceneSearch' ) { - // eventually this should be a word segmentation; - // for now just treat each character as a word. - // Not for LuceneSearch, because LSearch will - // split the text to words itself. - // @todo Fixme: only do this for Han characters... - $s = preg_replace( - "/([\\xc0-\\xff][\\x80-\\xbf]*)/", - " $1 ", $s); - $s = preg_replace( '/ +/', ' ', $s ); - } - - $s = trim( $s ); - - // Do general case folding and UTF-8 armoring - $s = parent::stripForSearch( $s ); - wfProfileOut( __METHOD__ ); - return $s; - + function stripForSearch( $string, $doStrip = true ) { + // LanguageZh::stripForSearch + return parent::stripForSearch( $string, $doStrip, 'gan-hans' ); } function convertForSearchResult( $termsArray ) { $terms = implode( '|', $termsArray ); + $terms = self::convertDoubleWidth( $terms ); $terms = implode( '|', $this->mConverter->autoConvertToAllVariants( $terms ) ); $ret = array_unique( explode('|', $terms) ); return $ret; diff --git a/languages/classes/LanguageJa.php b/languages/classes/LanguageJa.php index ef47a0aa4b..41b246f082 100644 --- a/languages/classes/LanguageJa.php +++ b/languages/classes/LanguageJa.php @@ -6,18 +6,15 @@ * @ingroup Language */ class LanguageJa extends Language { - function stripForSearch( $string ) { - # MySQL fulltext index doesn't grok utf-8, so we - # need to fold cases and convert to hex + function stripForSearch( $string, $doStrip = true ) { + $s = $string; - # not for LuceneSearch, because LSearch - # will split the text to words itself - if ( $wgSearchType != 'LuceneSearch' ) { - # Strip known punctuation ? - #$s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f + if ( $doStrip == true ) { + // Strip known punctuation ? + // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f - # Space strings of like hiragana/katakana/kanji + // Space strings of like hiragana/katakana/kanji $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]' @@ -25,14 +22,14 @@ class LanguageJa extends Language { . '|\xe9[\x80-\xa5][\x80-\xbf]' . '|\xe9\xa6[\x80-\x99])'; # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99 - $s = preg_replace( "/({$hiragana}+|{$katakana}+|{$kanji}+)/", ' $1 ', $s ); + $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/"; + $s = self::wordSegmentation( $s, $reg ); } - # Double-width roman characters: ff00-ff5f ~= 0020-007f - $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s ); - $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s ); + // Double-width roman characters + $s = self::convertDoubleWidth( $s ); # Do general case folding and UTF-8 armoring - return parent::stripForSearch( $s ); + return parent::stripForSearch( $s, $doStrip ); } # Italic is not appropriate for Japanese script diff --git a/languages/classes/LanguageYue.php b/languages/classes/LanguageYue.php index 47b20e57ef..f00ac31e6e 100644 --- a/languages/classes/LanguageYue.php +++ b/languages/classes/LanguageYue.php @@ -3,32 +3,24 @@ * @ingroup Language */ class LanguageYue extends Language { - function stripForSearch( $string ) { + function stripForSearch( $string, $doStrip = true ) { wfProfileIn( __METHOD__ ); - global $wgSearchType; - $s = $string; + // Double-width roman characters + $s = self::convertDoubleWidth( $string ); - // Double-width roman characters: ff00-ff5f ~= 0020-007f - $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s ); - $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s ); - - if ( $wgSearchType != 'LuceneSearch' ) { + if ( $doStrip == true ) { // eventually this should be a word segmentation; // for now just treat each character as a word. - // Not for LuceneSearch, because LSearch will - // split the text to words itself. // @todo Fixme: only do this for Han characters... - $s = preg_replace( - "/([\\xc0-\\xff][\\x80-\\xbf]*)/", - " $1 ", $s); - $s = preg_replace( '/ +/', ' ', $s ); + $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/"; + $s = self::wordSegmentation( $s, $reg ); } $s = trim( $s ); // Do general case folding and UTF-8 armoring - $s = parent::stripForSearch( $s ); + $s = parent::stripForSearch( $s, $doStrip ); wfProfileOut( __METHOD__ ); return $s; } diff --git a/languages/classes/LanguageZh.php b/languages/classes/LanguageZh.php index b63d2299d7..490808bdaf 100644 --- a/languages/classes/LanguageZh.php +++ b/languages/classes/LanguageZh.php @@ -173,15 +173,16 @@ class LanguageZh extends LanguageZh_hans { } // word segmentation - function stripForSearch( $string ) { + function stripForSearch( $string, $doStrip = true, $autoVariant = 'zh-hans' ) { wfProfileIn( __METHOD__ ); // always convert to zh-hans before indexing. it should be // better to use zh-hans for search, since conversion from // Traditional to Simplified is less ambiguous than the // other way around - $s = $this->mConverter->autoConvert( $string, 'zh-hans' ); - $s = parent::stripForSearch( $s ); + $s = $this->mConverter->autoConvert( $string, $autoVariant ); + // LanguageZh_hans::stripForSearch + $s = parent::stripForSearch( $s, $doStrip ); wfProfileOut( __METHOD__ ); return $s; @@ -189,6 +190,7 @@ class LanguageZh extends LanguageZh_hans { function convertForSearchResult( $termsArray ) { $terms = implode( '|', $termsArray ); + $terms = self::convertDoubleWidth( $terms ); $terms = implode( '|', $this->mConverter->autoConvertToAllVariants( $terms ) ); $ret = array_unique( explode('|', $terms) ); return $ret; diff --git a/languages/classes/LanguageZh_hans.php b/languages/classes/LanguageZh_hans.php index 6ab6e9d46f..2f81960b16 100644 --- a/languages/classes/LanguageZh_hans.php +++ b/languages/classes/LanguageZh_hans.php @@ -8,33 +8,25 @@ class LanguageZh_hans extends Language { return false; } - function stripForSearch( $string ) { + function stripForSearch( $string, $doStrip = true ) { wfProfileIn( __METHOD__ ); - global $wgSearchType; - $s = $string; + // Double-width roman characters + $s = self::convertDoubleWidth( $string ); - // Double-width roman characters: ff00-ff5f ~= 0020-007f - $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s ); - $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s ); - - if ( $wgSearchType != 'LuceneSearch' ) { + if ( $doStrip == true ) { // Eventually this should be a word segmentation; // for now just treat each character as a word. - // Not for LuceneSearch, because LSearch will - // split the text to words itself. // @todo Fixme: only do this for Han characters... - $s = preg_replace( - "/([\\xc0-\\xff][\\x80-\\xbf]*)/", - " $1 ", $s); - $s = preg_replace( '/ +/', ' ', $s ); + $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/"; + $s = self::wordSegmentation( $s, $reg ); } $s = trim( $s ); // Do general case folding and UTF-8 armoring - $s = parent::stripForSearch( $s ); + $s = parent::stripForSearch( $s, $doStrip ); wfProfileOut( __METHOD__ ); return $s; } -} +} \ No newline at end of file