From 339f0bb3d98df5e9f8f1600b235f4d5cadb4924e Mon Sep 17 00:00:00 2001 From: Philip Tzou Date: Wed, 6 Jan 2010 19:51:29 +0000 Subject: [PATCH] 1. Add conditions to stripForSearch for LuceneSearch / MWSearch. 2. Add double-width roman characters conversion support to zh, gan, and yue. --- languages/Language.php | 5 ++- languages/classes/LanguageGan.php | 45 +++++++++++++++++---------- languages/classes/LanguageJa.php | 29 +++++++++-------- languages/classes/LanguageYue.php | 30 +++++++++++++----- languages/classes/LanguageZh.php | 15 +++++---- languages/classes/LanguageZh_hans.php | 40 ++++++++++++++++-------- 6 files changed, 103 insertions(+), 61 deletions(-) diff --git a/languages/Language.php b/languages/Language.php index 19571d8dbb..a368b31612 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -1697,12 +1697,11 @@ class Language { * @return String */ function stripForSearch( $string ) { - global $wgDBtype; - if ( $wgDBtype != 'mysql' ) { + global $wgDBtype, $wgSearchType; + if ( $wgDBtype != 'mysql' or $wgSearchType == 'LuceneSearch' ) { return $string; } - wfProfileIn( __METHOD__ ); // MySQL fulltext index doesn't grok utf-8, so we diff --git a/languages/classes/LanguageGan.php b/languages/classes/LanguageGan.php index 151a99ccdd..9bca7d6794 100644 --- a/languages/classes/LanguageGan.php +++ b/languages/classes/LanguageGan.php @@ -139,23 +139,36 @@ class LanguageGan extends LanguageZh { // word segmentation function stripForSearch( $string ) { wfProfileIn( __METHOD__ ); - - // eventually this should be a word segmentation - // for now just treat each character as a word - // @todo Fixme: only do this for Han characters... - $t = preg_replace( - "/([\\xc0-\\xff][\\x80-\\xbf]*)/", - " $1", $string); - - //always convert to gan-hans before indexing. it should be - //better to use gan-hans for search, since conversion from - //Traditional to Simplified is less ambiguous than the - //other way around - - $t = $this->mConverter->autoConvert($t, 'gan-hans'); - $t = parent::stripForSearch( $t ); + global $wgSearchType; + + // always convert to gan-hans before indexing. it should be + // better to use gan-hans for search, since conversion from + // Traditional to Simplified is less ambiguous than the + // other way around + $s = $this->mConverter->autoConvert($string, 'gan-hans'); + + // Double-width roman characters: ff00-ff5f ~= 0020-007f + $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s ); + $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s ); + + if ( $wgSearchType != 'LuceneSearch' ) { + // eventually this should be a word segmentation; + // for now just treat each character as a word. + // Not for LuceneSearch, because LSearch will + // split the text to words itself. + // @todo Fixme: only do this for Han characters... + $s = preg_replace( + "/([\\xc0-\\xff][\\x80-\\xbf]*)/", + " $1 ", $s); + $s = preg_replace( '/ +/', ' ', $s ); + } + + $s = trim( $s ); + + // Do general case folding and UTF-8 armoring + $s = parent::stripForSearch( $s ); wfProfileOut( __METHOD__ ); - return $t; + return $s; } diff --git a/languages/classes/LanguageJa.php b/languages/classes/LanguageJa.php index 72c06e19fc..ef47a0aa4b 100644 --- a/languages/classes/LanguageJa.php +++ b/languages/classes/LanguageJa.php @@ -11,23 +11,26 @@ class LanguageJa extends Language { # need to fold cases and convert to hex $s = $string; - # Strip known punctuation ? - #$s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f - - # Space strings of like hiragana/katakana/kanji - $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f - $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff - $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]' - . '|[\xe4-\xe8][\x80-\xbf]{2}' - . '|\xe9[\x80-\xa5][\x80-\xbf]' - . '|\xe9\xa6[\x80-\x99])'; - # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99 - $s = preg_replace( "/({$hiragana}+|{$katakana}+|{$kanji}+)/", ' $1 ', $s ); + # not for LuceneSearch, because LSearch + # will split the text to words itself + if ( $wgSearchType != 'LuceneSearch' ) { + # Strip known punctuation ? + #$s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f + # Space strings of like hiragana/katakana/kanji + $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f + $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff + $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]' + . '|[\xe4-\xe8][\x80-\xbf]{2}' + . '|\xe9[\x80-\xa5][\x80-\xbf]' + . '|\xe9\xa6[\x80-\x99])'; + # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99 + $s = preg_replace( "/({$hiragana}+|{$katakana}+|{$kanji}+)/", ' $1 ', $s ); + } # Double-width roman characters: ff00-ff5f ~= 0020-007f $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s ); $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s ); - + # Do general case folding and UTF-8 armoring return parent::stripForSearch( $s ); } diff --git a/languages/classes/LanguageYue.php b/languages/classes/LanguageYue.php index 4191fba5ee..47b20e57ef 100644 --- a/languages/classes/LanguageYue.php +++ b/languages/classes/LanguageYue.php @@ -5,17 +5,31 @@ class LanguageYue extends Language { function stripForSearch( $string ) { wfProfileIn( __METHOD__ ); + global $wgSearchType; - // eventually this should be a word segmentation - // for now just treat each character as a word - // @todo Fixme: only do this for Han characters... - $t = preg_replace( - "/([\\xc0-\\xff][\\x80-\\xbf]*)/", - " $1", $string); + $s = $string; + + // Double-width roman characters: ff00-ff5f ~= 0020-007f + $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s ); + $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s ); + + if ( $wgSearchType != 'LuceneSearch' ) { + // eventually this should be a word segmentation; + // for now just treat each character as a word. + // Not for LuceneSearch, because LSearch will + // split the text to words itself. + // @todo Fixme: only do this for Han characters... + $s = preg_replace( + "/([\\xc0-\\xff][\\x80-\\xbf]*)/", + " $1 ", $s); + $s = preg_replace( '/ +/', ' ', $s ); + } + + $s = trim( $s ); // Do general case folding and UTF-8 armoring - $t = parent::stripForSearch( $t ); + $s = parent::stripForSearch( $s ); wfProfileOut( __METHOD__ ); - return $t; + return $s; } } diff --git a/languages/classes/LanguageZh.php b/languages/classes/LanguageZh.php index 056b5113a9..b63d2299d7 100644 --- a/languages/classes/LanguageZh.php +++ b/languages/classes/LanguageZh.php @@ -176,15 +176,14 @@ class LanguageZh extends LanguageZh_hans { function stripForSearch( $string ) { wfProfileIn( __METHOD__ ); - //always convert to zh-hans before indexing. it should be - //better to use zh-hans for search, since conversion from - //Traditional to Simplified is less ambiguous than the - //other way around - - $t = $this->mConverter->autoConvert( $string, 'zh-hans' ); - $t = parent::stripForSearch( $t ); + // always convert to zh-hans before indexing. it should be + // better to use zh-hans for search, since conversion from + // Traditional to Simplified is less ambiguous than the + // other way around + $s = $this->mConverter->autoConvert( $string, 'zh-hans' ); + $s = parent::stripForSearch( $s ); wfProfileOut( __METHOD__ ); - return $t; + return $s; } diff --git a/languages/classes/LanguageZh_hans.php b/languages/classes/LanguageZh_hans.php index ce542fd819..6ab6e9d46f 100644 --- a/languages/classes/LanguageZh_hans.php +++ b/languages/classes/LanguageZh_hans.php @@ -9,18 +9,32 @@ class LanguageZh_hans extends Language { } function stripForSearch( $string ) { - // Eventually this should be a word segmentation; - // for now just treat each character as a word. - // - // Note we put a space on both sides to cover cases - // where a number or Latin char follows a Han char. - // - // @todo Fixme: only do this for Han characters... - $t = preg_replace( - "/([\\xc0-\\xff][\\x80-\\xbf]*)/", - " $1 ", $string); - $t = preg_replace( '/ +/', ' ', $t ); - $t = trim( $t ); - return parent::stripForSearch( $t ); + wfProfileIn( __METHOD__ ); + global $wgSearchType; + + $s = $string; + + // Double-width roman characters: ff00-ff5f ~= 0020-007f + $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s ); + $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s ); + + if ( $wgSearchType != 'LuceneSearch' ) { + // Eventually this should be a word segmentation; + // for now just treat each character as a word. + // Not for LuceneSearch, because LSearch will + // split the text to words itself. + // @todo Fixme: only do this for Han characters... + $s = preg_replace( + "/([\\xc0-\\xff][\\x80-\\xbf]*)/", + " $1 ", $s); + $s = preg_replace( '/ +/', ' ', $s ); + } + + $s = trim( $s ); + + // Do general case folding and UTF-8 armoring + $s = parent::stripForSearch( $s ); + wfProfileOut( __METHOD__ ); + return $s; } } -- 2.20.1