follow-up r60743.

author Philip Tzou <philip@users.mediawiki.org>

Thu, 7 Jan 2010 04:50:32 +0000 (04:50 +0000)

committer Philip Tzou <philip@users.mediawiki.org>

Thu, 7 Jan 2010 04:50:32 +0000 (04:50 +0000)
author Philip Tzou <philip@users.mediawiki.org>
Thu, 7 Jan 2010 04:50:32 +0000 (04:50 +0000)
committer Philip Tzou <philip@users.mediawiki.org>
Thu, 7 Jan 2010 04:50:32 +0000 (04:50 +0000)
diff --git a/languages/Language.php b/languages/Language.php

index 0ee0484..6aa9c9b 100644 (file)
--- a/languages/Language.php
+++ b/languages/Language.php
@@ -1695,9 +1695,9 @@ class Language {
          * @param $string String
          * @return String
          */
-       function stripForSearch( $string ) {
-               global $wgDBtype, $wgSearchType;
-               if ( $wgDBtype != 'mysql' || $wgSearchType == 'LuceneSearch' ) {
+       function stripForSearch( $string, $doStrip = true ) {
+               global $wgDBtype;
+               if ( $wgDBtype != 'mysql' || $doStrip == false ) {
                         return $string;
                 }
  
@@ -1767,6 +1767,22 @@ class Language {
                 return $this->minSearchLength;
         }
  
+       /**
+        * convert double-width roman characters to single-width.
+        * range: ff00-ff5f ~= 0020-007f
+        */
+       protected static function convertDoubleWidth( $string ) {
+               $string = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $string );
+               $string = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $string );
+               return $string;
+       }
+
+       protected static function wordSegmentation( $string, $pattern ) {
+               $string = preg_replace( $pattern, " $1 ", $string );
+               $string = preg_replace( '/ +/', ' ', $string );
+               return $string;
+       }
+
         function convertForSearchResult( $termsArray ) {
                 # some languages, e.g. Chinese, need to do a conversion
                 # in order for search results to be displayed correctly
diff --git a/languages/classes/LanguageGan.php b/languages/classes/LanguageGan.php

index 9bca7d6..b10d720 100644 (file)
--- a/languages/classes/LanguageGan.php
+++ b/languages/classes/LanguageGan.php
@@ -137,43 +137,14 @@ class LanguageGan extends LanguageZh {
         }
  
         // word segmentation
-       function stripForSearch( $string ) {
-               wfProfileIn( __METHOD__ );
-               global $wgSearchType;
-
-               // always convert to gan-hans before indexing. it should be
-               // better to use gan-hans for search, since conversion from
-               // Traditional to Simplified is less ambiguous than the
-               // other way around
-               $s = $this->mConverter->autoConvert($string, 'gan-hans');
-
-               // Double-width roman characters: ff00-ff5f ~= 0020-007f
-               $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
-               $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
-
-               if ( $wgSearchType != 'LuceneSearch' ) {
-                       // eventually this should be a word segmentation;
-                       // for now just treat each character as a word.
-                       // Not for LuceneSearch, because LSearch will
-                       // split the text to words itself.
-                       // @todo Fixme: only do this for Han characters...
-                       $s = preg_replace(
-                                       "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
-                                       " $1 ", $s);
-                       $s = preg_replace( '/ +/', ' ', $s );
-               }
-
-               $s = trim( $s );
-
-               // Do general case folding and UTF-8 armoring
-               $s = parent::stripForSearch( $s );
-               wfProfileOut( __METHOD__ );
-               return $s;
-
+       function stripForSearch( $string, $doStrip = true ) {
+               // LanguageZh::stripForSearch
+               return parent::stripForSearch( $string, $doStrip, 'gan-hans' );
         }
  
         function convertForSearchResult( $termsArray ) {
                 $terms = implode( '|', $termsArray );
+               $terms = self::convertDoubleWidth( $terms );
                 $terms = implode( '|', $this->mConverter->autoConvertToAllVariants( $terms ) );
                 $ret = array_unique( explode('|', $terms) );
                 return $ret;
diff --git a/languages/classes/LanguageJa.php b/languages/classes/LanguageJa.php

index ef47a0a..41b246f 100644 (file)
--- a/languages/classes/LanguageJa.php
+++ b/languages/classes/LanguageJa.php
@@ -6,18 +6,15 @@
   * @ingroup Language
   */
  class LanguageJa extends Language {
-       function stripForSearch( $string ) {
-               # MySQL fulltext index doesn't grok utf-8, so we
-               # need to fold cases and convert to hex
+       function stripForSearch( $string, $doStrip = true ) {
+
                 $s = $string;
  
-               # not for LuceneSearch, because LSearch
-               # will split the text to words itself
-               if ( $wgSearchType != 'LuceneSearch' ) {
-                       # Strip known punctuation ?
-                       #$s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
+               if ( $doStrip == true ) {
+                       // Strip known punctuation ?
+                       // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
  
-                       # Space strings of like hiragana/katakana/kanji
+                       // Space strings of like hiragana/katakana/kanji
                         $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
                         $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
                         $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
@@ -25,14 +22,14 @@ class LanguageJa extends Language {
                                 . '|\xe9[\x80-\xa5][\x80-\xbf]'
                                 . '|\xe9\xa6[\x80-\x99])';
                                 # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
-                       $s = preg_replace( "/({$hiragana}+|{$katakana}+|{$kanji}+)/", ' $1 ', $s );
+                       $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/";
+                       $s = self::wordSegmentation( $s, $reg );
                 }
-               # Double-width roman characters: ff00-ff5f ~= 0020-007f
-               $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
-               $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
+               // Double-width roman characters
+               $s = self::convertDoubleWidth( $s );
                 
                 # Do general case folding and UTF-8 armoring
-               return parent::stripForSearch( $s );
+               return parent::stripForSearch( $s, $doStrip );
         }
  
         # Italic is not appropriate for Japanese script
diff --git a/languages/classes/LanguageYue.php b/languages/classes/LanguageYue.php

index 47b20e5..f00ac31 100644 (file)
--- a/languages/classes/LanguageYue.php
+++ b/languages/classes/LanguageYue.php
@@ -3,32 +3,24 @@
   * @ingroup Language
   */
  class LanguageYue extends Language {
-       function stripForSearch( $string ) {
+       function stripForSearch( $string, $doStrip = true ) {
                 wfProfileIn( __METHOD__ );
-               global $wgSearchType;
  
-               $s = $string;
+               // Double-width roman characters
+               $s = self::convertDoubleWidth( $string );
  
-               // Double-width roman characters: ff00-ff5f ~= 0020-007f
-               $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
-               $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
-
-               if ( $wgSearchType != 'LuceneSearch' ) {
+               if ( $doStrip == true ) {
                         // eventually this should be a word segmentation;
                         // for now just treat each character as a word.
-                       // Not for LuceneSearch, because LSearch will
-                       // split the text to words itself.
                         // @todo Fixme: only do this for Han characters...
-                       $s = preg_replace(
-                                       "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
-                                       " $1 ", $s);
-                       $s = preg_replace( '/ +/', ' ', $s );
+                       $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
+                       $s = self::wordSegmentation( $s, $reg );
                 }
  
                 $s = trim( $s );
  
                 // Do general case folding and UTF-8 armoring
-               $s = parent::stripForSearch( $s );
+               $s = parent::stripForSearch( $s, $doStrip );
                 wfProfileOut( __METHOD__ );
                 return $s;
         }
diff --git a/languages/classes/LanguageZh.php b/languages/classes/LanguageZh.php

index b63d229..490808b 100644 (file)
--- a/languages/classes/LanguageZh.php
+++ b/languages/classes/LanguageZh.php
@@ -173,15 +173,16 @@ class LanguageZh extends LanguageZh_hans {
         }
  
         // word segmentation
-       function stripForSearch( $string ) {
+       function stripForSearch( $string, $doStrip = true, $autoVariant = 'zh-hans' ) {
                 wfProfileIn( __METHOD__ );
  
                 // always convert to zh-hans before indexing. it should be
                 // better to use zh-hans for search, since conversion from
                 // Traditional to Simplified is less ambiguous than the
                 // other way around
-               $s = $this->mConverter->autoConvert( $string, 'zh-hans' );
-               $s = parent::stripForSearch( $s );
+               $s = $this->mConverter->autoConvert( $string, $autoVariant );
+               // LanguageZh_hans::stripForSearch
+               $s = parent::stripForSearch( $s, $doStrip );
                 wfProfileOut( __METHOD__ );
                 return $s;
  
@@ -189,6 +190,7 @@ class LanguageZh extends LanguageZh_hans {
  
         function convertForSearchResult( $termsArray ) {
                 $terms = implode( '|', $termsArray );
+               $terms = self::convertDoubleWidth( $terms );
                 $terms = implode( '|', $this->mConverter->autoConvertToAllVariants( $terms ) );
                 $ret = array_unique( explode('|', $terms) );
                 return $ret;
diff --git a/languages/classes/LanguageZh_hans.php b/languages/classes/LanguageZh_hans.php

index 6ab6e9d..2f81960 100644 (file)
--- a/languages/classes/LanguageZh_hans.php
+++ b/languages/classes/LanguageZh_hans.php
@@ -8,33 +8,25 @@ class LanguageZh_hans extends Language {
                 return false;
         }
         
-       function stripForSearch( $string ) {
+       function stripForSearch( $string, $doStrip = true ) {
                 wfProfileIn( __METHOD__ );
-               global $wgSearchType;
  
-               $s = $string;
+               // Double-width roman characters
+               $s = self::convertDoubleWidth( $string );
  
-               // Double-width roman characters: ff00-ff5f ~= 0020-007f
-               $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
-               $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
-
-               if ( $wgSearchType != 'LuceneSearch' ) {
+               if ( $doStrip == true ) {
                         // Eventually this should be a word segmentation;
                         // for now just treat each character as a word.
-                       // Not for LuceneSearch, because LSearch will
-                       // split the text to words itself.
                         // @todo Fixme: only do this for Han characters...
-                       $s = preg_replace(
-                                       "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
-                                       " $1 ", $s);
-                       $s = preg_replace( '/ +/', ' ', $s );
+                       $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
+                       $s = self::wordSegmentation( $s, $reg );
                 }
  
                 $s = trim( $s );
  
                 // Do general case folding and UTF-8 armoring
-               $s = parent::stripForSearch( $s );
+               $s = parent::stripForSearch( $s, $doStrip );
                 wfProfileOut( __METHOD__ );
                 return $s;
         }
-}
+}
+\ No newline at end of file
author	Philip Tzou <philip@users.mediawiki.org>
	Thu, 7 Jan 2010 04:50:32 +0000 (04:50 +0000)
committer	Philip Tzou <philip@users.mediawiki.org>
	Thu, 7 Jan 2010 04:50:32 +0000 (04:50 +0000)
languages/Language.php		patch \| blob \| history
languages/classes/LanguageGan.php		patch \| blob \| history
languages/classes/LanguageJa.php		patch \| blob \| history
languages/classes/LanguageYue.php		patch \| blob \| history
languages/classes/LanguageZh.php		patch \| blob \| history
languages/classes/LanguageZh_hans.php		patch \| blob \| history