Follow-up to r60743.
author Philip Tzou <philip@users.mediawiki.org>
Thu, 7 Jan 2010 04:50:32 +0000 (04:50 +0000)
committer Philip Tzou <philip@users.mediawiki.org>
Thu, 7 Jan 2010 04:50:32 +0000 (04:50 +0000)
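1. Generalized the skip condition: stripForSearch() no longer checks $wgSearchType for LuceneSearch specifically; it now takes a $doStrip parameter, so the same condition can be applied for other search types as well.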
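2. Reduced code duplication by moving the double-width character conversion and the word segmentation into shared helpers (Language::convertDoubleWidth() and Language::wordSegmentation()).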

languages/Language.php
languages/classes/LanguageGan.php
languages/classes/LanguageJa.php
languages/classes/LanguageYue.php
languages/classes/LanguageZh.php
languages/classes/LanguageZh_hans.php

index 0ee0484..6aa9c9b 100644 (file)
@@ -1695,9 +1695,9 @@ class Language {
         * @param $string String
         * @return String
         */
-       function stripForSearch( $string ) {
-               global $wgDBtype, $wgSearchType;
-               if ( $wgDBtype != 'mysql' || $wgSearchType == 'LuceneSearch' ) {
+       function stripForSearch( $string, $doStrip = true ) {
+               global $wgDBtype;
+               if ( $wgDBtype != 'mysql' || $doStrip == false ) {
                        return $string;
                }
 
@@ -1767,6 +1767,22 @@ class Language {
                return $this->minSearchLength;
        }
 
+       /**
+        * convert double-width roman characters to single-width.
+        * range: ff00-ff5f ~= 0020-007f
+        */
+       protected static function convertDoubleWidth( $string ) {
+               $string = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $string );
+               $string = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $string );
+               return $string;
+       }
+
+       protected static function wordSegmentation( $string, $pattern ) {
+               $string = preg_replace( $pattern, " $1 ", $string );
+               $string = preg_replace( '/ +/', ' ', $string );
+               return $string;
+       }
+
        function convertForSearchResult( $termsArray ) {
                # some languages, e.g. Chinese, need to do a conversion
                # in order for search results to be displayed correctly
index 9bca7d6..b10d720 100644 (file)
@@ -137,43 +137,14 @@ class LanguageGan extends LanguageZh {
        }
 
        // word segmentation
-       function stripForSearch( $string ) {
-               wfProfileIn( __METHOD__ );
-               global $wgSearchType;
-
-               // always convert to gan-hans before indexing. it should be
-               // better to use gan-hans for search, since conversion from
-               // Traditional to Simplified is less ambiguous than the
-               // other way around
-               $s = $this->mConverter->autoConvert($string, 'gan-hans');
-
-               // Double-width roman characters: ff00-ff5f ~= 0020-007f
-               $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
-               $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
-
-               if ( $wgSearchType != 'LuceneSearch' ) {
-                       // eventually this should be a word segmentation;
-                       // for now just treat each character as a word.
-                       // Not for LuceneSearch, because LSearch will
-                       // split the text to words itself.
-                       // @todo Fixme: only do this for Han characters...
-                       $s = preg_replace(
-                                       "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
-                                       " $1 ", $s);
-                       $s = preg_replace( '/ +/', ' ', $s );
-               }
-
-               $s = trim( $s );
-
-               // Do general case folding and UTF-8 armoring
-               $s = parent::stripForSearch( $s );
-               wfProfileOut( __METHOD__ );
-               return $s;
-
+       function stripForSearch( $string, $doStrip = true ) {
+               // LanguageZh::stripForSearch
+               return parent::stripForSearch( $string, $doStrip, 'gan-hans' );
        }
 
        function convertForSearchResult( $termsArray ) {
                $terms = implode( '|', $termsArray );
+               $terms = self::convertDoubleWidth( $terms );
                $terms = implode( '|', $this->mConverter->autoConvertToAllVariants( $terms ) );
                $ret = array_unique( explode('|', $terms) );
                return $ret;
index ef47a0a..41b246f 100644 (file)
@@ -6,18 +6,15 @@
  * @ingroup Language
  */
 class LanguageJa extends Language {
-       function stripForSearch( $string ) {
-               # MySQL fulltext index doesn't grok utf-8, so we
-               # need to fold cases and convert to hex
+       function stripForSearch( $string, $doStrip = true ) {
+
                $s = $string;
 
-               # not for LuceneSearch, because LSearch
-               # will split the text to words itself
-               if ( $wgSearchType != 'LuceneSearch' ) {
-                       # Strip known punctuation ?
-                       #$s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
+               if ( $doStrip == true ) {
+                       // Strip known punctuation ?
+                       // $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
 
-                       # Space strings of like hiragana/katakana/kanji
+                       // Space strings of like hiragana/katakana/kanji
                        $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
                        $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
                        $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
@@ -25,14 +22,14 @@ class LanguageJa extends Language {
                                . '|\xe9[\x80-\xa5][\x80-\xbf]'
                                . '|\xe9\xa6[\x80-\x99])';
                                # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
-                       $s = preg_replace( "/({$hiragana}+|{$katakana}+|{$kanji}+)/", ' $1 ', $s );
+                       $reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/";
+                       $s = self::wordSegmentation( $s, $reg );
                }
-               # Double-width roman characters: ff00-ff5f ~= 0020-007f
-               $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
-               $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
+               // Double-width roman characters
+               $s = self::convertDoubleWidth( $s );
                
                # Do general case folding and UTF-8 armoring
-               return parent::stripForSearch( $s );
+               return parent::stripForSearch( $s, $doStrip );
        }
 
        # Italic is not appropriate for Japanese script
index 47b20e5..f00ac31 100644 (file)
@@ -3,32 +3,24 @@
  * @ingroup Language
  */
 class LanguageYue extends Language {
-       function stripForSearch( $string ) {
+       function stripForSearch( $string, $doStrip = true ) {
                wfProfileIn( __METHOD__ );
-               global $wgSearchType;
 
-               $s = $string;
+               // Double-width roman characters
+               $s = self::convertDoubleWidth( $string );
 
-               // Double-width roman characters: ff00-ff5f ~= 0020-007f
-               $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
-               $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
-
-               if ( $wgSearchType != 'LuceneSearch' ) {
+               if ( $doStrip == true ) {
                        // eventually this should be a word segmentation;
                        // for now just treat each character as a word.
-                       // Not for LuceneSearch, because LSearch will
-                       // split the text to words itself.
                        // @todo Fixme: only do this for Han characters...
-                       $s = preg_replace(
-                                       "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
-                                       " $1 ", $s);
-                       $s = preg_replace( '/ +/', ' ', $s );
+                       $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
+                       $s = self::wordSegmentation( $s, $reg );
                }
 
                $s = trim( $s );
 
                // Do general case folding and UTF-8 armoring
-               $s = parent::stripForSearch( $s );
+               $s = parent::stripForSearch( $s, $doStrip );
                wfProfileOut( __METHOD__ );
                return $s;
        }
index b63d229..490808b 100644 (file)
@@ -173,15 +173,16 @@ class LanguageZh extends LanguageZh_hans {
        }
 
        // word segmentation
-       function stripForSearch( $string ) {
+       function stripForSearch( $string, $doStrip = true, $autoVariant = 'zh-hans' ) {
                wfProfileIn( __METHOD__ );
 
                // always convert to zh-hans before indexing. it should be
                // better to use zh-hans for search, since conversion from
                // Traditional to Simplified is less ambiguous than the
                // other way around
-               $s = $this->mConverter->autoConvert( $string, 'zh-hans' );
-               $s = parent::stripForSearch( $s );
+               $s = $this->mConverter->autoConvert( $string, $autoVariant );
+               // LanguageZh_hans::stripForSearch
+               $s = parent::stripForSearch( $s, $doStrip );
                wfProfileOut( __METHOD__ );
                return $s;
 
@@ -189,6 +190,7 @@ class LanguageZh extends LanguageZh_hans {
 
        function convertForSearchResult( $termsArray ) {
                $terms = implode( '|', $termsArray );
+               $terms = self::convertDoubleWidth( $terms );
                $terms = implode( '|', $this->mConverter->autoConvertToAllVariants( $terms ) );
                $ret = array_unique( explode('|', $terms) );
                return $ret;
index 6ab6e9d..2f81960 100644 (file)
@@ -8,33 +8,25 @@ class LanguageZh_hans extends Language {
                return false;
        }
        
-       function stripForSearch( $string ) {
+       function stripForSearch( $string, $doStrip = true ) {
                wfProfileIn( __METHOD__ );
-               global $wgSearchType;
 
-               $s = $string;
+               // Double-width roman characters
+               $s = self::convertDoubleWidth( $string );
 
-               // Double-width roman characters: ff00-ff5f ~= 0020-007f
-               $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
-               $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
-
-               if ( $wgSearchType != 'LuceneSearch' ) {
+               if ( $doStrip == true ) {
                        // Eventually this should be a word segmentation;
                        // for now just treat each character as a word.
-                       // Not for LuceneSearch, because LSearch will
-                       // split the text to words itself.
                        // @todo Fixme: only do this for Han characters...
-                       $s = preg_replace(
-                                       "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
-                                       " $1 ", $s);
-                       $s = preg_replace( '/ +/', ' ', $s );
+                       $reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
+                       $s = self::wordSegmentation( $s, $reg );
                }
 
                $s = trim( $s );
 
                // Do general case folding and UTF-8 armoring
-               $s = parent::stripForSearch( $s );
+               $s = parent::stripForSearch( $s, $doStrip );
                wfProfileOut( __METHOD__ );
                return $s;
        }
-}
+}
\ No newline at end of file