1. Add conditions to stripForSearch for LuceneSearch / MWSearch.
authorPhilip Tzou <philip@users.mediawiki.org>
Wed, 6 Jan 2010 19:51:29 +0000 (19:51 +0000)
committerPhilip Tzou <philip@users.mediawiki.org>
Wed, 6 Jan 2010 19:51:29 +0000 (19:51 +0000)
2. Add double-width (fullwidth) Roman character conversion support to zh, gan, and yue.

languages/Language.php
languages/classes/LanguageGan.php
languages/classes/LanguageJa.php
languages/classes/LanguageYue.php
languages/classes/LanguageZh.php
languages/classes/LanguageZh_hans.php

index 19571d8..a368b31 100644 (file)
@@ -1697,12 +1697,11 @@ class Language {
         * @return String
         */
        function stripForSearch( $string ) {
-               global $wgDBtype;
-               if ( $wgDBtype != 'mysql' ) {
+               global $wgDBtype, $wgSearchType;
+               if ( $wgDBtype != 'mysql' or $wgSearchType == 'LuceneSearch' ) {
                        return $string;
                }
 
-
                wfProfileIn( __METHOD__ );
                
                // MySQL fulltext index doesn't grok utf-8, so we
index 151a99c..9bca7d6 100644 (file)
@@ -139,23 +139,36 @@ class LanguageGan extends LanguageZh {
        // word segmentation
        function stripForSearch( $string ) {
                wfProfileIn( __METHOD__ );
-
-               // eventually this should be a word segmentation
-               // for now just treat each character as a word
-               // @todo Fixme: only do this for Han characters...
-               $t = preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
-                               " $1", $string);
-
-        //always convert to gan-hans before indexing. it should be
-               //better to use gan-hans for search, since conversion from
-               //Traditional to Simplified is less ambiguous than the
-               //other way around
-
-               $t = $this->mConverter->autoConvert($t, 'gan-hans');
-               $t = parent::stripForSearch( $t );
+               global $wgSearchType;
+
+               // Always convert to gan-hans before indexing. It is
+               // better to use gan-hans for search, since conversion from
+               // Traditional to Simplified is less ambiguous than the
+               // other way around.
+               $s = $this->mConverter->autoConvert($string, 'gan-hans');
+
+               // Double-width Roman characters: ff00-ff59 => 0020-0079 (the patterns stop at U+FF59, so fullwidth 'z' U+FF5A and above are left as-is)
+               $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
+               $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
+
+               if ( $wgSearchType != 'LuceneSearch' ) {
+                       // eventually this should be a word segmentation;
+                       // for now just treat each character as a word.
+                       // Not for LuceneSearch, because LSearch will
+                       // split the text into words itself.
+                       // @todo Fixme: only do this for Han characters...
+                       $s = preg_replace(
+                                       "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+                                       " $1 ", $s);
+                       $s = preg_replace( '/ +/', ' ', $s );
+               }
+
+               $s = trim( $s );
+
+               // Do general case folding and UTF-8 armoring
+               $s = parent::stripForSearch( $s );
                wfProfileOut( __METHOD__ );
-               return $t;
+               return $s;
 
        }
 
index 72c06e1..ef47a0a 100644 (file)
@@ -11,23 +11,26 @@ class LanguageJa extends Language {
                # need to fold cases and convert to hex
                $s = $string;
 
-               # Strip known punctuation ?
-               #$s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
-
-               # Space strings of like hiragana/katakana/kanji
-               $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
-               $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
-               $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
-                       . '|[\xe4-\xe8][\x80-\xbf]{2}'
-                       . '|\xe9[\x80-\xa5][\x80-\xbf]'
-                       . '|\xe9\xa6[\x80-\x99])';
-                       # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
-               $s = preg_replace( "/({$hiragana}+|{$katakana}+|{$kanji}+)/", ' $1 ', $s );
+               # Not for LuceneSearch, because LSearch will split the text into words itself.
+               # NOTE(review): no `global $wgSearchType;` is visible in this hunk; confirm it is declared.
+               if ( $wgSearchType != 'LuceneSearch' ) {
+                       # Strip known punctuation ?
+                       #$s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
 
+                       # Space strings of like hiragana/katakana/kanji
+                       $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
+                       $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
+                       $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
+                               . '|[\xe4-\xe8][\x80-\xbf]{2}'
+                               . '|\xe9[\x80-\xa5][\x80-\xbf]'
+                               . '|\xe9\xa6[\x80-\x99])';
+                               # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
+                       $s = preg_replace( "/({$hiragana}+|{$katakana}+|{$kanji}+)/", ' $1 ', $s );
+               }
                # Double-width roman characters: ff00-ff5f ~= 0020-007f
                $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
                $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
-
+               
                # Do general case folding and UTF-8 armoring
                return parent::stripForSearch( $s );
        }
index 4191fba..47b20e5 100644 (file)
@@ -5,17 +5,31 @@
 class LanguageYue extends Language {
        function stripForSearch( $string ) {
                wfProfileIn( __METHOD__ );
+               global $wgSearchType;
 
-               // eventually this should be a word segmentation
-               // for now just treat each character as a word
-               // @todo Fixme: only do this for Han characters...
-               $t = preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
-                               " $1", $string);
+               $s = $string;
+
+               // Double-width Roman characters: ff00-ff59 => 0020-0079 (the patterns stop at U+FF59, so fullwidth 'z' U+FF5A and above are left as-is)
+               $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
+               $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
+
+               if ( $wgSearchType != 'LuceneSearch' ) {
+                       // eventually this should be a word segmentation;
+                       // for now just treat each character as a word.
+                       // Not for LuceneSearch, because LSearch will
+                       // split the text into words itself.
+                       // @todo Fixme: only do this for Han characters...
+                       $s = preg_replace(
+                                       "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+                                       " $1 ", $s);
+                       $s = preg_replace( '/ +/', ' ', $s );
+               }
+
+               $s = trim( $s );
 
                // Do general case folding and UTF-8 armoring
-               $t = parent::stripForSearch( $t );
+               $s = parent::stripForSearch( $s );
                wfProfileOut( __METHOD__ );
-               return $t;
+               return $s;
        }
 }
index 056b511..b63d229 100644 (file)
@@ -176,15 +176,14 @@ class LanguageZh extends LanguageZh_hans {
        function stripForSearch( $string ) {
                wfProfileIn( __METHOD__ );
 
-        //always convert to zh-hans before indexing. it should be
-               //better to use zh-hans for search, since conversion from
-               //Traditional to Simplified is less ambiguous than the
-               //other way around
-
-               $t = $this->mConverter->autoConvert( $string, 'zh-hans' );
-               $t = parent::stripForSearch( $t );
+               // Always convert to zh-hans before indexing. It is
+               // better to use zh-hans for search, since conversion from
+               // Traditional to Simplified is less ambiguous than the
+               // other way around.
+               $s = $this->mConverter->autoConvert( $string, 'zh-hans' );
+               $s = parent::stripForSearch( $s );
                wfProfileOut( __METHOD__ );
-               return $t;
+               return $s;
 
        }
 
index ce542fd..6ab6e9d 100644 (file)
@@ -9,18 +9,32 @@ class LanguageZh_hans extends Language {
        }
        
        function stripForSearch( $string ) {
-               // Eventually this should be a word segmentation;
-               // for now just treat each character as a word.
-               //
-               // Note we put a space on both sides to cover cases
-               // where a number or Latin char follows a Han char.
-               //
-               // @todo Fixme: only do this for Han characters...
-               $t = preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
-                               " $1 ", $string);
-               $t = preg_replace( '/ +/', ' ', $t );
-               $t = trim( $t );
-               return parent::stripForSearch( $t );
+               wfProfileIn( __METHOD__ );
+               global $wgSearchType;
+
+               $s = $string;
+
+               // Double-width Roman characters: ff00-ff59 => 0020-0079 (the patterns stop at U+FF59, so fullwidth 'z' U+FF5A and above are left as-is)
+               $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
+               $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
+
+               if ( $wgSearchType != 'LuceneSearch' ) {
+                       // Eventually this should be a word segmentation;
+                       // for now just treat each character as a word.
+                       // Not for LuceneSearch, because LSearch will
+                       // split the text into words itself.
+                       // @todo Fixme: only do this for Han characters...
+                       $s = preg_replace(
+                                       "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+                                       " $1 ", $s);
+                       $s = preg_replace( '/ +/', ' ', $s );
+               }
+
+               $s = trim( $s );
+
+               // Do general case folding and UTF-8 armoring
+               $s = parent::stripForSearch( $s );
+               wfProfileOut( __METHOD__ );
+               return $s;
        }
 }