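1. Add $wgSearchType conditions to stripForSearch so that the MySQL-specific text mangling (per-character word splitting, case folding and UTF-8 armoring) is skipped when LuceneSearch / MWSearch is the search backend.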

author Philip Tzou <philip@users.mediawiki.org>

Wed, 6 Jan 2010 19:51:29 +0000 (19:51 +0000)

committer Philip Tzou <philip@users.mediawiki.org>

Wed, 6 Jan 2010 19:51:29 +0000 (19:51 +0000)
author Philip Tzou <philip@users.mediawiki.org>
Wed, 6 Jan 2010 19:51:29 +0000 (19:51 +0000)
committer Philip Tzou <philip@users.mediawiki.org>
Wed, 6 Jan 2010 19:51:29 +0000 (19:51 +0000)
diff --git a/languages/Language.php b/languages/Language.php

index 19571d8..a368b31 100644 (file)
--- a/languages/Language.php
+++ b/languages/Language.php
@@ -1697,12 +1697,11 @@ class Language {
          * @return String
          */
         function stripForSearch( $string ) {
-               global $wgDBtype;
-               if ( $wgDBtype != 'mysql' ) {
+               global $wgDBtype, $wgSearchType;
+               if ( $wgDBtype != 'mysql' or $wgSearchType == 'LuceneSearch' ) {
                         return $string;
                 }
  
-
                 wfProfileIn( __METHOD__ );
                 
                 // MySQL fulltext index doesn't grok utf-8, so we
diff --git a/languages/classes/LanguageGan.php b/languages/classes/LanguageGan.php

index 151a99c..9bca7d6 100644 (file)
--- a/languages/classes/LanguageGan.php
+++ b/languages/classes/LanguageGan.php
@@ -139,23 +139,36 @@ class LanguageGan extends LanguageZh {
         // word segmentation
         function stripForSearch( $string ) {
                 wfProfileIn( __METHOD__ );
-
-               // eventually this should be a word segmentation
-               // for now just treat each character as a word
-               // @todo Fixme: only do this for Han characters...
-               $t = preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
-                               " $1", $string);
-
-        //always convert to gan-hans before indexing. it should be
-               //better to use gan-hans for search, since conversion from
-               //Traditional to Simplified is less ambiguous than the
-               //other way around
-
-               $t = $this->mConverter->autoConvert($t, 'gan-hans');
-               $t = parent::stripForSearch( $t );
+               global $wgSearchType;
+
+               // always convert to gan-hans before indexing. it should be
+               // better to use gan-hans for search, since conversion from
+               // Traditional to Simplified is less ambiguous than the
+               // other way around
+               $s = $this->mConverter->autoConvert($string, 'gan-hans');
+
+               // Double-width roman characters: ff00-ff5f ~= 0020-007f
+               $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
+               $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
+
+               if ( $wgSearchType != 'LuceneSearch' ) {
+                       // eventually this should be a word segmentation;
+                       // for now just treat each character as a word.
+                       // Not for LuceneSearch, because it splits
+                       // the text into words itself.
+                       // @todo Fixme: only do this for Han characters...
+                       $s = preg_replace(
+                                       "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+                                       " $1 ", $s);
+                       $s = preg_replace( '/ +/', ' ', $s );
+               }
+
+               $s = trim( $s );
+
+               // Do general case folding and UTF-8 armoring
+               $s = parent::stripForSearch( $s );
                 wfProfileOut( __METHOD__ );
-               return $t;
+               return $s;
  
         }
  
diff --git a/languages/classes/LanguageJa.php b/languages/classes/LanguageJa.php

index 72c06e1..ef47a0a 100644 (file)
--- a/languages/classes/LanguageJa.php
+++ b/languages/classes/LanguageJa.php
@@ -11,23 +11,26 @@ class LanguageJa extends Language {
                 # need to fold cases and convert to hex
                 $s = $string;
  
-               # Strip known punctuation ?
-               #$s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
-
-               # Space strings of like hiragana/katakana/kanji
-               $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
-               $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
-               $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
-                       . '|[\xe4-\xe8][\x80-\xbf]{2}'
-                       . '|\xe9[\x80-\xa5][\x80-\xbf]'
-                       . '|\xe9\xa6[\x80-\x99])';
-                       # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
-               $s = preg_replace( "/({$hiragana}+|{$katakana}+|{$kanji}+)/", ' $1 ', $s );
+               # Not for LuceneSearch, because it splits
+               # the text into words itself.
+               if ( $wgSearchType != 'LuceneSearch' ) {
+                       # Strip known punctuation ?
+                       #$s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
  
+                       # Space strings of like hiragana/katakana/kanji
+                       $hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
+                       $katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
+                       $kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
+                               . '|[\xe4-\xe8][\x80-\xbf]{2}'
+                               . '|\xe9[\x80-\xa5][\x80-\xbf]'
+                               . '|\xe9\xa6[\x80-\x99])';
+                               # U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
+                       $s = preg_replace( "/({$hiragana}+|{$katakana}+|{$kanji}+)/", ' $1 ', $s );
+               }
                 # Double-width roman characters: ff00-ff5f ~= 0020-007f
                 $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
                 $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
-
+               
                 # Do general case folding and UTF-8 armoring
                 return parent::stripForSearch( $s );
         }
diff --git a/languages/classes/LanguageYue.php b/languages/classes/LanguageYue.php

index 4191fba..47b20e5 100644 (file)
--- a/languages/classes/LanguageYue.php
+++ b/languages/classes/LanguageYue.php
@@ -5,17 +5,31 @@
  class LanguageYue extends Language {
         function stripForSearch( $string ) {
                 wfProfileIn( __METHOD__ );
+               global $wgSearchType;
  
-               // eventually this should be a word segmentation
-               // for now just treat each character as a word
-               // @todo Fixme: only do this for Han characters...
-               $t = preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
-                               " $1", $string);
+               $s = $string;
+
+               // Double-width roman characters: ff00-ff5f ~= 0020-007f
+               $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
+               $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
+
+               if ( $wgSearchType != 'LuceneSearch' ) {
+                       // eventually this should be a word segmentation;
+                       // for now just treat each character as a word.
+                       // Not for LuceneSearch, because it splits
+                       // the text into words itself.
+                       // @todo Fixme: only do this for Han characters...
+                       $s = preg_replace(
+                                       "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+                                       " $1 ", $s);
+                       $s = preg_replace( '/ +/', ' ', $s );
+               }
+
+               $s = trim( $s );
  
                 // Do general case folding and UTF-8 armoring
-               $t = parent::stripForSearch( $t );
+               $s = parent::stripForSearch( $s );
                 wfProfileOut( __METHOD__ );
-               return $t;
+               return $s;
         }
  }
diff --git a/languages/classes/LanguageZh.php b/languages/classes/LanguageZh.php

index 056b511..b63d229 100644 (file)
--- a/languages/classes/LanguageZh.php
+++ b/languages/classes/LanguageZh.php
@@ -176,15 +176,14 @@ class LanguageZh extends LanguageZh_hans {
         function stripForSearch( $string ) {
                 wfProfileIn( __METHOD__ );
  
-        //always convert to zh-hans before indexing. it should be
-               //better to use zh-hans for search, since conversion from
-               //Traditional to Simplified is less ambiguous than the
-               //other way around
-
-               $t = $this->mConverter->autoConvert( $string, 'zh-hans' );
-               $t = parent::stripForSearch( $t );
+               // always convert to zh-hans before indexing. it should be
+               // better to use zh-hans for search, since conversion from
+               // Traditional to Simplified is less ambiguous than the
+               // other way around
+               $s = $this->mConverter->autoConvert( $string, 'zh-hans' );
+               $s = parent::stripForSearch( $s );
                 wfProfileOut( __METHOD__ );
-               return $t;
+               return $s;
  
         }
  
diff --git a/languages/classes/LanguageZh_hans.php b/languages/classes/LanguageZh_hans.php

index ce542fd..6ab6e9d 100644 (file)
--- a/languages/classes/LanguageZh_hans.php
+++ b/languages/classes/LanguageZh_hans.php
@@ -9,18 +9,32 @@ class LanguageZh_hans extends Language {
         }
         
         function stripForSearch( $string ) {
-               // Eventually this should be a word segmentation;
-               // for now just treat each character as a word.
-               //
-               // Note we put a space on both sides to cover cases
-               // where a number or Latin char follows a Han char.
-               //
-               // @todo Fixme: only do this for Han characters...
-               $t = preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
-                               " $1 ", $string);
-               $t = preg_replace( '/ +/', ' ', $t );
-               $t = trim( $t );
-               return parent::stripForSearch( $t );
+               wfProfileIn( __METHOD__ );
+               global $wgSearchType;
+
+               $s = $string;
+
+               // Double-width roman characters: ff00-ff5f ~= 0020-007f
+               $s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
+               $s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
+
+               if ( $wgSearchType != 'LuceneSearch' ) {
+                       // Eventually this should be a word segmentation;
+                       // for now just treat each character as a word.
+                       // Not for LuceneSearch, because it splits
+                       // the text into words itself.
+                       // @todo Fixme: only do this for Han characters...
+                       $s = preg_replace(
+                                       "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+                                       " $1 ", $s);
+                       $s = preg_replace( '/ +/', ' ', $s );
+               }
+
+               $s = trim( $s );
+
+               // Do general case folding and UTF-8 armoring
+               $s = parent::stripForSearch( $s );
+               wfProfileOut( __METHOD__ );
+               return $s;
         }
  }
author	Philip Tzou <philip@users.mediawiki.org>
	Wed, 6 Jan 2010 19:51:29 +0000 (19:51 +0000)
committer	Philip Tzou <philip@users.mediawiki.org>
	Wed, 6 Jan 2010 19:51:29 +0000 (19:51 +0000)
languages/Language.php		patch \| blob \| history
languages/classes/LanguageGan.php		patch \| blob \| history
languages/classes/LanguageJa.php		patch \| blob \| history
languages/classes/LanguageYue.php		patch \| blob \| history
languages/classes/LanguageZh.php		patch \| blob \| history
languages/classes/LanguageZh_hans.php		patch \| blob \| history