From 339f0bb3d98df5e9f8f1600b235f4d5cadb4924e Mon Sep 17 00:00:00 2001
From: Philip Tzou <philip@users.mediawiki.org>
Date: Wed, 6 Jan 2010 19:51:29 +0000
Subject: [PATCH] 1. Add conditions to stripForSearch for LuceneSearch /
 MWSearch. 2. Add double-width roman characters conversion support to zh, gan,
 and yue.

---
 languages/Language.php                |  5 ++-
 languages/classes/LanguageGan.php     | 45 +++++++++++++++++----------
 languages/classes/LanguageJa.php      | 29 +++++++++--------
 languages/classes/LanguageYue.php     | 30 +++++++++++++-----
 languages/classes/LanguageZh.php      | 15 +++++----
 languages/classes/LanguageZh_hans.php | 40 ++++++++++++++++--------
 6 files changed, 103 insertions(+), 61 deletions(-)

diff --git a/languages/Language.php b/languages/Language.php
index 19571d8dbb..a368b31612 100644
--- a/languages/Language.php
+++ b/languages/Language.php
@@ -1697,12 +1697,11 @@ class Language {
 	 * @return String
 	 */
 	function stripForSearch( $string ) {
-		global $wgDBtype;
-		if ( $wgDBtype != 'mysql' ) {
+		global $wgDBtype, $wgSearchType;
+		if ( $wgDBtype != 'mysql' or $wgSearchType == 'LuceneSearch' ) {
 			return $string;
 		}
 
-
 		wfProfileIn( __METHOD__ );
 		
 		// MySQL fulltext index doesn't grok utf-8, so we
diff --git a/languages/classes/LanguageGan.php b/languages/classes/LanguageGan.php
index 151a99ccdd..9bca7d6794 100644
--- a/languages/classes/LanguageGan.php
+++ b/languages/classes/LanguageGan.php
@@ -139,23 +139,36 @@ class LanguageGan extends LanguageZh {
 	// word segmentation
 	function stripForSearch( $string ) {
 		wfProfileIn( __METHOD__ );
-
-		// eventually this should be a word segmentation
-		// for now just treat each character as a word
-		// @todo Fixme: only do this for Han characters...
-		$t = preg_replace(
-				"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
-				" $1", $string);
-
-        //always convert to gan-hans before indexing. it should be
-		//better to use gan-hans for search, since conversion from
-		//Traditional to Simplified is less ambiguous than the
-		//other way around
-
-		$t = $this->mConverter->autoConvert($t, 'gan-hans');
-		$t = parent::stripForSearch( $t );
+		global $wgSearchType;
+
+		// always convert to gan-hans before indexing. it should be
+		// better to use gan-hans for search, since conversion from
+		// Traditional to Simplified is less ambiguous than the
+		// other way around
+		$s = $this->mConverter->autoConvert($string, 'gan-hans');
+
+		// Double-width roman characters: ff00-ff5e ~= 0020-007e (incl. full-width z; ff5f would map to DEL)
+		$s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
+		$s = preg_replace( '/\xef\xbd([\x80-\x9e])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
+
+		if ( $wgSearchType != 'LuceneSearch' ) {
+			// eventually this should be a word segmentation;
+			// for now just treat each character as a word.
+			// Not for LuceneSearch, because LSearch will
+			// split the text to words itself.
+			// @todo Fixme: only do this for Han characters...
+			$s = preg_replace(
+					"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+					" $1 ", $s);
+			$s = preg_replace( '/ +/', ' ', $s );
+		}
+
+		$s = trim( $s );
+
+		// Do general case folding and UTF-8 armoring
+		$s = parent::stripForSearch( $s );
 		wfProfileOut( __METHOD__ );
-		return $t;
+		return $s;
 
 	}
 
diff --git a/languages/classes/LanguageJa.php b/languages/classes/LanguageJa.php
index 72c06e19fc..ef47a0aa4b 100644
--- a/languages/classes/LanguageJa.php
+++ b/languages/classes/LanguageJa.php
@@ -11,23 +11,26 @@ class LanguageJa extends Language {
 		# need to fold cases and convert to hex
 		$s = $string;
 
-		# Strip known punctuation ?
-		#$s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
-
-		# Space strings of like hiragana/katakana/kanji
-		$hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
-		$katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
-		$kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
-			. '|[\xe4-\xe8][\x80-\xbf]{2}'
-			. '|\xe9[\x80-\xa5][\x80-\xbf]'
-			. '|\xe9\xa6[\x80-\x99])';
-			# U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
-		$s = preg_replace( "/({$hiragana}+|{$katakana}+|{$kanji}+)/", ' $1 ', $s );
+		global $wgSearchType; # must be imported here, otherwise the check below reads an undefined local
+		# Not for LuceneSearch, because LSearch will split the text into words itself
+		if ( $wgSearchType != 'LuceneSearch' ) {
+			# Strip known punctuation ?
+			#$s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
 
+			# Space strings of like hiragana/katakana/kanji
+			$hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
+			$katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
+			$kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
+				. '|[\xe4-\xe8][\x80-\xbf]{2}'
+				. '|\xe9[\x80-\xa5][\x80-\xbf]'
+				. '|\xe9\xa6[\x80-\x99])';
+				# U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
+			$s = preg_replace( "/({$hiragana}+|{$katakana}+|{$kanji}+)/", ' $1 ', $s );
+		}
 		# Double-width roman characters: ff00-ff5f ~= 0020-007f
 		$s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
 		$s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
-
+		
 		# Do general case folding and UTF-8 armoring
 		return parent::stripForSearch( $s );
 	}
diff --git a/languages/classes/LanguageYue.php b/languages/classes/LanguageYue.php
index 4191fba5ee..47b20e57ef 100644
--- a/languages/classes/LanguageYue.php
+++ b/languages/classes/LanguageYue.php
@@ -5,17 +5,31 @@
 class LanguageYue extends Language {
 	function stripForSearch( $string ) {
 		wfProfileIn( __METHOD__ );
+		global $wgSearchType;
 
-		// eventually this should be a word segmentation
-		// for now just treat each character as a word
-		// @todo Fixme: only do this for Han characters...
-		$t = preg_replace(
-				"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
-				" $1", $string);
+		$s = $string;
+
+		// Double-width roman characters: ff00-ff5e ~= 0020-007e (incl. full-width z; ff5f would map to DEL)
+		$s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
+		$s = preg_replace( '/\xef\xbd([\x80-\x9e])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
+
+		if ( $wgSearchType != 'LuceneSearch' ) {
+			// eventually this should be a word segmentation;
+			// for now just treat each character as a word.
+			// Not for LuceneSearch, because LSearch will
+			// split the text to words itself.
+			// @todo Fixme: only do this for Han characters...
+			$s = preg_replace(
+					"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+					" $1 ", $s);
+			$s = preg_replace( '/ +/', ' ', $s );
+		}
+
+		$s = trim( $s );
 
 		// Do general case folding and UTF-8 armoring
-		$t = parent::stripForSearch( $t );
+		$s = parent::stripForSearch( $s );
 		wfProfileOut( __METHOD__ );
-		return $t;
+		return $s;
 	}
 }
diff --git a/languages/classes/LanguageZh.php b/languages/classes/LanguageZh.php
index 056b5113a9..b63d2299d7 100644
--- a/languages/classes/LanguageZh.php
+++ b/languages/classes/LanguageZh.php
@@ -176,15 +176,14 @@ class LanguageZh extends LanguageZh_hans {
 	function stripForSearch( $string ) {
 		wfProfileIn( __METHOD__ );
 
-        //always convert to zh-hans before indexing. it should be
-		//better to use zh-hans for search, since conversion from
-		//Traditional to Simplified is less ambiguous than the
-		//other way around
-
-		$t = $this->mConverter->autoConvert( $string, 'zh-hans' );
-		$t = parent::stripForSearch( $t );
+		// always convert to zh-hans before indexing. it should be
+		// better to use zh-hans for search, since conversion from
+		// Traditional to Simplified is less ambiguous than the
+		// other way around
+		$s = $this->mConverter->autoConvert( $string, 'zh-hans' );
+		$s = parent::stripForSearch( $s );
 		wfProfileOut( __METHOD__ );
-		return $t;
+		return $s;
 
 	}
 
diff --git a/languages/classes/LanguageZh_hans.php b/languages/classes/LanguageZh_hans.php
index ce542fd819..6ab6e9d46f 100644
--- a/languages/classes/LanguageZh_hans.php
+++ b/languages/classes/LanguageZh_hans.php
@@ -9,18 +9,32 @@ class LanguageZh_hans extends Language {
 	}
 	
 	function stripForSearch( $string ) {
-		// Eventually this should be a word segmentation;
-		// for now just treat each character as a word.
-		//
-		// Note we put a space on both sides to cover cases
-		// where a number or Latin char follows a Han char.
-		//
-		// @todo Fixme: only do this for Han characters...
-		$t = preg_replace(
-				"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
-				" $1 ", $string);
-		$t = preg_replace( '/ +/', ' ', $t );
-		$t = trim( $t );
-		return parent::stripForSearch( $t );
+		wfProfileIn( __METHOD__ );
+		global $wgSearchType;
+
+		$s = $string;
+
+		// Double-width roman characters: ff00-ff5e ~= 0020-007e (incl. full-width z; ff5f would map to DEL)
+		$s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
+		$s = preg_replace( '/\xef\xbd([\x80-\x9e])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
+
+		if ( $wgSearchType != 'LuceneSearch' ) {
+			// Eventually this should be a word segmentation;
+			// for now just treat each character as a word.
+			// Not for LuceneSearch, because LSearch will
+			// split the text to words itself.
+			// @todo Fixme: only do this for Han characters...
+			$s = preg_replace(
+					"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+					" $1 ", $s);
+			$s = preg_replace( '/ +/', ' ', $s );
+		}
+
+		$s = trim( $s );
+
+		// Do general case folding and UTF-8 armoring
+		$s = parent::stripForSearch( $s );
+		wfProfileOut( __METHOD__ );
+		return $s;
 	}
 }
-- 
2.20.1