From: Philip Tzou <philip@users.mediawiki.org>
Date: Thu, 7 Jan 2010 04:50:32 +0000 (+0000)
Subject: follow-up r60743.
X-Git-Tag: 1.31.0-rc.0~38374
X-Git-Url: https://git.cyclocoop.org/%7B%24www_url%7Dadmin/compta/banques/ajouter.php?a=commitdiff_plain;h=8bbfbf562874f50faa24ef523c3a507b30584dfc;p=lhc%2Fweb%2Fwiklou.git

follow-up r60743.
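1. Generalized the condition: instead of special-casing $wgSearchType == 'LuceneSearch', stripForSearch() now takes a $doStrip parameter, so the behaviour can be switched off for other search backends as well.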
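2. Reduced code duplication by moving the double-width character conversion and the word segmentation into shared helpers, Language::convertDoubleWidth() and Language::wordSegmentation().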
---

diff --git a/languages/Language.php b/languages/Language.php
index 0ee04847fc..6aa9c9b2af 100644
--- a/languages/Language.php
+++ b/languages/Language.php
@@ -1695,9 +1695,9 @@ class Language {
 	 * @param $string String
 	 * @return String
 	 */
-	function stripForSearch( $string ) {
-		global $wgDBtype, $wgSearchType;
-		if ( $wgDBtype != 'mysql' || $wgSearchType == 'LuceneSearch' ) {
+	function stripForSearch( $string, $doStrip = true ) {
+		global $wgDBtype;
+		if ( $wgDBtype != 'mysql' || $doStrip == false ) {
 			return $string;
 		}
 
@@ -1767,6 +1767,22 @@ class Language {
 		return $this->minSearchLength;
 	}
 
+	/**
+	 * convert double-width roman characters to single-width.
+	 * range: ff00-ff5f ~= 0020-007f
+	 */
+	protected static function convertDoubleWidth( $string ) {
+		$string = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $string );
+		$string = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $string );
+		return $string;
+	}
+
+	protected static function wordSegmentation( $string, $pattern ) {
+		$string = preg_replace( $pattern, " $1 ", $string );
+		$string = preg_replace( '/ +/', ' ', $string );
+		return $string;
+	}
+
 	function convertForSearchResult( $termsArray ) {
 		# some languages, e.g. Chinese, need to do a conversion
 		# in order for search results to be displayed correctly
diff --git a/languages/classes/LanguageGan.php b/languages/classes/LanguageGan.php
index 9bca7d6794..b10d720a9b 100644
--- a/languages/classes/LanguageGan.php
+++ b/languages/classes/LanguageGan.php
@@ -137,43 +137,14 @@ class LanguageGan extends LanguageZh {
 	}
 
 	// word segmentation
-	function stripForSearch( $string ) {
-		wfProfileIn( __METHOD__ );
-		global $wgSearchType;
-
-		// always convert to gan-hans before indexing. it should be
-		// better to use gan-hans for search, since conversion from
-		// Traditional to Simplified is less ambiguous than the
-		// other way around
-		$s = $this->mConverter->autoConvert($string, 'gan-hans');
-
-		// Double-width roman characters: ff00-ff5f ~= 0020-007f
-		$s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
-		$s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
-
-		if ( $wgSearchType != 'LuceneSearch' ) {
-			// eventually this should be a word segmentation;
-			// for now just treat each character as a word.
-			// Not for LuceneSearch, because LSearch will
-			// split the text to words itself.
-			// @todo Fixme: only do this for Han characters...
-			$s = preg_replace(
-					"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
-					" $1 ", $s);
-			$s = preg_replace( '/ +/', ' ', $s );
-		}
-
-		$s = trim( $s );
-
-		// Do general case folding and UTF-8 armoring
-		$s = parent::stripForSearch( $s );
-		wfProfileOut( __METHOD__ );
-		return $s;
-
+	function stripForSearch( $string, $doStrip = true ) {
+		// LanguageZh::stripForSearch
+		return parent::stripForSearch( $string, $doStrip, 'gan-hans' );
 	}
 
 	function convertForSearchResult( $termsArray ) {
 		$terms = implode( '|', $termsArray );
+		$terms = self::convertDoubleWidth( $terms );
 		$terms = implode( '|', $this->mConverter->autoConvertToAllVariants( $terms ) );
 		$ret = array_unique( explode('|', $terms) );
 		return $ret;
diff --git a/languages/classes/LanguageJa.php b/languages/classes/LanguageJa.php
index ef47a0aa4b..41b246f082 100644
--- a/languages/classes/LanguageJa.php
+++ b/languages/classes/LanguageJa.php
@@ -6,18 +6,15 @@
  * @ingroup Language
  */
 class LanguageJa extends Language {
-	function stripForSearch( $string ) {
-		# MySQL fulltext index doesn't grok utf-8, so we
-		# need to fold cases and convert to hex
+	function stripForSearch( $string, $doStrip = true ) {
+
 		$s = $string;
 
-		# not for LuceneSearch, because LSearch
-		# will split the text to words itself
-		if ( $wgSearchType != 'LuceneSearch' ) {
-			# Strip known punctuation ?
-			#$s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
+		if ( $doStrip == true ) {
+			// Strip known punctuation ?
+			// $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
 
-			# Space strings of like hiragana/katakana/kanji
+			// Space strings of like hiragana/katakana/kanji
 			$hiragana = '(?:\xe3(?:\x81[\x80-\xbf]|\x82[\x80-\x9f]))'; # U3040-309f
 			$katakana = '(?:\xe3(?:\x82[\xa0-\xbf]|\x83[\x80-\xbf]))'; # U30a0-30ff
 			$kanji = '(?:\xe3[\x88-\xbf][\x80-\xbf]'
@@ -25,14 +22,14 @@ class LanguageJa extends Language {
 				. '|\xe9[\x80-\xa5][\x80-\xbf]'
 				. '|\xe9\xa6[\x80-\x99])';
 				# U3200-9999 = \xe3\x88\x80-\xe9\xa6\x99
-			$s = preg_replace( "/({$hiragana}+|{$katakana}+|{$kanji}+)/", ' $1 ', $s );
+			$reg = "/({$hiragana}+|{$katakana}+|{$kanji}+)/";
+			$s = self::wordSegmentation( $s, $reg );
 		}
-		# Double-width roman characters: ff00-ff5f ~= 0020-007f
-		$s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
-		$s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
+		// Double-width roman characters
+		$s = self::convertDoubleWidth( $s );
 		
 		# Do general case folding and UTF-8 armoring
-		return parent::stripForSearch( $s );
+		return parent::stripForSearch( $s, $doStrip );
 	}
 
 	# Italic is not appropriate for Japanese script
diff --git a/languages/classes/LanguageYue.php b/languages/classes/LanguageYue.php
index 47b20e57ef..f00ac31e6e 100644
--- a/languages/classes/LanguageYue.php
+++ b/languages/classes/LanguageYue.php
@@ -3,32 +3,24 @@
  * @ingroup Language
  */
 class LanguageYue extends Language {
-	function stripForSearch( $string ) {
+	function stripForSearch( $string, $doStrip = true ) {
 		wfProfileIn( __METHOD__ );
-		global $wgSearchType;
 
-		$s = $string;
+		// Double-width roman characters
+		$s = self::convertDoubleWidth( $string );
 
-		// Double-width roman characters: ff00-ff5f ~= 0020-007f
-		$s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
-		$s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
-
-		if ( $wgSearchType != 'LuceneSearch' ) {
+		if ( $doStrip == true ) {
 			// eventually this should be a word segmentation;
 			// for now just treat each character as a word.
-			// Not for LuceneSearch, because LSearch will
-			// split the text to words itself.
 			// @todo Fixme: only do this for Han characters...
-			$s = preg_replace(
-					"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
-					" $1 ", $s);
-			$s = preg_replace( '/ +/', ' ', $s );
+			$reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
+			$s = self::wordSegmentation( $s, $reg );
 		}
 
 		$s = trim( $s );
 
 		// Do general case folding and UTF-8 armoring
-		$s = parent::stripForSearch( $s );
+		$s = parent::stripForSearch( $s, $doStrip );
 		wfProfileOut( __METHOD__ );
 		return $s;
 	}
diff --git a/languages/classes/LanguageZh.php b/languages/classes/LanguageZh.php
index b63d2299d7..490808bdaf 100644
--- a/languages/classes/LanguageZh.php
+++ b/languages/classes/LanguageZh.php
@@ -173,15 +173,16 @@ class LanguageZh extends LanguageZh_hans {
 	}
 
 	// word segmentation
-	function stripForSearch( $string ) {
+	function stripForSearch( $string, $doStrip = true, $autoVariant = 'zh-hans' ) {
 		wfProfileIn( __METHOD__ );
 
 		// always convert to zh-hans before indexing. it should be
 		// better to use zh-hans for search, since conversion from
 		// Traditional to Simplified is less ambiguous than the
 		// other way around
-		$s = $this->mConverter->autoConvert( $string, 'zh-hans' );
-		$s = parent::stripForSearch( $s );
+		$s = $this->mConverter->autoConvert( $string, $autoVariant );
+		// LanguageZh_hans::stripForSearch
+		$s = parent::stripForSearch( $s, $doStrip );
 		wfProfileOut( __METHOD__ );
 		return $s;
 
@@ -189,6 +190,7 @@ class LanguageZh extends LanguageZh_hans {
 
 	function convertForSearchResult( $termsArray ) {
 		$terms = implode( '|', $termsArray );
+		$terms = self::convertDoubleWidth( $terms );
 		$terms = implode( '|', $this->mConverter->autoConvertToAllVariants( $terms ) );
 		$ret = array_unique( explode('|', $terms) );
 		return $ret;
diff --git a/languages/classes/LanguageZh_hans.php b/languages/classes/LanguageZh_hans.php
index 6ab6e9d46f..2f81960b16 100644
--- a/languages/classes/LanguageZh_hans.php
+++ b/languages/classes/LanguageZh_hans.php
@@ -8,33 +8,25 @@ class LanguageZh_hans extends Language {
 		return false;
 	}
 	
-	function stripForSearch( $string ) {
+	function stripForSearch( $string, $doStrip = true ) {
 		wfProfileIn( __METHOD__ );
-		global $wgSearchType;
 
-		$s = $string;
+		// Double-width roman characters
+		$s = self::convertDoubleWidth( $string );
 
-		// Double-width roman characters: ff00-ff5f ~= 0020-007f
-		$s = preg_replace( '/\xef\xbc([\x80-\xbf])/e', 'chr((ord("$1") & 0x3f) + 0x20)', $s );
-		$s = preg_replace( '/\xef\xbd([\x80-\x99])/e', 'chr((ord("$1") & 0x3f) + 0x60)', $s );
-
-		if ( $wgSearchType != 'LuceneSearch' ) {
+		if ( $doStrip == true ) {
 			// Eventually this should be a word segmentation;
 			// for now just treat each character as a word.
-			// Not for LuceneSearch, because LSearch will
-			// split the text to words itself.
 			// @todo Fixme: only do this for Han characters...
-			$s = preg_replace(
-					"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
-					" $1 ", $s);
-			$s = preg_replace( '/ +/', ' ', $s );
+			$reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
+			$s = self::wordSegmentation( $s, $reg );
 		}
 
 		$s = trim( $s );
 
 		// Do general case folding and UTF-8 armoring
-		$s = parent::stripForSearch( $s );
+		$s = parent::stripForSearch( $s, $doStrip );
 		wfProfileOut( __METHOD__ );
 		return $s;
 	}
-}
+}
\ No newline at end of file