From 7ebf0e431b0689b220d42b258e5ced37323bd564 Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@users.mediawiki.org>
Date: Tue, 25 Nov 2008 02:39:06 +0000
Subject: [PATCH] * (bug 5477) Searches for words less than 4 characters now
 work without   requiring customization of MySQL server settings

Short words are padded so they now get indexed. Yay!

Adapted part of Werdna's patch, with some additional cleanup:
* Using 'U800' to pad instead of 'SMALL' to reduce false positives (e.g. a search for "small*" could match "Smallville" and "SMALLc")
* Checking server's ft_min_word_len variable to see if we need to do anything. This preserves index compatibility with existing installations which have customized their index length.
* Some further cleanup on redundant code -- just toss everything through lc() and be done with it :D
* Cleaned out some more evals in zh and yue classes :P
* Fixed yue class to call the parent adjustor properly
---
 RELEASE-NOTES                     |  2 ++
 languages/Language.php            | 60 ++++++++++++++++++++++++-------
 languages/classes/LanguageYue.php | 28 +++++++--------
 languages/classes/LanguageZh.php  | 10 +++---
 4 files changed, 68 insertions(+), 32 deletions(-)

diff --git a/RELEASE-NOTES b/RELEASE-NOTES
index 926e04c86e..c5323f2258 100644
--- a/RELEASE-NOTES
+++ b/RELEASE-NOTES
@@ -363,6 +363,8 @@ The following extensions are migrated into MediaWiki 1.14:
 * Improved scripting safety heuristics on SVG uploads.
 * (bug 11728) Unify layout of enhanced watchlist/recent changes
 * (bug 8702) Properly update stats when running nukePage maintenance script
+* (bug 5477) Searches for words less than 4 characters now work without
+  requiring customization of MySQL server settings
 
 === API changes in 1.14 ===
 
diff --git a/languages/Language.php b/languages/Language.php
index 64b31241ae..8e4c5760bd 100644
--- a/languages/Language.php
+++ b/languages/Language.php
@@ -1523,25 +1523,61 @@ class Language {
 			return $string;
 		}
 
-		# MySQL fulltext index doesn't grok utf-8, so we
-		# need to fold cases and convert to hex
 
 		wfProfileIn( __METHOD__ );
-		if( function_exists( 'mb_strtolower' ) ) {
-			$out = preg_replace(
-				"/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-				"'U8' . bin2hex( \"$1\" )",
-				mb_strtolower( $string ) );
-		} else {
-			list( , $wikiLowerChars ) = self::getCaseMaps();
+		
+		// MySQL fulltext index doesn't grok utf-8, so we
+		// need to fold cases and convert to hex
+		$out = preg_replace_callback(
+			"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+			array( $this, 'stripForSearchCallback' ),
+			$this->lc( $string ) );
+		
+		// And to add insult to injury, the default indexing
+		// ignores short words... Pad them so we can pass them
+		// through without reconfiguring the server...
+		$minLength = $this->minSearchLength();
+		if( $minLength > 1 ) {
+			$n = $minLength-1;
 			$out = preg_replace(
-				"/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-				"'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
-				$string );
+				"/\b(\w{1,$n})\b/",
+				"$1U800",
+				$out );
 		}
+		
 		wfProfileOut( __METHOD__ );
 		return $out;
 	}
+	
+	/**
+	 * Callback for stripForSearch(): armor one matched multibyte UTF-8
+	 * sequence so it survives MySQL's fulltext search and charset settings.
+	 * @return string 'U8' followed by the bin2hex() dump of $matches[1]
+	 */
+	protected function stripForSearchCallback( $matches ) {
+		return 'U8' . bin2hex( $matches[1] );
+	}
+	
+	/**
+	 * Get the MySQL server's ft_min_word_len, cached on this object after first use.
+	 * @return int Value reported by the server, or 0 if no matching row came back
+	 */
+	protected function minSearchLength() {
+		if( !isset( $this->minSearchLength ) ) {
+			$sql = "show global variables like 'ft\\_min\\_word\\_len'"; // '_' is a LIKE wildcard, hence the escapes
+			$dbr = wfGetDB( DB_SLAVE );
+			$result = $dbr->query( $sql );
+			$row = $result->fetchObject();
+			$result->free();
+			
+			if( $row && $row->Variable_name == 'ft_min_word_len' ) {
+				$this->minSearchLength = intval( $row->Value );
+			} else {
+				$this->minSearchLength = 0; // stripForSearch() only pads when this is > 1
+			}
+		}
+		return $this->minSearchLength;
+	}
 
 	function convertForSearchResult( $termsArray ) {
 		# some languages, e.g. Chinese, need to do a conversion
diff --git a/languages/classes/LanguageYue.php b/languages/classes/LanguageYue.php
index fdc227b3e8..fc7f233c4e 100644
--- a/languages/classes/LanguageYue.php
+++ b/languages/classes/LanguageYue.php
@@ -4,20 +4,18 @@
  */
 class LanguageYue extends Language {
 	function stripForSearch( $string ) {
-		# MySQL fulltext index doesn't grok utf-8, so we
-		# need to fold cases and convert to hex
-		# we also separate characters as "words"
-		if( function_exists( 'mb_strtolower' ) ) {
-			return preg_replace(
-				"/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-				"' U8' . bin2hex( \"$1\" )",
-				mb_strtolower( $string ) );
-		} else {
-			list( , $wikiLowerChars ) = Language::getCaseMaps();
-			return preg_replace(
-				"/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-				"' U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
-				$string );
-		}
+		wfProfileIn( __METHOD__ );
+
+		// Eventually this should do real word segmentation; for now,
+		// put a space before every multibyte character so that each one
+		// is treated as a word. @fixme only do this for Han characters...
+		$t = preg_replace(
+				"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+				" $1", $string);
+
+		// Let the parent class do the general case folding and UTF-8 armoring
+		$t = parent::stripForSearch( $t );
+		wfProfileOut( __METHOD__ );
+		return $t;
 	}
 }
diff --git a/languages/classes/LanguageZh.php b/languages/classes/LanguageZh.php
index 3d162a8e51..093626909b 100644
--- a/languages/classes/LanguageZh.php
+++ b/languages/classes/LanguageZh.php
@@ -126,14 +126,14 @@ class LanguageZh extends LanguageZh_hans {
 
 	// word segmentation
 	function stripForSearch( $string ) {
-		$fname="LanguageZh::stripForSearch";
-		wfProfileIn( $fname );
+		wfProfileIn( __METHOD__ );
 
 		// eventually this should be a word segmentation
 		// for now just treat each character as a word
+		// @fixme only do this for Han characters...
 		$t = preg_replace(
-				"/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-				"' ' .\"$1\"", $string);
+				"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+				" $1", $string);
 
         //always convert to zh-hans before indexing. it should be
 		//better to use zh-hans for search, since conversion from
@@ -142,7 +142,7 @@ class LanguageZh extends LanguageZh_hans {
 
 		$t = $this->mConverter->autoConvert($t, 'zh-hans');
 		$t = parent::stripForSearch( $t );
-		wfProfileOut( $fname );
+		wfProfileOut( __METHOD__ );
 		return $t;
 
 	}
-- 
2.20.1