* (bug 5477) Searches for words less than 4 characters now work without requiring customization of MySQL server settings

author Brion Vibber <brion@users.mediawiki.org>

Tue, 25 Nov 2008 02:39:06 +0000 (02:39 +0000)

committer Brion Vibber <brion@users.mediawiki.org>

Tue, 25 Nov 2008 02:39:06 +0000 (02:39 +0000)
author Brion Vibber <brion@users.mediawiki.org>
Tue, 25 Nov 2008 02:39:06 +0000 (02:39 +0000)
committer Brion Vibber <brion@users.mediawiki.org>
Tue, 25 Nov 2008 02:39:06 +0000 (02:39 +0000)
diff --git a/RELEASE-NOTES b/RELEASE-NOTES

index 926e04c..c5323f2 100644 (file)
--- a/RELEASE-NOTES
+++ b/RELEASE-NOTES
@@ -363,6 +363,8 @@ The following extensions are migrated into MediaWiki 1.14:
  * Improved scripting safety heuristics on SVG uploads.
  * (bug 11728) Unify layout of enhanced watchlist/recent changes
  * (bug 8702) Properly update stats when running nukePage maintenance script
+* (bug 5477) Searches for words less than 4 characters now work without
+  requiring customization of MySQL server settings
  
  === API changes in 1.14 ===
  
diff --git a/languages/Language.php b/languages/Language.php

index 64b3124..8e4c576 100644 (file)
--- a/languages/Language.php
+++ b/languages/Language.php
@@ -1523,25 +1523,61 @@ class Language {
                         return $string;
                 }
  
-               # MySQL fulltext index doesn't grok utf-8, so we
-               # need to fold cases and convert to hex
  
                 wfProfileIn( __METHOD__ );
-               if( function_exists( 'mb_strtolower' ) ) {
-                       $out = preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-                               "'U8' . bin2hex( \"$1\" )",
-                               mb_strtolower( $string ) );
-               } else {
-                       list( , $wikiLowerChars ) = self::getCaseMaps();
+               
+               // MySQL fulltext index doesn't grok utf-8, so we
+               // need to fold cases and convert to hex
+               $out = preg_replace_callback(
+                       "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+                       array( $this, 'stripForSearchCallback' ),
+                       $this->lc( $string ) );
+               
+               // And to add insult to injury, the default indexing
+               // ignores short words... Pad them so we can pass them
+               // through without reconfiguring the server...
+               $minLength = $this->minSearchLength();
+               if( $minLength > 1 ) {
+                       $n = $minLength-1;
                         $out = preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-                               "'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
-                               $string );
+                               "/\b(\w{1,$n})\b/",
+                               "$1U800",
+                               $out );
                 }
+               
                 wfProfileOut( __METHOD__ );
                 return $out;
         }
+       
+       /**
+        * Armor a case-folded UTF-8 string to get through MySQL's
+        * fulltext search without being mucked up by funny charset
+        * settings or anything else of the sort.
+        */
+       protected function stripForSearchCallback( $matches ) {
+               return 'U8' . bin2hex( $matches[1] );
+       }
+       
+       /**
+        * Check MySQL server's ft_min_word_len setting so we know
+        * if we need to pad short words...
+        */
+       protected function minSearchLength() {
+               if( !isset( $this->minSearchLength ) ) {
+                       $sql = "show global variables like 'ft\\_min\\_word\\_len'";
+                       $dbr = wfGetDB( DB_SLAVE );
+                       $result = $dbr->query( $sql );
+                       $row = $result->fetchObject();
+                       $result->free();
+                       
+                       if( $row && $row->Variable_name == 'ft_min_word_len' ) {
+                               $this->minSearchLength = intval( $row->Value );
+                       } else {
+                               $this->minSearchLength = 0;
+                       }
+               }
+               return $this->minSearchLength;
+       }
  
         function convertForSearchResult( $termsArray ) {
                 # some languages, e.g. Chinese, need to do a conversion
diff --git a/languages/classes/LanguageYue.php b/languages/classes/LanguageYue.php

index fdc227b..fc7f233 100644 (file)
--- a/languages/classes/LanguageYue.php
+++ b/languages/classes/LanguageYue.php
@@ -4,20 +4,18 @@
   */
  class LanguageYue extends Language {
         function stripForSearch( $string ) {
-               # MySQL fulltext index doesn't grok utf-8, so we
-               # need to fold cases and convert to hex
-               # we also separate characters as "words"
-               if( function_exists( 'mb_strtolower' ) ) {
-                       return preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-                               "' U8' . bin2hex( \"$1\" )",
-                               mb_strtolower( $string ) );
-               } else {
-                       list( , $wikiLowerChars ) = Language::getCaseMaps();
-                       return preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-                               "' U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
-                               $string );
-               }
+               wfProfileIn( __METHOD__ );
+
+               // eventually this should be a word segmentation
+               // for now just treat each character as a word
+               // @fixme only do this for Han characters...
+               $t = preg_replace(
+                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+                               " $1", $string);
+
+               // Do general case folding and UTF-8 armoring
+               $t = parent::stripForSearch( $t );
+               wfProfileOut( __METHOD__ );
+               return $t;
         }
  }
diff --git a/languages/classes/LanguageZh.php b/languages/classes/LanguageZh.php

index 3d162a8..0936269 100644 (file)
--- a/languages/classes/LanguageZh.php
+++ b/languages/classes/LanguageZh.php
@@ -126,14 +126,14 @@ class LanguageZh extends LanguageZh_hans {
  
         // word segmentation
         function stripForSearch( $string ) {
-               $fname="LanguageZh::stripForSearch";
-               wfProfileIn( $fname );
+               wfProfileIn( __METHOD__ );
  
                 // eventually this should be a word segmentation
                 // for now just treat each character as a word
+               // @fixme only do this for Han characters...
                 $t = preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-                               "' ' .\"$1\"", $string);
+                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+                               " $1", $string);
  
          //always convert to zh-hans before indexing. it should be
                 //better to use zh-hans for search, since conversion from
@@ -142,7 +142,7 @@ class LanguageZh extends LanguageZh_hans {
  
                 $t = $this->mConverter->autoConvert($t, 'zh-hans');
                 $t = parent::stripForSearch( $t );
-               wfProfileOut( $fname );
+               wfProfileOut( __METHOD__ );
                 return $t;
  
         }
author	Brion Vibber <brion@users.mediawiki.org>
	Tue, 25 Nov 2008 02:39:06 +0000 (02:39 +0000)
committer	Brion Vibber <brion@users.mediawiki.org>
	Tue, 25 Nov 2008 02:39:06 +0000 (02:39 +0000)
RELEASE-NOTES		patch \| blob \| history
languages/Language.php		patch \| blob \| history
languages/classes/LanguageYue.php		patch \| blob \| history
languages/classes/LanguageZh.php		patch \| blob \| history