* (bug 5477) Searches for words less than 4 characters now work without
authorBrion Vibber <brion@users.mediawiki.org>
Tue, 25 Nov 2008 02:39:06 +0000 (02:39 +0000)
committerBrion Vibber <brion@users.mediawiki.org>
Tue, 25 Nov 2008 02:39:06 +0000 (02:39 +0000)
  requiring customization of MySQL server settings

Short words are padded so they now get indexed. Yay!

Adapted part of Werdna's patch, with some additional cleanup:
* Using 'U800' to pad instead of 'SMALL' to reduce false positives (e.g. a search for "small*" could match "Smallville" and "SMALLc")
* Checking the server's ft_min_word_len variable to see if we need to do anything. This preserves index compatibility with existing installations which have customized their minimum indexed word length.
* Some further cleanup on redundant code -- just toss everything through lc() and be done with it :D
* Cleaned out some more evals in zh and yue classes :P
* Fixed yue class to call the parent adjustor properly

RELEASE-NOTES
languages/Language.php
languages/classes/LanguageYue.php
languages/classes/LanguageZh.php

index 926e04c..c5323f2 100644 (file)
@@ -363,6 +363,8 @@ The following extensions are migrated into MediaWiki 1.14:
 * Improved scripting safety heuristics on SVG uploads.
 * (bug 11728) Unify layout of enhanced watchlist/recent changes
 * (bug 8702) Properly update stats when running nukePage maintenance script
+* (bug 5477) Searches for words less than 4 characters now work without
+  requiring customization of MySQL server settings
 
 === API changes in 1.14 ===
 
index 64b3124..8e4c576 100644 (file)
@@ -1523,25 +1523,61 @@ class Language {
                        return $string;
                }
 
-               # MySQL fulltext index doesn't grok utf-8, so we
-               # need to fold cases and convert to hex
 
                wfProfileIn( __METHOD__ );
-               if( function_exists( 'mb_strtolower' ) ) {
-                       $out = preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-                               "'U8' . bin2hex( \"$1\" )",
-                               mb_strtolower( $string ) );
-               } else {
-                       list( , $wikiLowerChars ) = self::getCaseMaps();
+               
+               // MySQL fulltext index doesn't grok utf-8, so we
+               // need to fold cases and convert to hex
+               $out = preg_replace_callback(
+                       "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+                       array( $this, 'stripForSearchCallback' ),
+                       $this->lc( $string ) );
+               
+               // And to add insult to injury, the default indexing
+               // ignores short words... Pad them so we can pass them
+               // through without reconfiguring the server...
+               $minLength = $this->minSearchLength();
+               if( $minLength > 1 ) {
+                       $n = $minLength-1;
                        $out = preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-                               "'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
-                               $string );
+                               "/\b(\w{1,$n})\b/",
+                               "$1U800",
+                               $out );
                }
+               
                wfProfileOut( __METHOD__ );
                return $out;
        }
+       
+       /**
+        * Armor a case-folded UTF-8 string to get through MySQL's
+        * fulltext search without being mucked up by funny charset
+        * settings or anything else of the sort.
+        *
+        * Used as the preg_replace_callback() callback in stripForSearch(),
+        * where the pattern captures one byte in the range 0xC0-0xFF
+        * followed by any number of bytes in the range 0x80-0xBF.
+        *
+        * @param array $matches regex match; $matches[1] is the captured byte run
+        * @return string the literal prefix 'U8' followed by the hex encoding
+        *   of $matches[1]
+        */
+       protected function stripForSearchCallback( $matches ) {
+               return 'U8' . bin2hex( $matches[1] );
+       }
+       
+       /**
+        * Check MySQL server's ft_min_word_len setting so we know
+        * if we need to pad short words...
+        *
+        * The value is fetched once with SHOW GLOBAL VARIABLES on a
+        * DB_SLAVE connection and then cached in $this->minSearchLength,
+        * so later calls on the same object do not query the database.
+        * The underscores in the LIKE pattern are backslash-escaped so they
+        * match literally instead of acting as single-character wildcards.
+        *
+        * @return int the server's ft_min_word_len, or 0 when no matching
+        *   row comes back (stripForSearch() only pads when this is > 1)
+        */
+       protected function minSearchLength() {
+               if( !isset( $this->minSearchLength ) ) {
+                       $sql = "show global variables like 'ft\\_min\\_word\\_len'";
+                       $dbr = wfGetDB( DB_SLAVE );
+                       $result = $dbr->query( $sql );
+                       $row = $result->fetchObject();
+                       $result->free();
+                       
+                       if( $row && $row->Variable_name == 'ft_min_word_len' ) {
+                               $this->minSearchLength = intval( $row->Value );
+                       } else {
+                               $this->minSearchLength = 0;
+                       }
+               }
+               return $this->minSearchLength;
+       }
 
        function convertForSearchResult( $termsArray ) {
                # some languages, e.g. Chinese, need to do a conversion
index fdc227b..fc7f233 100644 (file)
@@ -4,20 +4,18 @@
  */
 class LanguageYue extends Language {
        function stripForSearch( $string ) {
-               # MySQL fulltext index doesn't grok utf-8, so we
-               # need to fold cases and convert to hex
-               # we also separate characters as "words"
-               if( function_exists( 'mb_strtolower' ) ) {
-                       return preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-                               "' U8' . bin2hex( \"$1\" )",
-                               mb_strtolower( $string ) );
-               } else {
-                       list( , $wikiLowerChars ) = Language::getCaseMaps();
-                       return preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-                               "' U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
-                               $string );
-               }
+               wfProfileIn( __METHOD__ );
+
+               // eventually this should be a word segmentation
+               // for now just treat each character as a word
+               // @fixme only do this for Han characters...
+               $t = preg_replace(
+                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+                               " $1", $string);
+
+               // Do general case folding and UTF-8 armoring
+               $t = parent::stripForSearch( $t );
+               wfProfileOut( __METHOD__ );
+               return $t;
        }
 }
index 3d162a8..0936269 100644 (file)
@@ -126,14 +126,14 @@ class LanguageZh extends LanguageZh_hans {
 
        // word segmentation
        function stripForSearch( $string ) {
-               $fname="LanguageZh::stripForSearch";
-               wfProfileIn( $fname );
+               wfProfileIn( __METHOD__ );
 
                // eventually this should be a word segmentation
                // for now just treat each character as a word
+               // @fixme only do this for Han characters...
                $t = preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-                               "' ' .\"$1\"", $string);
+                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+                               " $1", $string);
 
         //always convert to zh-hans before indexing. it should be
                //better to use zh-hans for search, since conversion from
@@ -142,7 +142,7 @@ class LanguageZh extends LanguageZh_hans {
 
                $t = $this->mConverter->autoConvert($t, 'zh-hans');
                $t = parent::stripForSearch( $t );
-               wfProfileOut( $fname );
+               wfProfileOut( __METHOD__ );
                return $t;
 
        }