Factored MySQL-specific munging out of Language::stripForSearch() to DatabaseMysql...
[lhc/web/wiklou.git] / includes / db / DatabaseMysql.php
index 5ae4a22..63f267c 100644 (file)
@@ -7,6 +7,8 @@
  * @see Database
  */
 class DatabaseMysql extends DatabaseBase {
+       static $mMinSearchLength;
+
        function getType() {
                return 'mysql';
        }
@@ -367,6 +369,84 @@ class DatabaseMysql extends DatabaseBase {
                $this->query( "UNLOCK TABLES", $method );
        }
        
+       /**
+        * Converts some characters for MySQL's indexing to grok it correctly,
+        * and pads short words to overcome limitations.
+        */
+       function stripForSearch( $string ) {
+               global $wgContLang;
+
+               wfProfileIn( __METHOD__ );
+
+               // MySQL fulltext index doesn't grok utf-8, so we
+               // need to fold cases and convert to hex
+               $out = preg_replace_callback(
+                       "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+                       array( $this, 'stripForSearchCallback' ),
+                       $wgContLang->lc( $string ) );
+
+               // And to add insult to injury, the default indexing
+               // ignores short words... Pad them so we can pass them
+               // through without reconfiguring the server...
+               $minLength = $this->minSearchLength();
+               if( $minLength > 1 ) {
+                       $n = $minLength - 1;
+                       $out = preg_replace(
+                               "/\b(\w{1,$n})\b/",
+                               "$1u800",
+                               $out );
+               }
+
+               // Periods within things like hostnames and IP addresses
+               // are also important -- we want a search for "example.com"
+               // or "192.168.1.1" to work sanely.
+               //
+               // MySQL's search seems to ignore them, so you'd match on
+               // "example.wikipedia.com" and "192.168.83.1" as well.
+               $out = preg_replace(
+                       "/(\w)\.(\w|\*)/u",
+                       "$1u82e$2",
+                       $out );
+
+               wfProfileOut( __METHOD__ );
+               
+               return $out;
+       }
+
+       /**
+        * Armor a case-folded UTF-8 string to get through MySQL's
+        * fulltext search without being mucked up by funny charset
+        * settings or anything else of the sort.
+        */
+       protected function stripForSearchCallback( $matches ) {
+               return 'u8' . bin2hex( $matches[1] );
+       }
+
+       /**
+        * Check MySQL server's ft_min_word_len setting so we know
+        * if we need to pad short words...
+        * 
+        * @return int
+        */
+       protected function minSearchLength() {
+               if( is_null( self::$mMinSearchLength ) ) {
+                       $sql = "show global variables like 'ft\\_min\\_word\\_len'";
+
+                       // Even though this query is pretty fast, let's not overload the master
+                       $dbr = wfGetDB( DB_SLAVE );
+                       $result = $dbr->query( $sql );
+                       $row = $result->fetchObject();
+                       $result->free();
+
+                       if( $row && $row->Variable_name == 'ft_min_word_len' ) {
+                               self::$mMinSearchLength = intval( $row->Value );
+                       } else {
+                               self::$mMinSearchLength = 0;
+                       }
+               }
+               return self::$mMinSearchLength;
+       }
+
        public function setBigSelects( $value = true ) {
                if ( $value === 'default' ) {
                        if ( $this->mDefaultBigSelects === null ) {