From f6dee6c1e75df2c06b2b280687591f19fd8dfa71 Mon Sep 17 00:00:00 2001 From: Max Semenik Date: Mon, 18 Jan 2010 20:54:43 +0000 Subject: [PATCH] Factored MySQL-specific munging out of Language::stripForSearch() to DatabaseMysql. This will also allow other backends to provide seamlessly their own munging algorithms in the future. --- includes/db/Database.php | 12 ++++++ includes/db/DatabaseMysql.php | 80 +++++++++++++++++++++++++++++++++++ languages/Language.php | 75 +++----------------------------- 3 files changed, 97 insertions(+), 70 deletions(-) diff --git a/includes/db/Database.php b/includes/db/Database.php index 28c3c94648..dcda5d8795 100644 --- a/includes/db/Database.php +++ b/includes/db/Database.php @@ -2366,6 +2366,18 @@ abstract class DatabaseBase { return "SearchMySQL"; } + /** + * When overridden in derived class, performs database-specific conversions + * on text to be used for searching or updating search index. + * Default implementation does nothing (simply returns $string). + * + * @param $string string: String to strip + * @return string + */ + public function stripForSearch( $string ) { + return $string; + } + /** * Allow or deny "big selects" for this session only. This is done by setting * the sql_big_selects session variable. diff --git a/includes/db/DatabaseMysql.php b/includes/db/DatabaseMysql.php index 5ae4a222c7..63f267c7af 100644 --- a/includes/db/DatabaseMysql.php +++ b/includes/db/DatabaseMysql.php @@ -7,6 +7,8 @@ * @see Database */ class DatabaseMysql extends DatabaseBase { + static $mMinSearchLength; + function getType() { return 'mysql'; } @@ -367,6 +369,84 @@ class DatabaseMysql extends DatabaseBase { $this->query( "UNLOCK TABLES", $method ); } + /** + * Converts some characters for MySQL's indexing to grok it correctly, + * and pads short words to overcome limitations. + */ + function stripForSearch( $string ) { + global $wgContLang; + + wfProfileIn( __METHOD__ ); + + // MySQL fulltext index doesn't grok utf-8, so we + // need to fold cases and convert to hex + $out = preg_replace_callback( + "/([\\xc0-\\xff][\\x80-\\xbf]*)/", + array( $this, 'stripForSearchCallback' ), + $wgContLang->lc( $string ) ); + + // And to add insult to injury, the default indexing + // ignores short words... Pad them so we can pass them + // through without reconfiguring the server... + $minLength = $this->minSearchLength(); + if( $minLength > 1 ) { + $n = $minLength - 1; + $out = preg_replace( + "/\b(\w{1,$n})\b/", + "$1u800", + $out ); + } + + // Periods within things like hostnames and IP addresses + // are also important -- we want a search for "example.com" + // or "192.168.1.1" to work sanely. + // + // MySQL's search seems to ignore them, so you'd match on + // "example.wikipedia.com" and "192.168.83.1" as well. + $out = preg_replace( + "/(\w)\.(\w|\*)/u", + "$1u82e$2", + $out ); + + wfProfileOut( __METHOD__ ); + + return $out; + } + + /** + * Armor a case-folded UTF-8 string to get through MySQL's + * fulltext search without being mucked up by funny charset + * settings or anything else of the sort. + */ + protected function stripForSearchCallback( $matches ) { + return 'u8' . bin2hex( $matches[1] ); + } + + /** + * Check MySQL server's ft_min_word_len setting so we know + * if we need to pad short words... + * + * @return int + */ + protected function minSearchLength() { + if( is_null( self::$mMinSearchLength ) ) { + $sql = "show global variables like 'ft\\_min\\_word\\_len'"; + + // Even though this query is pretty fast, let's not overload the master + $dbr = wfGetDB( DB_SLAVE ); + $result = $dbr->query( $sql ); + $row = $result->fetchObject(); + $result->free(); + + if( $row && $row->Variable_name == 'ft_min_word_len' ) { + self::$mMinSearchLength = intval( $row->Value ); + } else { + self::$mMinSearchLength = 0; + } + } + return self::$mMinSearchLength; + } + public function setBigSelects( $value = true ) { if ( $value === 'default' ) { if ( $this->mDefaultBigSelects === null ) { diff --git a/languages/Language.php b/languages/Language.php index 4cd4ffa12f..3ea96164d8 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -59,7 +59,6 @@ class Language { var $mNamespaceIds, $namespaceNames, $namespaceAliases; var $dateFormatStrings = array(); - var $minSearchLength; var $mExtendedSpecialPageAliases; /** @@ -1689,83 +1688,19 @@ class Language { } /** - * Some languages have special punctuation to strip out - * or characters which need to be converted for MySQL's - * indexing to grok it correctly. Make such changes here. + * Some languages have special punctuation to strip out. + * Make such changes here. * * @param $string String * @return String */ function stripForSearch( $string, $doStrip = true ) { - global $wgDBtype; - if ( $wgDBtype != 'mysql' || $doStrip == false ) { + if ( !$doStrip ) { return $string; } - wfProfileIn( __METHOD__ ); - - // MySQL fulltext index doesn't grok utf-8, so we - // need to fold cases and convert to hex - $out = preg_replace_callback( - "/([\\xc0-\\xff][\\x80-\\xbf]*)/", - array( $this, 'stripForSearchCallback' ), - $this->lc( $string ) ); - - // And to add insult to injury, the default indexing - // ignores short words... Pad them so we can pass them - // through without reconfiguring the server... - $minLength = $this->minSearchLength(); - if( $minLength > 1 ) { - $n = $minLength-1; - $out = preg_replace( - "/\b(\w{1,$n})\b/", - "$1u800", - $out ); - } - - // Periods within things like hostnames and IP addresses - // are also important -- we want a search for "example.com" - // or "192.168.1.1" to work sanely. - // - // MySQL's search seems to ignore them, so you'd match on - // "example.wikipedia.com" and "192.168.83.1" as well. - $out = preg_replace( - "/(\w)\.(\w|\*)/u", - "$1u82e$2", - $out ); - - wfProfileOut( __METHOD__ ); - return $out; - } - - /** - * Armor a case-folded UTF-8 string to get through MySQL's - * fulltext search without being mucked up by funny charset - * settings or anything else of the sort. - */ - protected function stripForSearchCallback( $matches ) { - return 'u8' . bin2hex( $matches[1] ); - } - - /** - * Check MySQL server's ft_min_word_len setting so we know - * if we need to pad short words... - */ - protected function minSearchLength() { - if( is_null( $this->minSearchLength ) ) { - $sql = "show global variables like 'ft\\_min\\_word\\_len'"; - $dbr = wfGetDB( DB_SLAVE ); - $result = $dbr->query( $sql ); - $row = $result->fetchObject(); - $result->free(); - - if( $row && $row->Variable_name == 'ft_min_word_len' ) { - $this->minSearchLength = intval( $row->Value ); - } else { - $this->minSearchLength = 0; - } - } - return $this->minSearchLength; + $dbr = wfGetDB( DB_SLAVE ); + return $dbr->stripForSearch( $string ); } /** -- 2.20.1