* (bug 8445) Multiple-character search terms are now handled properly for Chinese

author Brion Vibber <brion@users.mediawiki.org>

Wed, 24 Jun 2009 02:27:51 +0000 (02:27 +0000)

committer Brion Vibber <brion@users.mediawiki.org>

Wed, 24 Jun 2009 02:27:51 +0000 (02:27 +0000)
author Brion Vibber <brion@users.mediawiki.org>
Wed, 24 Jun 2009 02:27:51 +0000 (02:27 +0000)
committer Brion Vibber <brion@users.mediawiki.org>
Wed, 24 Jun 2009 02:27:51 +0000 (02:27 +0000)
diff --git a/RELEASE-NOTES b/RELEASE-NOTES

index b4a773b..560045e 100644 (file)
--- a/RELEASE-NOTES
+++ b/RELEASE-NOTES
@@ -206,6 +206,7 @@ it from source control: http://www.mediawiki.org/wiki/Download_from_SVN
    via extensions not using the userCan hook and via $wgRevokePermissions now work.
  * (bug 19157) createAndPromote error on bad password
  * (bug 18768) Remove AdminSettings.php from MediaWiki core
+* (bug 8445) Multiple-character search terms are now handled properly for Chinese
  
  == API changes in 1.16 ==
  
diff --git a/includes/SearchMySQL.php b/includes/SearchMySQL.php

index 78aa60f..6709033 100644 (file)
--- a/includes/SearchMySQL.php
+++ b/includes/SearchMySQL.php
@@ -48,45 +48,94 @@ class SearchMySQL extends SearchEngine {
                 $m = array();
                 if( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/',
                           $filteredText, $m, PREG_SET_ORDER ) ) {
-                       foreach( $m as $terms ) {
+                       foreach( $m as $bits ) {
+                               @list( /* all */, $modifier, $term, $nonQuoted, $wildcard ) = $bits;
+                               
+                               if( $nonQuoted != '' ) {
+                                       $term = $nonQuoted;
+                                       $quote = '';
+                               } else {
+                                       $term = str_replace( '"', '', $term );
+                                       $quote = '"';
+                               }
+                       
                                 if( $searchon !== '' ) $searchon .= ' ';
-                               if( $this->strictMatching && ($terms[1] == '') ) {
-                                       $terms[1] = '+';
+                               if( $this->strictMatching && ($modifier == '') ) {
+                                       // If we leave this out, boolean op defaults to OR which is rarely helpful.
+                                       $modifier = '+';
                                 }
-                               // Search terms in all variant forms, only
-                               // apply on wiki with LanguageConverter
-                               $temp_terms = $wgContLang->autoConvertToAllVariants( $terms[2] );
-                               if( is_array( $temp_terms )) {
-                                       $temp_terms = array_unique( array_values( $temp_terms ));
-                                       foreach( $temp_terms as $t )
-                                               $searchon .= $terms[1] . $wgContLang->stripForSearch( $t ) . ' ';
+                               
+                               // Some languages such as Serbian store the input form in the search index,
+                               // so we may need to search for matches in multiple writing system variants.
+                               $convertedVariants = $wgContLang->autoConvertToAllVariants( $term );
+                               if( is_array( $convertedVariants ) ) {
+                                       $variants = array_unique( array_values( $convertedVariants ) );
+                               } else {
+                                       $variants = array( $term );
                                 }
-                               else
-                                       $searchon .= $terms[1] . $wgContLang->stripForSearch( $terms[2] );
-                               if( !empty( $terms[3] ) ) {
-                                       // Match individual terms in result highlighting...
-                                       $regexp = preg_quote( $terms[3], '/' );
-                                       if( $terms[4] ) {
-                                               $regexp = "\b$regexp"; // foo*
-                                       } else {
-                                               $regexp = "\b$regexp\b";
+                               
+                               // The low-level search index does some processing on input to work
+                               // around problems with minimum lengths and encoding in MySQL's
+                               // fulltext engine.
+                               // For Chinese this also inserts spaces between adjacent Han characters.
+                               $strippedVariants = array_map(
+                                       array( $wgContLang, 'stripForSearch' ),
+                                       $variants );
+                               
+                               // Some languages such as Chinese force all variants to a canonical
+                               // form when stripping to the low-level search index, so to be sure
+                               // let's check our variants list for unique items after stripping.
+                               $strippedVariants = array_unique( $strippedVariants );
+                               
+                               $searchon .= $modifier;
+                               if( count( $strippedVariants) > 1 )
+                                       $searchon .= '(';
+                               foreach( $strippedVariants as $stripped ) {
+                                       if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) {
+                                               // Hack for Chinese: we need to toss in quotes for
+                                               // multiple-character phrases since stripForSearch()
+                                               // added spaces between them to make word breaks.
+                                               $stripped = '"' . trim( $stripped ) . '"';
                                         }
-                               } else {
-                                       // Match the quoted term in result highlighting...
-                                       $regexp = preg_quote( str_replace( '"', '', $terms[2] ), '/' );
+                                       $searchon .= "$quote$stripped$quote$wildcard ";
                                 }
+                               if( count( $strippedVariants) > 1 )
+                                       $searchon .= ')';
+                               
+                               // Match individual terms or quoted phrase in result highlighting...
+                               // Note that variants will be introduced in a later stage for highlighting!
+                               $regexp = $this->regexTerm( $term, $wildcard );
                                 $this->searchTerms[] = $regexp;
                         }
-                       wfDebug( "Would search with '$searchon'\n" );
-                       wfDebug( 'Match with /' . implode( '|', $this->searchTerms ) . "/\n" );
+                       wfDebug( __METHOD__ . ": Would search with '$searchon'\n" );
+                       wfDebug( __METHOD__ . ': Match with /' . implode( '|', $this->searchTerms ) . "/\n" );
                 } else {
-                       wfDebug( "Can't understand search query '{$filteredText}'\n" );
+                       wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'\n" );
                 }
  
                 $searchon = $this->db->strencode( $searchon );
                 $field = $this->getIndexField( $fulltext );
                 return " MATCH($field) AGAINST('$searchon' IN BOOLEAN MODE) ";
         }
+       
+       /**
+        * Build the regex fragment used to match one search term when highlighting.
+        * @param $string String: literal term text; escaped here for a '/' delimiter
+        * @param $wildcard String: non-empty when the term is a prefix match (foo*)
+        * @return String: regex fragment without delimiters
+        */
+       function regexTerm( $string, $wildcard ) {
+               global $wgContLang;
+               
+               $quoted = preg_quote( $string, '/' );
+               if( !$wgContLang->hasWordBreaks() ) {
+                       // e.g. Chinese: words may legitimately abut other words, so no \b
+                       // checks... note this could cause false positives for latin chars.
+                       return $quoted;
+               }
+               // For a wildcard term leave the tail open so the final bit isn't cut off.
+               return $wildcard ? "\b$quoted" : "\b$quoted\b";
+       }
  
         public static function legalSearchChars() {
                 return "\"*" . parent::legalSearchChars();
diff --git a/languages/Language.php b/languages/Language.php

index e0fb273..65c62a3 100644 (file)
--- a/languages/Language.php
+++ b/languages/Language.php
@@ -1594,6 +1594,16 @@ class Language {
                 return $this->fallback8bitEncoding;
         }
         
+       /**
+        * Whether this language conventionally separates words with
+        * whitespace. This base implementation returns true; languages
+        * such as Chinese, which don't, need special handling when
+        * breaking text up into words for searching etc.
+        */
+       function hasWordBreaks() {
+               return true;
+       }
+       
         /**
          * Some languages have special punctuation to strip out
          * or characters which need to be converted for MySQL's
diff --git a/languages/classes/LanguageZh.php b/languages/classes/LanguageZh.php

index cbb748c..d937d58 100644 (file)
--- a/languages/classes/LanguageZh.php
+++ b/languages/classes/LanguageZh.php
@@ -175,19 +175,12 @@ class LanguageZh extends LanguageZh_hans {
         function stripForSearch( $string ) {
                 wfProfileIn( __METHOD__ );
  
-               // eventually this should be a word segmentation
-               // for now just treat each character as a word
-               // @fixme only do this for Han characters...
-               $t = preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
-                               " $1", $string);
-
          //always convert to zh-hans before indexing. it should be
                 //better to use zh-hans for search, since conversion from
                 //Traditional to Simplified is less ambiguous than the
                 //other way around
  
-               $t = $this->mConverter->autoConvert($t, 'zh-hans');
+               $t = $this->mConverter->autoConvert( $string, 'zh-hans' );
                 $t = parent::stripForSearch( $t );
                 wfProfileOut( __METHOD__ );
                 return $t;
diff --git a/languages/classes/LanguageZh_hans.php b/languages/classes/LanguageZh_hans.php

index 983dd48..081737c 100644 (file)
--- a/languages/classes/LanguageZh_hans.php
+++ b/languages/classes/LanguageZh_hans.php
@@ -4,21 +4,23 @@
   * @ingroup Language
   */
  class LanguageZh_hans extends Language {
+       function hasWordBreaks() {
+               return false;
+       }
+       
         function stripForSearch( $string ) {
-               # MySQL fulltext index doesn't grok utf-8, so we
-               # need to fold cases and convert to hex
-               # we also separate characters as "words"
-               if( function_exists( 'mb_strtolower' ) ) {
-                       return preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-                               "' U8' . bin2hex( \"$1\" )",
-                               mb_strtolower( $string ) );
-               } else {
-                       list( , $wikiLowerChars ) = Language::getCaseMaps();
-                       return preg_replace(
-                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
-                               "' U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
-                               $string );
-               }
+               // Eventually this should be a word segmentation;
+               // for now just treat each character as a word.
+               // The pattern matches one multibyte UTF-8 sequence (lead byte
+               // plus continuation bytes). We put a space on both sides so a
+               // number or Latin char following a Han char is split off too,
+               // then collapse runs of spaces and trim the ends.
+               // @fixme only do this for Han characters...
+               $t = preg_replace(
+                               "/([\\xc0-\\xff][\\x80-\\xbf]*)/",
+                               " $1 ", $string);
+               $t = preg_replace( '/ +/', ' ', $t );
+               $t = trim( $t );
+               return parent::stripForSearch( $t );
         }
  }
author	Brion Vibber <brion@users.mediawiki.org>
	Wed, 24 Jun 2009 02:27:51 +0000 (02:27 +0000)
committer	Brion Vibber <brion@users.mediawiki.org>
	Wed, 24 Jun 2009 02:27:51 +0000 (02:27 +0000)
RELEASE-NOTES		patch | blob | history
includes/SearchMySQL.php		patch | blob | history
languages/Language.php		patch | blob | history
languages/classes/LanguageZh.php		patch | blob | history
languages/classes/LanguageZh_hans.php		patch | blob | history