Enable UTF-8 lower/upper case operations in SearchEngine,
author Robert Stojnić <rainman@users.mediawiki.org>
Wed, 19 Jul 2006 19:17:36 +0000 (19:17 +0000)
committer Robert Stojnić <rainman@users.mediawiki.org>
Wed, 19 Jul 2006 19:17:36 +0000 (19:17 +0000)
search in different variants (if needed).
Minor bug fixes for LanguageConverter: do not convert
Roman numerals and text between <code></code> into
variants (e.g. Cyrillic).

includes/SearchEngine.php
includes/SpecialSearch.php
languages/Language.php
languages/LanguageConverter.php
languages/LanguageSr.php
languages/LanguageUtf8.php

index c3b3851..6af1e41 100644 (file)
@@ -51,6 +51,7 @@ class SearchEngine {
         * @private
         */
        function getNearMatch( $term ) {
+               global $wgContLang;
                # Exact match? No need to look further.
                $title = Title::newFromText( $term );
                if (is_null($title))
@@ -62,33 +63,27 @@ class SearchEngine {
 
                # Now try all lower case (i.e. first letter capitalized)
                #
-               $title = Title::newFromText( strtolower( $term ) );
+               $title = Title::newFromText( $wgContLang->lc( $term ) );
                if ( $title->exists() ) {
                        return $title;
                }
 
                # Now try capitalized string
                #
-               $title = Title::newFromText( ucwords( strtolower( $term ) ) );
+               $title = Title::newFromText( $wgContLang->ucwords( $term ) );
                if ( $title->exists() ) {
                        return $title;
                }
 
                # Now try all upper case
                #
-               $title = Title::newFromText( strtoupper( $term ) );
+               $title = Title::newFromText( $wgContLang->uc( $term ) );
                if ( $title->exists() ) {
                        return $title;
                }
 
                # Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
-               $title = Title::newFromText( preg_replace_callback(
-                       '/\b([\w\x80-\xff]+)\b/',
-                       create_function( '$matches', '
-                               global $wgContLang;
-                               return $wgContLang->ucfirst($matches[1]);
-                               ' ),
-                       $term ) );
+               $title = Title::newFromText( $wgContLang->ucwordbreaks($term) );
                if ( $title->exists() ) {
                        return $title;
                }
index c466985..a8aadfa 100644 (file)
@@ -77,6 +77,7 @@ class SpecialSearch {
        function goResult( $term ) {
                global $wgOut;
                global $wgGoToEdit;
+               global $wgContLang;
 
                $this->setupPage( $term );
 
@@ -96,6 +97,20 @@ class SpecialSearch {
                        return;
                }
 
+               # if language supports variants, search in all variants
+               if(sizeof($wgContLang->getVariants())>1){
+                       $allTermVariants = $wgContLang->convertLinkToAllVariants($term);
+
+                       foreach($allTermVariants as $termVariant){
+                               $t = SearchEngine::getNearMatch( $termVariant );
+                               if( !is_null( $t ) ) {
+                                       $wgOut->redirect( $t->getFullURL() );
+                                       wfProfileOut( $fname );
+                                       return;
+                               }
+                       }
+               }
+
                # No match, generate an edit URL
                $t = Title::newFromText( $term );
                if( is_null( $t ) ) {
index 650d5c3..f001ee6 100644 (file)
@@ -755,6 +755,21 @@ class Language {
                return strtolower( $str );
        }
 
+       function ucwords($str) {
+               return ucwords( strtolower( $str ) );
+       }
+
+  # capitalize words at word breaks
+       function ucwordbreaks($str){
+               return preg_replace_callback(
+                       '/\b([\w\x80-\xff]+)\b/',
+                       create_function( '$matches', '
+                               global $wgContLang;
+                               return $wgContLang->ucfirst($matches[1]);
+                               ' ),
+                       $str );
+       }
+
        function checkTitleEncoding( $s ) {
                global $wgInputEncoding;
 
index 575df85..afbde79 100644 (file)
@@ -120,6 +120,34 @@ class LanguageConverter {
                }
        }
 
+       /**
+        *  This function should be called on bare text.
+        *  It translates the text into the given variant, with one special case:
+        *    - Roman numerals are left untranslated
+        */
+       function translateText($text, $toVariant){
+               $breaks = '[^\w\x80-\xff]';
+
+               // regexp for roman numbers
+               $roman = 'M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})';
+
+               $reg = '/^'.$roman.'$|^'.$roman.$breaks.'|'.$breaks.$roman.'$|'.$breaks.$roman.$breaks.'/';
+
+               $matches = preg_split($reg, $text, -1, PREG_SPLIT_OFFSET_CAPTURE);
+
+               
+               $m = array_shift($matches);
+               $ret = strtr($m[0], $this->mTables[$toVariant]);
+               $mstart = $m[1]+strlen($m[0]);
+               foreach($matches as $m) {
+                       $ret .= substr($text, $mstart, $m[1]-$mstart);
+                       $ret .= strtr($m[0], $this->mTables[$toVariant]);
+                       $mstart = $m[1] + strlen($m[0]);
+               }
+
+               return $ret;
+       }
+
        /**
      * dictionary-based conversion
      *
@@ -153,19 +181,21 @@ class LanguageConverter {
                        $marker = "";
 
                // this one is needed when the text is inside an html markup
-               $htmlfix = '|<[^>]+=\"[^(>=)]*$|^[^(<>=\")]*\"[^>]*>';
+               $htmlfix = '|<[^>]+$|^[^<>]*>';
+
+               // disable conversion to variants between <code></code> tags
+               $codefix = '<code>.+?<\/code>|';
 
-               $reg = '/<[^>]+>|&[a-z#][a-z0-9]+;' . $marker . $htmlfix . '/';
+               $reg = '/'.$codefix.'<[^>]+>|&[a-z#][a-z0-9]+;' . $marker . $htmlfix . '/s';
        
                $matches = preg_split($reg, $text, -1, PREG_SPLIT_OFFSET_CAPTURE);
 
-
                $m = array_shift($matches);
-               $ret = strtr($m[0], $this->mTables[$toVariant]);
+               $ret = $this->translateText($m[0],$toVariant);
                $mstart = $m[1]+strlen($m[0]);
                foreach($matches as $m) {
                        $ret .= substr($text, $mstart, $m[1]-$mstart);
-                       $ret .= strtr($m[0], $this->mTables[$toVariant]);
+                       $ret .= $this->translateText($m[0],$toVariant);
                        $mstart = $m[1] + strlen($m[0]);
                }
                wfProfileOut( $fname );
@@ -187,7 +217,7 @@ class LanguageConverter {
 
                $ret = array();
                foreach($this->mVariants as $variant) {
-                       $ret[$variant] = strtr($text, $this->mTables[$variant]);
+                       $ret[$variant] = $this->translateText($text,$variant);
                }
                if($includeFixedVariant)
                        $ret[$this->mMainLanguageCode.'-fixed'] = $this->mMarkup['begin'].$text.$this->mMarkup['end'];
@@ -212,7 +242,7 @@ class LanguageConverter {
                $tfirst = array_shift($tarray);
 
                foreach($this->mVariants as $variant)
-                       $ret[$variant] = strtr($tfirst, $this->mTables[$variant]);
+                       $ret[$variant] = $this->translateText($tfirst,$variant);
 
                foreach($tarray as $txt) {
                        $marked = explode($this->mMarkup['end'], $txt, 2);
@@ -220,7 +250,7 @@ class LanguageConverter {
                        foreach($this->mVariants as $variant){
                                $ret[$variant] .= $this->mMarkup['begin'].$marked[0].$this->mMarkup['end'];
                                if(array_key_exists(1, $marked))
-                                       $ret[$variant] .= strtr($marked[1], $this->mTables[$variant]);
+                                       $ret[$variant] .= $this->translateText($marked[1],$variant);
                        }
                        
                }
index 2eede4b..d9063e2 100644 (file)
@@ -196,14 +196,17 @@ class SrConverter extends LanguageConverter {
 class LanguageSr extends LanguageSr_ec {
        function __construct() {
                global $wgHooks;
-               $variants = array('sr', 'sr-ec', 'sr-jc', 'sr-el', 'sr-jl');
+
+               // these variants are currently UNUSED:
+               // 'sr-jc', 'sr-jl' 
+               $variants = array('sr', 'sr-ec', 'sr-el');
                $variantfallbacks = array(
                        'sr'    => 'sr-ec',
-                       'sr-ec' => 'sr-jc',
-                       'sr-jc' => 'sr-ec',
-                       'sr-el' => 'sr-jl',
-                       'sr-jl' => 'sr-el'
-               );
+                       'sr-ec' => 'sr-ec',
+                       'sr-el' => 'sr-el',
+                       ); 
+
+
                $marker = array();//don't mess with these, leave them as they are
                $flags = array(
                        'S' => 'S', 'писмо' => 'S', 'pismo' => 'S',
index d738624..c75dd7e 100644 (file)
@@ -94,6 +94,51 @@ class LanguageUtf8 extends Language {
                                return $first ? strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ) : strtolower( $str );
        }
 
+       function ucwords($str) {
+               global $wikiUpperChars;
+
+               if ( LanguageUtf8::isMultibyte( $str ) ) {
+                       $str = LanguageUtf8::lc($str);
+
+                       if ( function_exists( 'mb_strtoupper' ) )
+                               $replaceCall = "mb_strtoupper(\"\$0\")";
+                       else 
+                               $replaceCall = "strtr( \"\$0\" , \$wikiUpperChars )";
+
+                       return preg_replace(
+                                       "/^([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)| ([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e",
+                                       $replaceCall,
+                                       $str
+                               );
+               }
+               else
+                       return ucwords( strtolower( $str ) );
+       }       
+
+       function ucwordbreaks($str){
+               global $wikiUpperChars;
+
+               if (LanguageUtf8::isMultibyte( $str ) ) {
+                       $str = LanguageUtf8::lc($str);
+
+                       if ( function_exists( 'mb_strtoupper' ) )
+                               $replaceCall = "mb_strtoupper(\"\$0\")";
+                       else 
+                               $replaceCall = "strtr( \"\$0\" , \$wikiUpperChars )";
+
+                       // since \b doesn't work for UTF-8, we explicitly define word break chars
+                       $breaks= "[ \-\(\)\}\{\.,\?!]";
+
+                       return preg_replace(
+                                       "/^([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)|$breaks([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e",
+                                       $replaceCall,
+                                       $str
+                               );
+               }
+               else
+                       return Language::ucwordbreaks($str);
+       }
+
        function isMultibyte( $str ) {
                return (bool)preg_match( '/^[\x80-\xff]/', $str );
        }