From 83da52c540500a539fef803b24da1728492c6c20 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Robert=20Stojni=C4=87?= Date: Wed, 19 Jul 2006 19:17:36 +0000 Subject: [PATCH] Enable UTF-8 lower/upper case operations in SearchEngine, search in different variants (if needed). Minor bug fixes for LanguageConverter: do not convert roman numbers and text between <code></code> tags into variants (e.g. cyrillic). --- includes/SearchEngine.php | 15 ++++------- includes/SpecialSearch.php | 15 +++++++++++ languages/Language.php | 15 +++++++++++ languages/LanguageConverter.php | 46 +++++++++++++++++++++++++++------ languages/LanguageSr.php | 15 ++++++----- languages/LanguageUtf8.php | 45 ++++++++++++++++++++++++++++++++ 6 files changed, 127 insertions(+), 24 deletions(-) diff --git a/includes/SearchEngine.php b/includes/SearchEngine.php index c3b38519a7..6af1e41596 100644 --- a/includes/SearchEngine.php +++ b/includes/SearchEngine.php @@ -51,6 +51,7 @@ class SearchEngine { * @private */ function getNearMatch( $term ) { + global $wgContLang; # Exact match? No need to look further. $title = Title::newFromText( $term ); if (is_null($title)) @@ -62,33 +63,27 @@ class SearchEngine { # Now try all lower case (i.e. 
first letter capitalized) # - $title = Title::newFromText( strtolower( $term ) ); + $title = Title::newFromText( $wgContLang->lc( $term ) ); if ( $title->exists() ) { return $title; } # Now try capitalized string # - $title = Title::newFromText( ucwords( strtolower( $term ) ) ); + $title = Title::newFromText( $wgContLang->ucwords( $term ) ); if ( $title->exists() ) { return $title; } # Now try all upper case # - $title = Title::newFromText( strtoupper( $term ) ); + $title = Title::newFromText( $wgContLang->uc( $term ) ); if ( $title->exists() ) { return $title; } # Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc - $title = Title::newFromText( preg_replace_callback( - '/\b([\w\x80-\xff]+)\b/', - create_function( '$matches', ' - global $wgContLang; - return $wgContLang->ucfirst($matches[1]); - ' ), - $term ) ); + $title = Title::newFromText( $wgContLang->ucwordbreaks($term) ); if ( $title->exists() ) { return $title; } diff --git a/includes/SpecialSearch.php b/includes/SpecialSearch.php index c4669854d6..a8aadfa0f0 100644 --- a/includes/SpecialSearch.php +++ b/includes/SpecialSearch.php @@ -77,6 +77,7 @@ class SpecialSearch { function goResult( $term ) { global $wgOut; global $wgGoToEdit; + global $wgContLang; $this->setupPage( $term ); @@ -96,6 +97,20 @@ class SpecialSearch { return; } + # if language supports variants, search in all variants + if(sizeof($wgContLang->getVariants())>1){ + $allTermVariants = $wgContLang->convertLinkToAllVariants($term); + + foreach($allTermVariants as $termVariant){ + $t = SearchEngine::getNearMatch( $termVariant ); + if( !is_null( $t ) ) { + $wgOut->redirect( $t->getFullURL() ); + wfProfileOut( $fname ); + return; + } + } + } + # No match, generate an edit URL $t = Title::newFromText( $term ); if( is_null( $t ) ) { diff --git a/languages/Language.php b/languages/Language.php index 650d5c3fe1..f001ee6f5b 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -755,6 +755,21 @@ class Language { return 
strtolower( $str ); } + function ucwords($str) { + return ucwords( strtolower( $str ) ); + } + + # capitalize words at word breaks + function ucwordbreaks($str){ + return preg_replace_callback( + '/\b([\w\x80-\xff]+)\b/', + create_function( '$matches', ' + global $wgContLang; + return $wgContLang->ucfirst($matches[1]); + ' ), + $str ); + } + function checkTitleEncoding( $s ) { global $wgInputEncoding; diff --git a/languages/LanguageConverter.php b/languages/LanguageConverter.php index 575df85289..afbde79032 100644 --- a/languages/LanguageConverter.php +++ b/languages/LanguageConverter.php @@ -120,6 +120,34 @@ class LanguageConverter { } } + /** + * This function should be called on bare text + * It translates text into variant, specials: + * - omitting roman numbers + */ + function translateText($text, $toVariant){ + $breaks = '[^\w\x80-\xff]'; + + // regexp for roman numbers + $roman = 'M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})'; + + $reg = '/^'.$roman.'$|^'.$roman.$breaks.'|'.$breaks.$roman.'$|'.$breaks.$roman.$breaks.'/'; + + $matches = preg_split($reg, $text, -1, PREG_SPLIT_OFFSET_CAPTURE); + + + $m = array_shift($matches); + $ret = strtr($m[0], $this->mTables[$toVariant]); + $mstart = $m[1]+strlen($m[0]); + foreach($matches as $m) { + $ret .= substr($text, $mstart, $m[1]-$mstart); + $ret .= strtr($m[0], $this->mTables[$toVariant]); + $mstart = $m[1] + strlen($m[0]); + } + + return $ret; + } + /** * dictionary-based conversion * @@ -153,19 +181,21 @@ class LanguageConverter { $marker = ""; // this one is needed when the text is inside an html markup - $htmlfix = '|<[^>]+=\"[^(>=)]*$|^[^(<>=\")]*\"[^>]*>'; + $htmlfix = '|<[^>]+$|^[^<>]*>'; + + // disable convert to variants between <code></code> tags + $codefix = '<code>.+?<\/code>|'; - $reg = '/<[^>]+>|&[a-z#][a-z0-9]+;' . $marker . $htmlfix . '/'; + $reg = '/'.$codefix.'<[^>]+>|&[a-z#][a-z0-9]+;' . $marker . $htmlfix . 
'/s'; $matches = preg_split($reg, $text, -1, PREG_SPLIT_OFFSET_CAPTURE); - $m = array_shift($matches); - $ret = strtr($m[0], $this->mTables[$toVariant]); + $ret = $this->translateText($m[0],$toVariant); $mstart = $m[1]+strlen($m[0]); foreach($matches as $m) { $ret .= substr($text, $mstart, $m[1]-$mstart); - $ret .= strtr($m[0], $this->mTables[$toVariant]); + $ret .= $this->translateText($m[0],$toVariant); $mstart = $m[1] + strlen($m[0]); } wfProfileOut( $fname ); @@ -187,7 +217,7 @@ class LanguageConverter { $ret = array(); foreach($this->mVariants as $variant) { - $ret[$variant] = strtr($text, $this->mTables[$variant]); + $ret[$variant] = $this->translateText($text,$variant); } if($includeFixedVariant) $ret[$this->mMainLanguageCode.'-fixed'] = $this->mMarkup['begin'].$text.$this->mMarkup['end']; @@ -212,7 +242,7 @@ class LanguageConverter { $tfirst = array_shift($tarray); foreach($this->mVariants as $variant) - $ret[$variant] = strtr($tfirst, $this->mTables[$variant]); + $ret[$variant] = $this->translateText($tfirst,$variant); foreach($tarray as $txt) { $marked = explode($this->mMarkup['end'], $txt, 2); @@ -220,7 +250,7 @@ class LanguageConverter { foreach($this->mVariants as $variant){ $ret[$variant] .= $this->mMarkup['begin'].$marked[0].$this->mMarkup['end']; if(array_key_exists(1, $marked)) - $ret[$variant] .= strtr($marked[1], $this->mTables[$variant]); + $ret[$variant] .= $this->translateText($marked[1],$variant); } } diff --git a/languages/LanguageSr.php b/languages/LanguageSr.php index 2eede4b632..d9063e20ca 100644 --- a/languages/LanguageSr.php +++ b/languages/LanguageSr.php @@ -196,14 +196,17 @@ class SrConverter extends LanguageConverter { class LanguageSr extends LanguageSr_ec { function __construct() { global $wgHooks; - $variants = array('sr', 'sr-ec', 'sr-jc', 'sr-el', 'sr-jl'); + + // these variants are currently UNUSED: + // 'sr-jc', 'sr-jl' + $variants = array('sr', 'sr-ec', 'sr-el'); $variantfallbacks = array( 'sr' => 'sr-ec', - 'sr-ec' => 
'sr-jc', - 'sr-jc' => 'sr-ec', - 'sr-el' => 'sr-jl', - 'sr-jl' => 'sr-el' - ); + 'sr-ec' => 'sr-ec', + 'sr-el' => 'sr-el', + ); + + $marker = array();//don't mess with these, leave them as they are $flags = array( 'S' => 'S', 'писмо' => 'S', 'pismo' => 'S', diff --git a/languages/LanguageUtf8.php b/languages/LanguageUtf8.php index d738624b77..c75dd7e6ee 100644 --- a/languages/LanguageUtf8.php +++ b/languages/LanguageUtf8.php @@ -94,6 +94,51 @@ class LanguageUtf8 extends Language { return $first ? strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ) : strtolower( $str ); } + function ucwords($str) { + global $wikiUpperChars; + + if ( LanguageUtf8::isMultibyte( $str ) ) { + $str = LanguageUtf8::lc($str); + + if ( function_exists( 'mb_strtoupper' ) ) + $replaceCall = "mb_strtoupper(\"\$0\")"; + else + $replaceCall = "strtr( \"\$0\" , \$wikiUpperChars )"; + + return preg_replace( + "/^([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)| ([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e", + $replaceCall, + $str + ); + } + else + return ucwords( strtolower( $str ) ); + } + + function ucwordbreaks($str){ + global $wikiUpperChars; + + if (LanguageUtf8::isMultibyte( $str ) ) { + $str = LanguageUtf8::lc($str); + + if ( function_exists( 'mb_strtoupper' ) ) + $replaceCall = "mb_strtoupper(\"\$0\")"; + else + $replaceCall = "strtr( \"\$0\" , \$wikiUpperChars )"; + + // since \b doesn't work for UTF-8, we explicitly define word break chars + $breaks= "[ \-\(\)\}\{\.,\?!]"; + + return preg_replace( + "/^([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)|$breaks([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e", + $replaceCall, + $str + ); + } + else + return Language::ucwordbreaks($str); + } + function isMultibyte( $str ) { return (bool)preg_match( '/^[\x80-\xff]/', $str ); } -- 2.20.1