search in different variants (if needed).
Minor bug fixes for LanguageConverter: do not convert
Roman numerals and text between <code></code> into
variants (e.g. Cyrillic).
* @private
*/
function getNearMatch( $term ) {
+ global $wgContLang;
# Exact match? No need to look further.
$title = Title::newFromText( $term );
if (is_null($title))
# Now try all lower case (i.e. first letter capitalized)
#
- $title = Title::newFromText( strtolower( $term ) );
+ $title = Title::newFromText( $wgContLang->lc( $term ) );
if ( $title->exists() ) {
return $title;
}
# Now try capitalized string
#
- $title = Title::newFromText( ucwords( strtolower( $term ) ) );
+ $title = Title::newFromText( $wgContLang->ucwords( $term ) );
if ( $title->exists() ) {
return $title;
}
# Now try all upper case
#
- $title = Title::newFromText( strtoupper( $term ) );
+ $title = Title::newFromText( $wgContLang->uc( $term ) );
if ( $title->exists() ) {
return $title;
}
# Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
- $title = Title::newFromText( preg_replace_callback(
- '/\b([\w\x80-\xff]+)\b/',
- create_function( '$matches', '
- global $wgContLang;
- return $wgContLang->ucfirst($matches[1]);
- ' ),
- $term ) );
+ $title = Title::newFromText( $wgContLang->ucwordbreaks($term) );
if ( $title->exists() ) {
return $title;
}
function goResult( $term ) {
global $wgOut;
global $wgGoToEdit;
+ global $wgContLang;
$this->setupPage( $term );
return;
}
+ # if language supports variants, search in all variants
+ if(sizeof($wgContLang->getVariants())>1){
+ $allTermVariants = $wgContLang->convertLinkToAllVariants($term);
+
+ foreach($allTermVariants as $termVariant){
+ $t = SearchEngine::getNearMatch( $termVariant );
+ if( !is_null( $t ) ) {
+ $wgOut->redirect( $t->getFullURL() );
+ wfProfileOut( $fname );
+ return;
+ }
+ }
+ }
+
# No match, generate an edit URL
$t = Title::newFromText( $term );
if( is_null( $t ) ) {
return strtolower( $str );
}
+ /**
+  * Lower-case the whole string, then upper-case the first letter of
+  * each word using PHP's byte-oriented built-ins.
+  *
+  * @param string $str text to capitalize
+  * @return string
+  */
+ function ucwords($str) {
+     $lowered = strtolower( $str );
+     return ucwords( $lowered );
+ }
+
+ /**
+  * Capitalize words at word breaks.
+  *
+  * Every run of \w characters and bytes 0x80-0xff that is delimited by
+  * PCRE word boundaries (\b) is passed through $wgContLang->ucfirst().
+  *
+  * @param string $str text to capitalize
+  * @return string
+  */
+ function ucwordbreaks($str){
+     # A named callback is used instead of create_function(): the latter
+     # compiled a new, never-freed function on every call and no longer
+     # exists in PHP 8.
+     return preg_replace_callback(
+         '/\b([\w\x80-\xff]+)\b/',
+         array( __CLASS__, 'ucwordbreaksCallback' ),
+         $str );
+ }
+
+ /**
+  * preg_replace_callback() helper for ucwordbreaks().
+  *
+  * @param array $matches match data; $matches[1] holds the matched word
+  * @return string the word as returned by $wgContLang->ucfirst()
+  */
+ static function ucwordbreaksCallback( $matches ) {
+     global $wgContLang;
+     return $wgContLang->ucfirst( $matches[1] );
+ }
+
function checkTitleEncoding( $s ) {
global $wgInputEncoding;
}
}
+ /**
+  * Translate bare text (no markup) into the given variant, leaving
+  * standalone Roman numerals untouched.
+  *
+  * The text is split at every Roman numeral that is not adjacent to a
+  * word character. The pieces in between are converted with strtr()
+  * using the table of $toVariant; the numerals are copied verbatim.
+  *
+  * @param string $text      text to convert
+  * @param string $toVariant key into $this->mTables
+  * @return string converted text
+  */
+ function translateText($text, $toVariant){
+     // a "word" byte: \w or any byte of a multi-byte character
+     $wordChar = '[\w\x80-\xff]';
+
+     // regexp for roman numbers; every part is optional, so by itself
+     // it would also match the empty string
+     $roman = 'M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})';
+
+     // Zero-width lookarounds are used for the word breaks so that the
+     // separator between two numerals ("XIX-XX", "I II") is not consumed
+     // by the first match, which would leave the second numeral
+     // unprotected. (?=[MDCLXVI]) rules out empty matches.
+     $reg = '/(?<!' . $wordChar . ')(?=[MDCLXVI])' . $roman . '(?!' . $wordChar . ')/';
+
+     $table = $this->mTables[$toVariant];
+     $pieces = preg_split($reg, $text, -1, PREG_SPLIT_OFFSET_CAPTURE);
+
+     $ret = '';
+     $pos = 0;
+     foreach($pieces as $piece) {
+         // copy the numeral that preg_split() removed before this piece
+         $ret .= substr($text, $pos, $piece[1] - $pos);
+         // convert the ordinary text
+         $ret .= strtr($piece[0], $table);
+         $pos = $piece[1] + strlen($piece[0]);
+     }
+
+     return $ret;
+ }
+
/**
* dictionary-based conversion
*
$marker = "";
// this one is needed when the text is inside an html markup
- $htmlfix = '|<[^>]+=\"[^(>=)]*$|^[^(<>=\")]*\"[^>]*>';
+ $htmlfix = '|<[^>]+$|^[^<>]*>';
+
+ // do not convert text between <code></code> tags into variants
+ $codefix = '<code>.+?<\/code>|';
- $reg = '/<[^>]+>|&[a-z#][a-z0-9]+;' . $marker . $htmlfix . '/';
+ $reg = '/'.$codefix.'<[^>]+>|&[a-z#][a-z0-9]+;' . $marker . $htmlfix . '/s';
$matches = preg_split($reg, $text, -1, PREG_SPLIT_OFFSET_CAPTURE);
-
$m = array_shift($matches);
- $ret = strtr($m[0], $this->mTables[$toVariant]);
+ $ret = $this->translateText($m[0],$toVariant);
$mstart = $m[1]+strlen($m[0]);
foreach($matches as $m) {
$ret .= substr($text, $mstart, $m[1]-$mstart);
- $ret .= strtr($m[0], $this->mTables[$toVariant]);
+ $ret .= $this->translateText($m[0],$toVariant);
$mstart = $m[1] + strlen($m[0]);
}
wfProfileOut( $fname );
$ret = array();
foreach($this->mVariants as $variant) {
- $ret[$variant] = strtr($text, $this->mTables[$variant]);
+ $ret[$variant] = $this->translateText($text,$variant);
}
if($includeFixedVariant)
$ret[$this->mMainLanguageCode.'-fixed'] = $this->mMarkup['begin'].$text.$this->mMarkup['end'];
$tfirst = array_shift($tarray);
foreach($this->mVariants as $variant)
- $ret[$variant] = strtr($tfirst, $this->mTables[$variant]);
+ $ret[$variant] = $this->translateText($tfirst,$variant);
foreach($tarray as $txt) {
$marked = explode($this->mMarkup['end'], $txt, 2);
foreach($this->mVariants as $variant){
$ret[$variant] .= $this->mMarkup['begin'].$marked[0].$this->mMarkup['end'];
if(array_key_exists(1, $marked))
- $ret[$variant] .= strtr($marked[1], $this->mTables[$variant]);
+ $ret[$variant] .= $this->translateText($marked[1],$variant);
}
}
class LanguageSr extends LanguageSr_ec {
function __construct() {
global $wgHooks;
- $variants = array('sr', 'sr-ec', 'sr-jc', 'sr-el', 'sr-jl');
+
+ // these variants are currently UNUSED:
+ // 'sr-jc', 'sr-jl'
+ $variants = array('sr', 'sr-ec', 'sr-el');
$variantfallbacks = array(
'sr' => 'sr-ec',
- 'sr-ec' => 'sr-jc',
- 'sr-jc' => 'sr-ec',
- 'sr-el' => 'sr-jl',
- 'sr-jl' => 'sr-el'
- );
+ 'sr-ec' => 'sr-ec',
+ 'sr-el' => 'sr-el',
+ );
+
+
$marker = array();//don't mess with these, leave them as they are
$flags = array(
'S' => 'S', 'писмо' => 'S', 'pismo' => 'S',
return $first ? strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ) : strtolower( $str );
}
+ /**
+  * Lower-case $str, then upper-case the first letter of every
+  * space-separated word.
+  *
+  * Strings that isMultibyte() does not report as multi-byte go through
+  * PHP's byte-oriented strtolower()/ucwords().
+  *
+  * @param string $str text to capitalize
+  * @return string
+  */
+ function ucwords($str) {
+     if ( !LanguageUtf8::isMultibyte( $str ) )
+         return ucwords( strtolower( $str ) );
+
+     $str = LanguageUtf8::lc($str);
+
+     // first letter of the string, or a letter following a space
+     $firstLetters = '/^([a-z]|[\xc0-\xff][\x80-\xbf]*)| ([a-z]|[\xc0-\xff][\x80-\xbf]*)/';
+
+     // preg_replace_callback() replaces the former /e (eval) modifier,
+     // which was deprecated in PHP 5.5 and removed in PHP 7.
+     return preg_replace_callback(
+         $firstLetters,
+         array( __CLASS__, 'ucMatchCallback' ),
+         $str
+     );
+ }
+
+ /**
+  * Lower-case $str, then upper-case the first letter of every word,
+  * where words are separated by an explicit set of break characters.
+  *
+  * Strings that isMultibyte() does not report as multi-byte are
+  * delegated to Language::ucwordbreaks().
+  *
+  * @param string $str text to capitalize
+  * @return string
+  */
+ function ucwordbreaks($str){
+     if ( !LanguageUtf8::isMultibyte( $str ) )
+         return Language::ucwordbreaks($str);
+
+     $str = LanguageUtf8::lc($str);
+
+     // since \b doesn't work for UTF-8, we explicitly define word break chars
+     $breaks = '[ \-\(\)\}\{\.,\?!]';
+
+     // first letter of the string, or a letter following a break char
+     $firstLetters = '/^([a-z]|[\xc0-\xff][\x80-\xbf]*)|' . $breaks . '([a-z]|[\xc0-\xff][\x80-\xbf]*)/';
+
+     // preg_replace_callback() replaces the former /e (eval) modifier,
+     // which was deprecated in PHP 5.5 and removed in PHP 7.
+     return preg_replace_callback(
+         $firstLetters,
+         array( __CLASS__, 'ucMatchCallback' ),
+         $str
+     );
+ }
+
+ /**
+  * preg_replace_callback() helper shared by ucwords() and ucwordbreaks().
+  *
+  * Upper-cases the whole match (an optional leading break character
+  * plus one letter) with mb_strtoupper() when available, otherwise
+  * with the $wikiUpperChars translation table.
+  *
+  * @param array $matches match data; $matches[0] is the full match
+  * @return string upper-cased match
+  */
+ static function ucMatchCallback( $matches ) {
+     global $wikiUpperChars;
+
+     if ( function_exists( 'mb_strtoupper' ) )
+         return mb_strtoupper( $matches[0] );
+
+     return strtr( $matches[0], $wikiUpperChars );
+ }
+
/**
 * Report whether the string starts with a byte in the range 0x80-0xff.
 *
 * Only the first byte is inspected (the pattern is anchored with ^), so
 * a string that begins with an ASCII character yields false even when
 * later bytes are non-ASCII.
 *
 * @param string $str
 * @return bool
 */
function isMultibyte( $str ) {
	$startsWithHighByte = preg_match( '/^[\x80-\xff]/', $str );
	return $startsWithHighByte ? true : false;
}