From: Brion Vibber Date: Fri, 9 Jun 2006 23:23:37 +0000 (+0000) Subject: Revert bad patch for (bug 2069) Merge the LanguageUtf8 class into the Language class X-Git-Tag: 1.31.0-rc.0~56813 X-Git-Url: https://git.cyclocoop.org/%242?a=commitdiff_plain;h=d68b2efe5d485c4528d477325bd2e2c11651e1a6;p=lhc%2Fweb%2Fwiklou.git Revert bad patch for (bug 2069) Merge the LanguageUtf8 class into the Language class --- diff --git a/RELEASE-NOTES b/RELEASE-NOTES index d056362eda..3f7a2bfbe4 100644 --- a/RELEASE-NOTES +++ b/RELEASE-NOTES @@ -464,7 +464,6 @@ it from source control: http://www.mediawiki.org/wiki/Download_from_SVN * (bug 6243) Fix email for usernames containing dots when using PEAR::Mail * Remove a number of needless {{ns:project}}-type transforms from messages files. These usages already have separate label text. Such transforms are wasteful on each page view. -* (bug 2069) Merge the LanguageUtf8 class into the Language class * Update to Yiddish localization (yi) * (bug 6254) Update to Indonesian translation (id) #20 * (bug 6255) Fix transclusions starting with "#" or "*" in HTML attributes diff --git a/languages/Language.php b/languages/Language.php index 7779a34455..841fa55823 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -732,73 +732,41 @@ class Language { return iconv( $in, $out, $string ); } - function ucfirst( $str ) { - return $this->uc( $str, true ); - } - - function uc( $str, $first = false ) { - if ( function_exists( 'mb_strtoupper' ) ) - if ( $first ) - if ( $this->isMultibyte( $str ) ) - return mb_strtoupper( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 ); - else - return ucfirst( $str ); - else - return $this->isMultibyte( $str ) ? mb_strtoupper( $str ) : strtoupper( $str ); - else - if ( $this->isMultibyte( $str ) ) { - global $wikiUpperChars; - $x = $first ? '^' : ''; - return preg_replace( - "/$x([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e", - "strtr( \"\$1\" , \$wikiUpperChars )", - $str - ); - } else - return $first ? ucfirst( $str ) : strtoupper( $str ); - } - - function lcfirst( $str ) { - return $this->lc( $str, true ); - } - - function lc( $str, $first = false ) { - if ( function_exists( 'mb_strtolower' ) ) - if ( $first ) - if ( $this->isMultibyte( $str ) ) - return mb_strtolower( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 ); - else - return strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ); - else - return $this->isMultibyte( $str ) ? mb_strtolower( $str ) : strtolower( $str ); - else - if ( $this->isMultibyte( $str ) ) { - global $wikiLowerChars; - $x = $first ? '^' : ''; - return preg_replace( - "/$x([A-Z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e", - "strtr( \"\$1\" , \$wikiLowerChars )", - $str - ); - } else - return $first ? strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ) : strtolower( $str ); + function ucfirst( $string ) { + # For most languages, this is a wrapper for ucfirst() + return ucfirst( $string ); + } + + function uc( $str ) { + return strtoupper( $str ); + } + + function lcfirst( $s ) { + return strtolower( $s{0} ). substr( $s, 1 ); + } + + function lc( $str ) { + return strtolower( $str ); } function checkTitleEncoding( $s ) { global $wgInputEncoding; - if( is_array( $s ) ) { - wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' ); - } - # Check for non-UTF-8 URLs + # Check for UTF-8 URLs; Internet Explorer produces these if you + # type non-ASCII chars in the URL bar or follow unescaped links. $ishigh = preg_match( '/[\x80-\xff]/', $s); - if(!$ishigh) return $s; + $isutf = ($ishigh ? preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . + '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s ) : true ); - $isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . - '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s ); - if( $isutf8 ) return $s; + if( ($wgInputEncoding != 'utf-8') and $ishigh and $isutf ) + return @iconv( 'UTF-8', $wgInputEncoding, $s ); - return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s ); + if( ($wgInputEncoding == 'utf-8') and $ishigh and !$isutf ) + return utf8_encode( $s ); + + # Other languages can safely leave this function, or replace + # it with one to detect and convert another legacy encoding. + return $s; } /** @@ -806,33 +774,11 @@ class Language { * or characters which need to be converted for MySQL's * indexing to grok it correctly. Make such changes here. * - * @param string $string + * @param string $in * @return string */ - function stripForSearch( $string ) { - # MySQL fulltext index doesn't grok utf-8, so we - # need to fold cases and convert to hex - - # In Language:: it just returns lowercase, maybe - # all strtolower on stripped output or argument - # should be removed and all stripForSearch - # methods adjusted to that. - - wfProfileIn( "Language::stripForSearch" ); - if( function_exists( 'mb_strtolower' ) ) { - $out = preg_replace( - "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", - "'U8' . bin2hex( \"$1\" )", - mb_strtolower( $string ) ); - } else { - global $wikiLowerChars; - $out = preg_replace( - "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", - "'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )", - $string ); - } - wfProfileOut( "Language::stripForSearch" ); - return $out; + function stripForSearch( $in ) { + return strtolower( $in ); } function convertForSearchResult( $termsArray ) { @@ -850,10 +796,7 @@ class Language { * @return string */ function firstChar( $s ) { - preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . - '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches); - - return isset( $matches[1] ) ? $matches[1] : ""; + return $s[0]; } function initEncoding() { @@ -1038,7 +981,7 @@ class Language { # # $length does not include the optional ellipsis. # If $length is negative, snip from the beginning - function truncate( $string, $length, $ellipsis = "" ) { + function truncate( $string, $length, $ellipsis = '' ) { if( $length == 0 ) { return $ellipsis; } @@ -1047,24 +990,9 @@ class Language { } if( $length > 0 ) { $string = substr( $string, 0, $length ); - $char = ord( $string[strlen( $string ) - 1] ); - if ($char >= 0xc0) { - # We got the first byte only of a multibyte char; remove it. - $string = substr( $string, 0, -1 ); - } elseif( $char >= 0x80 && - preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' . - '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) ) { - # We chopped in the middle of a character; remove it - $string = $m[1]; - } return $string . $ellipsis; } else { $string = substr( $string, $length ); - $char = ord( $string[0] ); - if( $char >= 0x80 && $char < 0xc0 ) { - # We chopped in the middle of a character; remove the whole thing - $string = preg_replace( '/^[\x80-\xbf]+/', '', $string ); - } return $ellipsis . $string; } } @@ -1262,33 +1190,12 @@ class Language { return str_replace( '_', '-', strtolower( substr( get_class( $this ), 8 ) ) ); } - function isMultibyte( $str ) { - return (bool)preg_match( '/^[\x80-\xff]/', $str ); - } - function fallback8bitEncoding() { - # Windows codepage 1252 is a superset of iso 8859-1 - # override this to use difference source encoding to - # translate incoming 8-bit URLs. - return "windows-1252"; - } } -if( function_exists( 'mb_strtoupper' ) ) { - mb_internal_encoding('UTF-8'); -} else { - # Hack our own case conversion routines - - # Loading serialized arrays is faster than parsing code :P - $wikiUpperChars = $wgMemc->get( $key1 = "$wgDBname:utf8:upper" ); - $wikiLowerChars = $wgMemc->get( $key2 = "$wgDBname:utf8:lower" ); - - if(empty( $wikiUpperChars) || empty($wikiLowerChars )) { - require_once( "includes/Utf8Case.php" ); - $wgMemc->set( $key1, $wikiUpperChars ); - $wgMemc->set( $key2, $wikiLowerChars ); - } -} +# FIXME: Merge all UTF-8 support code into Language base class. +# We no longer support Latin-1 charset. +require_once( 'LanguageUtf8.php' ); # This should fail gracefully if there's not a localization available wfSuppressWarnings(); diff --git a/languages/LanguageUtf8.php b/languages/LanguageUtf8.php index fe05936bf6..d738624b77 100644 --- a/languages/LanguageUtf8.php +++ b/languages/LanguageUtf8.php @@ -1,12 +1,199 @@ get( $key1 = "$wgDBname:utf8:upper" ); + $wikiLowerChars = $wgMemc->get( $key2 = "$wgDBname:utf8:lower" ); + + if(empty( $wikiUpperChars) || empty($wikiLowerChars )) { + require_once( "includes/Utf8Case.php" ); + $wgMemc->set( $key1, $wikiUpperChars ); + $wgMemc->set( $key2, $wikiLowerChars ); + } +} + /** * Base stuff useful to all UTF-8 based language files * @package MediaWiki - * - * Will be deleted */ class LanguageUtf8 extends Language { + # These functions use mbstring library, if it is loaded + # or compiled and character mapping arrays otherwise. + # In case of language-specific character mismatch + # it should be dealt with in Language classes. + + function ucfirst( $str ) { + return LanguageUtf8::uc( $str, true ); + } + + function uc( $str, $first = false ) { + if ( function_exists( 'mb_strtoupper' ) ) + if ( $first ) + if ( LanguageUtf8::isMultibyte( $str ) ) + return mb_strtoupper( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 ); + else + return ucfirst( $str ); + else + return LanguageUtf8::isMultibyte( $str ) ? mb_strtoupper( $str ) : strtoupper( $str ); + else + if ( LanguageUtf8::isMultibyte( $str ) ) { + global $wikiUpperChars; + $x = $first ? '^' : ''; + return preg_replace( + "/$x([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e", + "strtr( \"\$1\" , \$wikiUpperChars )", + $str + ); + } else + return $first ? ucfirst( $str ) : strtoupper( $str ); + } + + function lcfirst( $str ) { + return LanguageUtf8::lc( $str, true ); + } + + function lc( $str, $first = false ) { + if ( function_exists( 'mb_strtolower' ) ) + if ( $first ) + if ( LanguageUtf8::isMultibyte( $str ) ) + return mb_strtolower( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 ); + else + return strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ); + else + return LanguageUtf8::isMultibyte( $str ) ? mb_strtolower( $str ) : strtolower( $str ); + else + if ( LanguageUtf8::isMultibyte( $str ) ) { + global $wikiLowerChars; + $x = $first ? '^' : ''; + return preg_replace( + "/$x([A-Z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e", + "strtr( \"\$1\" , \$wikiLowerChars )", + $str + ); + } else + return $first ? strtolower( substr( $str, 0, 1 ) ) . substr( $str, 1 ) : strtolower( $str ); + } + + function isMultibyte( $str ) { + return (bool)preg_match( '/^[\x80-\xff]/', $str ); + } + + function stripForSearch( $string ) { + # MySQL fulltext index doesn't grok utf-8, so we + # need to fold cases and convert to hex + + # In Language:: it just returns lowercase, maybe + # all strtolower on stripped output or argument + # should be removed and all stripForSearch + # methods adjusted to that. + + wfProfileIn( "LanguageUtf8::stripForSearch" ); + if( function_exists( 'mb_strtolower' ) ) { + $out = preg_replace( + "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", + "'U8' . bin2hex( \"$1\" )", + mb_strtolower( $string ) ); + } else { + global $wikiLowerChars; + $out = preg_replace( + "/([\\xc0-\\xff][\\x80-\\xbf]*)/e", + "'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )", + $string ); + } + wfProfileOut( "LanguageUtf8::stripForSearch" ); + return $out; + } + + function fallback8bitEncoding() { + # Windows codepage 1252 is a superset of iso 8859-1 + # override this to use difference source encoding to + # translate incoming 8-bit URLs. + return "windows-1252"; + } + + function checkTitleEncoding( $s ) { + global $wgInputEncoding; + + if( is_array( $s ) ) { + wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' ); + } + # Check for non-UTF-8 URLs + $ishigh = preg_match( '/[\x80-\xff]/', $s); + if(!$ishigh) return $s; + + $isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . + '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s ); + if( $isutf8 ) return $s; + + return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s ); + } + + function firstChar( $s ) { + preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . + '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches); + + return isset( $matches[1] ) ? $matches[1] : ""; + } + + # Crop a string from the beginning or end to a certain number of bytes. + # (Bytes are used because our storage has limited byte lengths for some + # columns in the database.) Multibyte charsets will need to make sure that + # only whole characters are included! + # + # $length does not include the optional ellipsis. + # If $length is negative, snip from the beginning + function truncate( $string, $length, $ellipsis = "" ) { + if( $length == 0 ) { + return $ellipsis; + } + if ( strlen( $string ) <= abs( $length ) ) { + return $string; + } + if( $length > 0 ) { + $string = substr( $string, 0, $length ); + $char = ord( $string[strlen( $string ) - 1] ); + if ($char >= 0xc0) { + # We got the first byte only of a multibyte char; remove it. + $string = substr( $string, 0, -1 ); + } elseif( $char >= 0x80 && + preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' . + '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) ) { + # We chopped in the middle of a character; remove it + $string = $m[1]; + } + return $string . $ellipsis; + } else { + $string = substr( $string, $length ); + $char = ord( $string[0] ); + if( $char >= 0x80 && $char < 0xc0 ) { + # We chopped in the middle of a character; remove the whole thing + $string = preg_replace( '/^[\x80-\xbf]+/', '', $string ); + } + return $ellipsis . $string; + } + } } +} # ifdef MEDIAWIKI + ?>