7 if( defined( "MEDIAWIKI" ) ) {
9 # This file and LanguageLatin1.php may be included from within functions, so
10 # we need to have global statements
global $wgInputEncoding, $wgOutputEncoding, $wikiUpperChars, $wikiLowerChars;
global $wgDBname, $wgMemc;

$wgInputEncoding = "UTF-8";
$wgOutputEncoding = "UTF-8";

if( function_exists( 'mb_strtoupper' ) ) {
	# mbstring is available; the case conversion methods below use it directly.
	mb_internal_encoding('UTF-8');
} else {
	# Hack our own case conversion routines

	# Loading serialized arrays is faster than parsing code :P
	$key1 = "$wgDBname:utf8:upper";
	$key2 = "$wgDBname:utf8:lower";
	$wikiUpperChars = $wgMemc->get( $key1 );
	$wikiLowerChars = $wgMemc->get( $key2 );

	if( empty( $wikiUpperChars ) || empty( $wikiLowerChars ) ) {
		# Cache miss: Utf8Case.php presumably populates both tables (verify
		# against that file); store them so the next request can skip the parse.
		require_once( "includes/Utf8Case.php" );
		$wgMemc->set( $key1, $wikiUpperChars );
		$wgMemc->set( $key2, $wikiLowerChars );
	}
}
/**
 * Base stuff useful to all UTF-8 based language files
 */
38 class LanguageUtf8
extends Language
{
40 # These functions use mbstring library, if it is loaded
41 # or compiled and character mapping arrays otherwise.
42 # In case of language-specific character mismatch
43 # it should be dealt with in Language classes.
/**
 * Upper-case only the first character of a UTF-8 string.
 *
 * @param string $str Text to convert
 * @return string Result of uc() called with $first = true
 */
function ucfirst( $str ) {
	return $this->uc( $str, true );
}
/**
 * Upper-case a UTF-8 string, or only its first character.
 *
 * Uses mbstring when it is loaded; otherwise falls back to the global
 * $wikiUpperChars replacement table.
 *
 * @param string $str   Text to convert
 * @param bool   $first When true, only the first character is converted
 * @return string
 */
function uc( $str, $first = false ) {
	if ( function_exists( 'mb_strtoupper' ) ) {
		return $first
			? mb_strtoupper( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 )
			: mb_strtoupper( $str );
	}

	global $wikiUpperChars;
	if ( !is_array( $wikiUpperChars ) || !$wikiUpperChars ) {
		# The table was never loaded; return the text untouched rather
		# than feeding strtr() an unusable replacement map.
		return $str;
	}
	$table = $wikiUpperChars;

	# Anchor the pattern so that only the leading character is matched.
	$x = $first ? '^' : '';

	# Each match is one ASCII lower-case letter or one multibyte lead byte
	# plus its continuation bytes. A callback replaces the former /e
	# modifier, which was removed in PHP 7.
	return preg_replace_callback(
		"/$x([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/",
		function ( $m ) use ( $table ) {
			return strtr( $m[1], $table );
		},
		$str
	);
}
/**
 * Lower-case only the first character of a UTF-8 string.
 *
 * @param string $str Text to convert
 * @return string Result of lc() called with $first = true
 */
function lcfirst( $str ) {
	return $this->lc( $str, true );
}
/**
 * Lower-case a UTF-8 string, or only its first character.
 *
 * Uses mbstring when it is loaded; otherwise falls back to the global
 * $wikiLowerChars replacement table.
 *
 * @param string $str   Text to convert
 * @param bool   $first When true, only the first character is converted
 * @return string
 */
function lc( $str, $first = false ) {
	if ( function_exists( 'mb_strtolower' ) ) {
		return $first
			? mb_strtolower( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 )
			: mb_strtolower( $str );
	}

	global $wikiLowerChars;
	if ( !is_array( $wikiLowerChars ) || !$wikiLowerChars ) {
		# The table was never loaded; return the text untouched rather
		# than feeding strtr() an unusable replacement map.
		return $str;
	}
	$table = $wikiLowerChars;

	# Anchor the pattern so that only the leading character is matched.
	$x = $first ? '^' : '';

	# Each match is one ASCII upper-case letter or one multibyte lead byte
	# plus its continuation bytes. A callback replaces the former /e
	# modifier, which was removed in PHP 7.
	return preg_replace_callback(
		"/$x([A-Z]|[\\xc0-\\xff][\\x80-\\xbf]*)/",
		function ( $m ) use ( $table ) {
			return strtr( $m[1], $table );
		},
		$str
	);
}
/**
 * Prepare text for the search index: non-ASCII characters are
 * lower-cased and replaced by 'U8' followed by the hex of their bytes.
 *
 * @param string $string Text to normalise
 * @return string
 */
function stripForSearch( $string ) {
	# MySQL fulltext index doesn't grok utf-8, so we
	# need to fold cases and convert to hex

	# In Language:: it just returns lowercase, maybe
	# all strtolower on stripped output or argument
	# should be removed and all stripForSearch
	# methods adjusted to that.

	wfProfileIn( "LanguageUtf8::stripForSearch" );
	if( function_exists( 'mb_strtolower' ) ) {
		# Callbacks replace the former /e modifier, which was removed in PHP 7.
		$out = preg_replace_callback(
			"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
			function ( $m ) {
				return 'U8' . bin2hex( $m[1] );
			},
			mb_strtolower( $string ) );
	} else {
		global $wikiLowerChars;
		# Tolerate a table that was never loaded: hex-encode without folding.
		$table = is_array( $wikiLowerChars ) ? $wikiLowerChars : array();
		# Only the matched multibyte sequences are case-folded here; ASCII
		# text passes through unchanged in this branch.
		$out = preg_replace_callback(
			"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
			function ( $m ) use ( $table ) {
				$folded = $table ? strtr( $m[1], $table ) : $m[1];
				return 'U8' . bin2hex( $folded );
			},
			$string );
	}
	wfProfileOut( "LanguageUtf8::stripForSearch" );
	return $out;
}
/**
 * Name of the 8-bit encoding assumed for incoming URLs that are not
 * valid UTF-8.
 *
 * @return string
 */
function fallback8bitEncoding() {
	# Windows codepage 1252 is a superset of iso 8859-1
	# override this to use a different source encoding to
	# translate incoming 8-bit URLs.
	return "windows-1252";
}
/**
 * Return $s as UTF-8, converting it from the fallback 8-bit encoding
 * when it contains high bytes that do not form UTF-8 sequences.
 *
 * @param string $s Incoming title text
 * @return string
 */
function checkTitleEncoding( $s ) {
	if( is_array( $s ) ) {
		wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' );
	}

	# Check for non-UTF-8 URLs
	$ishigh = preg_match( '/[\x80-\xff]/', $s);
	if( !$ishigh ) {
		# Pure ASCII needs no conversion.
		return $s;
	}

	# Structural check only: lead byte followed by the right number of
	# continuation bytes, for sequences of one to four bytes.
	$isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
		'[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
	if( $isutf8 ) {
		return $s;
	}

	return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s );
}
/**
 * Return the first UTF-8 character (one to four bytes) of a string.
 *
 * @param string $s
 * @return string The leading character, or "" when $s does not start
 *                with a well-formed sequence
 */
function firstChar( $s ) {
	$matches = array();
	preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
		'[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches );

	return isset( $matches[1] ) ? $matches[1] : "";
}
138 # Crop a string from the beginning or end to a certain number of bytes.
139 # (Bytes are used because our storage has limited byte lengths for some
140 # columns in the database.) Multibyte charsets will need to make sure that
141 # only whole characters are included!
143 # $length does not include the optional ellipsis.
144 # If $length is negative, snip from the beginning
/**
 * Crop a string to at most abs($length) bytes without leaving a
 * partial multibyte character at the cut.
 *
 * @param string $string   UTF-8 text
 * @param int    $length   Byte limit, not counting the ellipsis; a
 *                         positive value keeps the start of the string,
 *                         a negative value keeps the end
 * @param string $ellipsis Appended (or prepended, for negative $length)
 *                         only when the string was actually cropped
 * @return string
 */
function truncate( $string, $length, $ellipsis = "" ) {
	if( $length == 0 ) {
		# Nothing of the text may be kept; substr() with a zero offset
		# would otherwise return the whole string in the branch below.
		return $ellipsis;
	}
	if ( strlen( $string ) <= abs( $length ) ) {
		return $string;
	}

	if( $length > 0 ) {
		$string = substr( $string, 0, $length );
		$char = ord( $string[strlen( $string ) - 1] );
		$m = array();
		if( $char >= 0xc0 ) {
			# We got the first byte only of a multibyte char; remove it.
			$string = substr( $string, 0, -1 );
		} elseif( $char >= 0x80 &&
			preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' .
				'[\xf0-\xf7][\x80-\xbf]{1,2})$/s', $string, $m ) ) {
			# We chopped in the middle of a character; remove it.
			# The /s modifier lets (.*) span newlines, so multi-line
			# text is repaired as well.
			$string = $m[1];
		}
		return $string . $ellipsis;
	}

	# Negative length: keep the tail of the string.
	$string = substr( $string, $length );
	$char = ord( $string[0] );
	if( $char >= 0x80 && $char < 0xc0 ) {
		# We chopped in the middle of a character; remove the whole thing
		$string = preg_replace( '/^[\x80-\xbf]+/', '', $string );
	}
	return $ellipsis . $string;
}