Some minor edits, and a couple of added messages for Special:Makesysop
[lhc/web/wiklou.git] / languages / LanguageUtf8.php
1 <?php
2
3 if( defined( "MEDIAWIKI" ) ) {
4
# This file and LanguageLatin1.php may be included from within functions,
# so every global touched below has to be declared explicitly.

global $wgInputEncoding, $wgOutputEncoding, $wikiUpperChars, $wikiLowerChars;
global $wgDBname, $wgMemc;

# Both the input and the output side of the wiki are UTF-8.
$wgInputEncoding = "UTF-8";
$wgOutputEncoding = "UTF-8";

if( !function_exists( 'mb_internal_encoding' ) ) {
	# No mbstring available: hack our own case conversion routines
	# on top of two character mapping arrays.

	# Loading serialized arrays is faster than parsing code :P
	# so try the per-database cache entries first.
	$key1 = "$wgDBname:utf8:upper";
	$wikiUpperChars = $wgMemc->get( $key1 );
	$key2 = "$wgDBname:utf8:lower";
	$wikiLowerChars = $wgMemc->get( $key2 );

	if( empty( $wikiUpperChars ) || empty( $wikiLowerChars ) ) {
		# Cache miss on at least one table. Utf8Case.php presumably
		# fills in both globals (verify against that file); whatever
		# they hold afterwards is written back to the cache.
		require_once( "includes/Utf8Case.php" );
		$wgMemc->set( $key1, $wikiUpperChars );
		$wgMemc->set( $key2, $wikiLowerChars );
	}
} else {
	# mbstring does the case conversion for us; just make sure it
	# treats strings as UTF-8.
	mb_internal_encoding( 'UTF-8' );
}
29
# Base stuff useful to all UTF-8 based language files
class LanguageUtf8 extends Language {

	# ucfirst() and lcfirst() use the mbstring library, if it is loaded
	# or compiled, and the global character mapping arrays
	# ($wikiUpperChars / $wikiLowerChars) otherwise.
	# In case of language-specific character mismatch
	# it should be dealt with in Language classes.
	#
	# The regex replacements below go through preg_replace_callback()
	# with method callbacks instead of the PCRE /e (eval) modifier:
	# /e no longer exists in PHP 7+, where preg_replace() would just
	# emit a warning and return NULL.

	/**
	 * Uppercase the first character of a UTF-8 string.
	 *
	 * @param string $string UTF-8 encoded text
	 * @return string $string with its first character uppercased
	 */
	function ucfirst( $string ) {
		/**
		 * On pages with many links we can get called a lot.
		 * The multibyte uppercase functions are relatively
		 * slow, so check first if we can use a faster ASCII
		 * version instead; it saves a few milliseconds.
		 */
		if( !preg_match( '/^[\x80-\xff]/', $string ) ) {
			# First byte is plain ASCII (or the string is empty):
			# the builtin function is enough.
			return ucfirst( $string );
		}
		if( function_exists( 'mb_strtoupper' ) ) {
			return mb_strtoupper( mb_substr( $string, 0, 1 ) ) . mb_substr( $string, 1 );
		}
		# No mbstring: map the leading character through $wikiUpperChars.
		return preg_replace_callback(
			'/^([a-z]|[\xc0-\xff][\x80-\xbf]*)/',
			array( $this, 'utf8UpperCallback' ),
			$string );
	}

	/**
	 * Lowercase the first character of a UTF-8 string.
	 *
	 * @param string $string UTF-8 encoded text
	 * @return string $string with its first character lowercased
	 */
	function lcfirst( $string ) {
		if( function_exists( 'mb_strtolower' ) ) {
			return mb_strtolower( mb_substr( $string, 0, 1 ) ) . mb_substr( $string, 1 );
		}
		# No mbstring: map the leading character through $wikiLowerChars.
		return preg_replace_callback(
			'/^([A-Z]|[\xc0-\xff][\x80-\xbf]*)/',
			array( $this, 'utf8LowerCallback' ),
			$string );
	}

	/**
	 * preg_replace_callback() helper: uppercase one matched character
	 * using the global $wikiUpperChars map.
	 *
	 * @param array $matches regex matches; $matches[1] is the character
	 * @return string the mapped character, or the character unchanged
	 *   when no mapping table is loaded
	 */
	function utf8UpperCallback( $matches ) {
		global $wikiUpperChars;
		if( !is_array( $wikiUpperChars ) ) {
			return $matches[1];
		}
		return strtr( $matches[1], $wikiUpperChars );
	}

	/**
	 * preg_replace_callback() helper: lowercase one matched character
	 * using the global $wikiLowerChars map.
	 *
	 * @param array $matches regex matches; $matches[1] is the character
	 * @return string the mapped character, or the character unchanged
	 *   when no mapping table is loaded
	 */
	function utf8LowerCallback( $matches ) {
		global $wikiLowerChars;
		if( !is_array( $wikiLowerChars ) ) {
			return $matches[1];
		}
		return strtr( $matches[1], $wikiLowerChars );
	}

	/**
	 * preg_replace_callback() helper for stripForSearch(): encode one
	 * multibyte sequence as 'U8' followed by its bytes in hex.
	 *
	 * @param array $matches regex matches; $matches[1] is the sequence
	 * @return string
	 */
	function utf8SearchHexCallback( $matches ) {
		return 'U8' . bin2hex( $matches[1] );
	}

	/**
	 * Same as utf8SearchHexCallback(), but lowercases the sequence
	 * through $wikiLowerChars first (used when mbstring is missing).
	 *
	 * @param array $matches regex matches; $matches[1] is the sequence
	 * @return string
	 */
	function utf8SearchHexLowerCallback( $matches ) {
		return 'U8' . bin2hex( $this->utf8LowerCallback( $matches ) );
	}

	/**
	 * Convert a string into the form used for the search index.
	 *
	 * @param string $string UTF-8 encoded text
	 * @return string text with every multibyte sequence lowercased and
	 *   replaced by 'U8' plus its hex representation
	 */
	function stripForSearch( $string ) {
		# MySQL fulltext index doesn't grok utf-8, so we
		# need to fold cases and convert to hex

		# In Language:: it just returns lowercase, maybe
		# all strtolower on stripped output or argument
		# should be removed and all stripForSearch
		# methods adjusted to that.

		wfProfileIn( "LanguageUtf8::stripForSearch" );
		if( function_exists( 'mb_strtolower' ) ) {
			# The whole string (ASCII included) is lowercased here.
			$out = preg_replace_callback(
				'/([\xc0-\xff][\x80-\xbf]*)/',
				array( $this, 'utf8SearchHexCallback' ),
				mb_strtolower( $string ) );
		} else {
			# NOTE: only the multibyte sequences are case-folded in this
			# branch; ASCII letters are passed through as they are.
			$out = preg_replace_callback(
				'/([\xc0-\xff][\x80-\xbf]*)/',
				array( $this, 'utf8SearchHexLowerCallback' ),
				$string );
		}
		wfProfileOut( "LanguageUtf8::stripForSearch" );
		return $out;
	}

	/**
	 * Source encoding assumed for incoming 8-bit (non-UTF-8) URLs.
	 *
	 * @return string encoding name
	 */
	function fallback8bitEncoding() {
		# Windows codepage 1252 is a superset of iso 8859-1
		# override this to use a different source encoding to
		# translate incoming 8-bit URLs.
		return "windows-1252";
	}

	/**
	 * Make sure a title taken from a URL is UTF-8, converting it from
	 * the fallback 8-bit encoding when it is not.
	 *
	 * @param string $s title text (an array is a fatal error)
	 * @return string $s itself when it is pure ASCII or matches the
	 *   UTF-8 pattern, otherwise the result of $this->iconv()
	 */
	function checkTitleEncoding( $s ) {
		global $wgInputEncoding;

		if( is_array( $s ) ) {
			wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' );
		}
		# Check for non-UTF-8 URLs
		$ishigh = preg_match( '/[\x80-\xff]/', $s );
		if( !$ishigh ) return $s;

		# Structural check only: lead byte followed by the right number
		# of continuation bytes, for sequences of one to four bytes.
		$isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
			'[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
		if( $isutf8 ) return $s;

		return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s );
	}

	/**
	 * Get the first character of a UTF-8 string.
	 *
	 * @param string $s
	 * @return string the leading one- to four-byte sequence, or "" when
	 *   the string is empty or does not start with a well-formed one
	 */
	function firstChar( $s ) {
		$matches = array();
		preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
			'[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches );

		return isset( $matches[1] ) ? $matches[1] : "";
	}

	# Crop a string from the beginning or end to a certain number of bytes.
	# (Bytes are used because our storage has limited byte lengths for some
	# columns in the database.) Multibyte charsets will need to make sure that
	# only whole characters are included!
	#
	# $length does not include the optional ellipsis.
	# If $length is negative, snip from the beginning
	#
	# Returns the cropped string with $ellipsis attached on the side that
	# was cut; a string that already fits is returned untouched, and a
	# $length of 0 yields just $ellipsis.
	function truncate( $string, $length, $ellipsis = "" ) {
		if( $length == 0 ) {
			return $ellipsis;
		}
		if ( strlen( $string ) <= abs( $length ) ) {
			return $string;
		}
		if( $length > 0 ) {
			$string = substr( $string, 0, $length );
			$char = ord( $string[strlen( $string ) - 1] );
			$m = array();
			if ( $char >= 0xc0 ) {
				# We got the first byte only of a multibyte char; remove it.
				$string = substr( $string, 0, -1 );
			} elseif( $char >= 0x80 &&
				# The /s modifier lets "." cross newlines; without it a
				# newline anywhere before the cut made this match fail
				# and the broken character was left in place.
				preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' .
					'[\xf0-\xf7][\x80-\xbf]{1,2})$/s', $string, $m ) ) {
				# We chopped in the middle of a character; remove it
				$string = $m[1];
			}
			return $string . $ellipsis;
		} else {
			$string = substr( $string, $length );
			$char = ord( $string[0] );
			if( $char >= 0x80 && $char < 0xc0 ) {
				# We chopped in the middle of a character; remove the whole thing
				$string = preg_replace( '/^[\x80-\xbf]+/', '', $string );
			}
			return $ellipsis . $string;
		}
	}
}
166
167 } # ifdef MEDIAWIKI
168
169 ?>