languages/LanguageUtf8.php

   1 <?php
   2 #$Id$
   3 if( defined( "MEDIAWIKI" ) ) {
   4
   5 # This file and LanguageLatin1.php may be included from within functions, so
   6 # we need to have global statements
   7
   8 global $wgInputEncoding, $wgOutputEncoding, $wikiUpperChars, $wikiLowerChars;
   9 global $wgDBname, $wgMemc;
  10
  11 $wgInputEncoding    = "UTF-8";
  12 $wgOutputEncoding       = "UTF-8";
  13
  14 if (function_exists('mb_internal_encoding')) {
  15         mb_internal_encoding('UTF-8');
  16 } else {
  17         # Hack our own case conversion routines
  18
  19         # Loading serialized arrays is faster than parsing code :P
  20         $wikiUpperChars = $wgMemc->get( $key1 = "$wgDBname:utf8:upper" );
  21         $wikiLowerChars = $wgMemc->get( $key2 = "$wgDBname:utf8:lower" );
  22
  23         if(empty( $wikiUpperChars) || empty($wikiLowerChars )) {
  24                 require_once( "includes/Utf8Case.php" );
  25                 $wgMemc->set( $key1, $wikiUpperChars );
  26                 $wgMemc->set( $key2, $wikiLowerChars );
  27         }
  28 }
  29
  30 # Base stuff useful to all UTF-8 based language files
  31 class LanguageUtf8 extends Language {
  32
  33         # These two functions use mbstring library, if it is loaded
  34         # or compiled and character mapping arrays otherwise.
  35         # In case of language-specific character mismatch
  36         # it should be dealt with in Language classes.
  37
  38         function ucfirst( $string ) {
  39                 /**
  40                  * On pages with many links we can get called a lot.
  41                  * The multibyte uppercase functions are relatively
  42                  * slow, so check first if we can use a faster ASCII
  43                  * version instead; it saves a few milliseconds.
  44                  */
  45                 if( preg_match( '/^[\x80-\xff]/', $string ) ) {
  46                         if (function_exists('mb_strtoupper')) {
  47                                 return mb_strtoupper(mb_substr($string,0,1)).mb_substr($string,1);
  48                         } else {
  49                                 global $wikiUpperChars;
  50                                 return preg_replace (
  51                                         "/^([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e",
  52                                         "strtr ( \"\$1\" , \$wikiUpperChars )",
  53                                         $string );
  54                         }
  55                 }
  56                 return ucfirst( $string );
  57         }
  58
  59         function lcfirst( $string ) {
  60                 if (function_exists('mb_strtolower')) {
  61                         return mb_strtolower(mb_substr($string,0,1)).mb_substr($string,1);
  62                 } else {
  63                     global $wikiLowerChars;
  64                     return preg_replace (
  65                     "/^([A-Z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e",
  66                     "strtr ( \"\$1\" , \$wikiLowerChars )",
  67                     $string );
  68                 }
  69         }
  70
  71         function stripForSearch( $string ) {
  72                 # MySQL fulltext index doesn't grok utf-8, so we
  73                 # need to fold cases and convert to hex
  74
  75                 # In Language:: it just returns lowercase, maybe
  76                 # all strtolower on stripped output or argument
  77                 # should be removed and all stripForSearch
  78                 # methods adjusted to that.
  79
  80                 wfProfileIn( "LanguageUtf8::stripForSearch" );
  81                 if( function_exists( 'mb_strtolower' ) ) {
  82                         $out = preg_replace(
  83                                 "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
  84                                 "'U8' . bin2hex( \"$1\" )",
  85                                 mb_strtolower( $string ) );
  86                 } else {
  87                         global $wikiLowerChars;
  88                         $out = preg_replace(
  89                                 "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
  90                                 "'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
  91                                 $string );
  92                 }
  93                 wfProfileOut( "LanguageUtf8::stripForSearch" );
  94                 return $out;
  95         }
  96
  97         function fallback8bitEncoding() {
  98                 # Windows codepage 1252 is a superset of iso 8859-1
  99                 # override this to use difference source encoding to
 100                 # translate incoming 8-bit URLs.
 101                 return "windows-1252";
 102         }
 103
 104         function checkTitleEncoding( $s ) {
 105                 global $wgInputEncoding;
 106
 107                 # Check for non-UTF-8 URLs
 108                 $ishigh = preg_match( '/[\x80-\xff]/', $s);
 109                 if(!$ishigh) return $s;
 110
 111                 $isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
 112                 '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
 113                 if( $isutf8 ) return $s;
 114
 115                 return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s );
 116         }
 117
 118         function firstChar( $s ) {
 119                 preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
 120                 '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches);
 121
 122                 return isset( $matches[1] ) ? $matches[1] : "";
 123         }
 124
 125         # Crop a string from the beginning or end to a certain number of bytes.
 126         # (Bytes are used because our storage has limited byte lengths for some
 127         # columns in the database.) Multibyte charsets will need to make sure that
 128         # only whole characters are included!
 129         #
 130         # $length does not include the optional ellipsis.
 131         # If $length is negative, snip from the beginning
 132         function truncate( $string, $length, $ellipsis = "" ) {
 133                 if( $length == 0 ) {
 134                         return $ellipsis;
 135                 }
 136                 if ( strlen( $string ) <= abs( $length ) ) {
 137                         return $string;
 138                 }
 139                 if( $length > 0 ) {
 140                         $string = substr( $string, 0, $length );
 141                         $char = ord( $string[strlen( $string ) - 1] );
 142                         if ($char >= 0xc0) {
 143                                 # We got the first byte only of a multibyte char; remove it.
 144                                 $string = substr( $string, 0, -1 );
 145                         } elseif( $char >= 0x80 &&
 146                                   preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' .
 147                                               '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) ) {
 148                             # We chopped in the middle of a character; remove it
 149                                 $string = $m[1];
 150                         }
 151                         return $string . $ellipsis;
 152                 } else {
 153                         $string = substr( $string, $length );
 154                         $char = ord( $string[0] );
 155                         if( $char >= 0x80 && $char < 0xc0 ) {
 156                                 # We chopped in the middle of a character; remove the whole thing
 157                                 $string = preg_replace( '/^[\x80-\xbf]+/', '', $string );
 158                         }
 159                         return $ellipsis . $string;
 160                 }
 161         }
 162 }
 163
 164 } # ifdef MEDIAWIKI
 165
 166 ?>