languages/LanguageUtf8.php

   1 <?php
   2 #$Id$
   3 if( defined( "MEDIAWIKI" ) ) {
   4
   5 $wgInputEncoding    = "UTF-8";
   6 $wgOutputEncoding       = "UTF-8";
   7
   8 if (function_exists('mb_internal_encoding')) {
   9         mb_internal_encoding('UTF-8');
  10 }
  11
  12 $wikiUpperChars = $wgMemc->get( $key1 = "$wgDBname:utf8:upper" );
  13 $wikiLowerChars = $wgMemc->get( $key2 = "$wgDBname:utf8:lower" );
  14
  15 if(empty( $wikiUpperChars) || empty($wikiLowerChars )) {
  16         require_once( "includes/Utf8Case.php" );
  17         $wgMemc->set( $key1, $wikiUpperChars );
  18         $wgMemc->set( $key2, $wikiLowerChars );
  19 }
  20
  21 # Base stuff useful to all UTF-8 based language files
  22 class LanguageUtf8 extends Language {
  23
  24         # These two functions use mbstring library, if it is loaded
  25         # or compiled and character mapping arrays otherwise.
  26         # In case of language-specific character mismatch
  27         # it should be dealt with in Language classes.
  28
  29         function ucfirst( $string ) {
  30                 if (function_exists('mb_strtoupper')) {
  31                         return mb_strtoupper(mb_substr($string,0,1)).mb_substr($string,1);
  32                 } else {
  33                     global $wikiUpperChars;
  34                     return preg_replace (
  35                     "/^([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e",
  36                     "strtr ( \"\$1\" , \$wikiUpperChars )",
  37                     $string );
  38                 }
  39         }
  40
  41         function lcfirst( $string ) {
  42                 if (function_exists('mb_strtolower')) {
  43                         return mb_strtolower(mb_substr($string,0,1)).mb_substr($string,1);
  44                 } else {
  45                     global $wikiLowerChars;
  46                     return preg_replace (
  47                     "/^([A-Z]|[\\xc0-\\xff][\\x80-\\xbf]*)/e",
  48                     "strtr ( \"\$1\" , \$wikiLowerChars )",
  49                     $string );
  50                 }
  51         }
  52
  53         function stripForSearch( $string ) {
  54                 # MySQL fulltext index doesn't grok utf-8, so we
  55                 # need to fold cases and convert to hex
  56
  57                 # In Language:: it just returns lowercase, maybe
  58                 # all strtolower on stripped output or argument
  59                 # should be removed and all stripForSearch
  60                 # methods adjusted to that.
  61                 if (function_exists('mb_strtolower')) {
  62                         return preg_replace(
  63                             "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
  64                             "'U8' . bin2hex( $1 )",
  65                             mb_strtolower($string) );
  66                 } else {
  67                   global $wikiLowerChars;
  68                   return preg_replace(
  69                       "/([\\xc0-\\xff][\\x80-\\xbf]*)/e",
  70                       "'U8' . bin2hex( strtr( \"\$1\", \$wikiLowerChars ) )",
  71                       $string );
  72                 }
  73         }
  74
  75         function fallback8bitEncoding() {
  76                 # Windows codepage 1252 is a superset of iso 8859-1
  77                 # override this to use difference source encoding to
  78                 # translate incoming 8-bit URLs.
  79                 return "windows-1252";
  80         }
  81
  82         function checkTitleEncoding( $s ) {
  83                 global $wgInputEncoding;
  84
  85                 # Check for non-UTF-8 URLs
  86                 $ishigh = preg_match( '/[\x80-\xff]/', $s);
  87                 if(!$ishigh) return $s;
  88
  89                 $isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
  90                 '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
  91                 if( $isutf8 ) return $s;
  92
  93                 return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s );
  94         }
  95
  96         function firstChar( $s ) {
  97                 preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
  98                 '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches);
  99
 100                 return isset( $matches[1] ) ? $matches[1] : "";
 101         }
 102
 103         # Crop a string from the beginning or end to a certain number of bytes.
 104         # (Bytes are used because our storage has limited byte lengths for some
 105         # columns in the database.) Multibyte charsets will need to make sure that
 106         # only whole characters are included!
 107         #
 108         # $length does not include the optional ellipsis.
 109         # If $length is negative, snip from the beginning
 110         function truncate( $string, $length, $ellipsis = "" ) {
 111                 if( $length == 0 ) {
 112                         return $ellipsis;
 113                 }
 114                 if ( strlen( $string ) <= abs( $length ) ) {
 115                         return $string;
 116                 }
 117                 if( $length > 0 ) {
 118                         $string = substr( $string, 0, $length );
 119                         $char = ord( $string[strlen( $string ) - 1] );
 120                         if ($char >= 0xc0) {
 121                                 # We got the first byte only of a multibyte char; remove it.
 122                                 $string = substr( $string, 0, -1 );
 123                         } elseif( $char >= 0x80 &&
 124                                   preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' .
 125                                               '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) ) {
 126                             # We chopped in the middle of a character; remove it
 127                                 $string = $m[1];
 128                         }
 129                         return $string . $ellipsis;
 130                 } else {
 131                         $string = substr( $string, $length );
 132                         $char = ord( $string[0] );
 133                         if( $char >= 0x80 && $char < 0xc0 ) {
 134                                 # We chopped in the middle of a character; remove the whole thing
 135                                 $string = preg_replace( '/^[\x80-\xbf]+/', '', $string );
 136                         }
 137                         return $ellipsis . $string;
 138                 }
 139         }
 140 }
 141
 142 } # ifdef MEDIAWIKI
 143
 144 ?>