www/ecrire/inc/charsets.php

   1 <?php
   2
   3 /***************************************************************************\
   4  *  SPIP, Systeme de publication pour l'internet                           *
   5  *                                                                         *
   6  *  Copyright (c) 2001-2011                                                *
   7  *  Arnaud Martin, Antoine Pitrou, Philippe Riviere, Emmanuel Saint-James  *
   8  *                                                                         *
   9  *  Ce programme est un logiciel libre distribue sous licence GNU/GPL.     *
  10  *  Pour plus de details voir le fichier COPYING.txt ou l'aide en ligne.   *
  11 \***************************************************************************/
  12
  13
  14 //
  15 if (!defined('_ECRIRE_INC_VERSION')) return;
  16
  17
/*
 * Natively supported charsets: see the tables in ecrire/charsets/.
 * Other charsets are handled through the mbstring extension.
 */
  22
  23 // http://doc.spip.org/@load_charset
  24 function load_charset ($charset = 'AUTO', $langue_site = 'AUTO') {
  25         if ($charset == 'AUTO')
  26                 $charset = $GLOBALS['meta']['charset'];
  27         $charset = trim(strtolower($charset));
  28         if (isset($GLOBALS['CHARSET'][$charset]))
  29                 return $charset;
  30
  31         if ($langue_site == 'AUTO')
  32                 $langue_site = $GLOBALS['meta']['langue_site'];
  33
  34         if ($charset == 'utf-8') {
  35                 $GLOBALS['CHARSET'][$charset] = array();
  36                 return $charset;
  37         }
  38
  39         // Quelques synonymes
  40         if ($charset == '') $charset = 'iso-8859-1';
  41         else if ($charset == 'windows-1250') $charset = 'cp1250';
  42         else if ($charset == 'windows-1251') $charset = 'cp1251';
  43         else if ($charset == 'windows-1256') $charset = 'cp1256';
  44
  45         if (find_in_path($charset . '.php', 'charsets/', true)) {
  46                 return $charset;
  47         } else {
  48                 spip_log("Erreur: pas de fichier de conversion 'charsets/$charset'");
  49                 $GLOBALS['CHARSET'][$charset] = array();
  50                 return false;
  51         }
  52 }
  53
  54 //
  55 // Verifier qu'on peut utiliser mb_string
  56 //
  57 // http://doc.spip.org/@init_mb_string
  58 function init_mb_string() {
  59         static $mb;
  60
  61         // verifier que tout est present (fonctions mb_string pour php >= 4.0.6)
  62         // et que le charset interne est connu de mb_string
  63         if (!$mb) {
  64                 if (function_exists('mb_internal_encoding')
  65                 AND function_exists('mb_detect_order')
  66                 AND function_exists('mb_substr')
  67                 AND function_exists('mb_strlen')
  68                 AND function_exists('mb_encode_mimeheader')
  69                 AND function_exists('mb_encode_numericentity')
  70                 AND function_exists('mb_decode_numericentity')
  71                 AND mb_detect_order($GLOBALS['meta']['charset'])
  72                 ) {
  73                         mb_internal_encoding('utf-8');
  74                         $mb = 1;
  75                 } else
  76                         $mb = -1;
  77         }
  78
  79         return ($mb == 1);
  80 }
  81
  82 // Detecter les versions buggees d'iconv
  83 // http://doc.spip.org/@test_iconv
  84 function test_iconv() {
  85         static $iconv_ok;
  86
  87         if (!$iconv_ok) {
  88                 if (!function_exists('iconv'))
  89                         $iconv_ok = -1;
  90                 else {
  91                         if (utf_32_to_unicode(@iconv('utf-8', 'utf-32', 'chaine de test')) == 'chaine de test')
  92                                 $iconv_ok = 1;
  93                         else
  94                                 $iconv_ok = -1;
  95                 }
  96         }
  97         return ($iconv_ok == 1);
  98 }
  99
 100 // Test de fonctionnement du support UTF-8 dans PCRE
 101 // (contournement bug Debian Woody)
 102 // http://doc.spip.org/@test_pcre_unicode
 103 function test_pcre_unicode() {
 104         static $pcre_ok = 0;
 105
 106         if (!$pcre_ok) {
 107                 $s = " ".chr(195).chr(169)."t".chr(195).chr(169)." ";
 108                 if (preg_match(',\W...\W,u', $s)) $pcre_ok = 1;
 109                 else $pcre_ok = -1;
 110         }
 111         return $pcre_ok == 1;
 112 }
 113
 114 // Plages alphanumeriques (incomplet...)
 115 // http://doc.spip.org/@pcre_lettres_unicode
 116 function pcre_lettres_unicode() {
 117         static $plage_unicode;
 118
 119         if (!$plage_unicode) {
 120                 if (test_pcre_unicode()) {
 121                         // cf. http://www.unicode.org/charts/
 122                         $plage_unicode = '\w' // iso-latin
 123                                 . '\x{100}-\x{24f}' // europeen etendu
 124                                 . '\x{300}-\x{1cff}' // des tas de trucs
 125                         ;
 126                 }
 127                 else {
 128                         // fallback a trois sous
 129                         $plage_unicode = '\w';
 130                 }
 131         }
 132         return $plage_unicode;
 133 }
 134
 135 // Plage ponctuation de 0x2000 a 0x206F
 136 // (i.e. de 226-128-128 a 226-129-176)
 137 // http://doc.spip.org/@plage_punct_unicode
 138 function plage_punct_unicode() {
 139         return '\xE2(\x80[\x80-\xBF]|\x81[\x80-\xAF])';
 140 }
 141
 142 // corriger caracteres non-conformes : 128-159
 143 // cf. charsets/iso-8859-1.php (qu'on recopie ici pour aller plus vite)
 144 // on peut passer un charset cible en parametre pour accelerer le passage iso-8859-1 -> autre charset
 145 // http://doc.spip.org/@corriger_caracteres_windows
 146 function corriger_caracteres_windows($texte, $charset='AUTO', $charset_cible='unicode') {
 147         static $trans;
 148
 149         if (is_array($texte)) {
 150                 return array_map('corriger_caracteres_windows', $texte);
 151         }
 152
 153         if ($charset=='AUTO') $charset = $GLOBALS['meta']['charset'];
 154         if ($charset == 'utf-8') {
 155                 $p = chr(194);
 156                 if (strpos($texte,$p)===false) return $texte;
 157         } else if ($charset == 'iso-8859-1') {
 158                 $p = '';
 159         } else
 160                 return $texte;
 161
 162         if (!isset($trans[$charset][$charset_cible])) {
 163                 $trans[$charset][$charset_cible] = array(
 164                         $p.chr(128) => "&#8364;",
 165                         $p.chr(129) => ' ', # pas affecte
 166                         $p.chr(130) => "&#8218;",
 167                         $p.chr(131) => "&#402;",
 168                         $p.chr(132) => "&#8222;",
 169                         $p.chr(133) => "&#8230;",
 170                         $p.chr(134) => "&#8224;",
 171                         $p.chr(135) => "&#8225;",
 172                         $p.chr(136) => "&#710;",
 173                         $p.chr(137) => "&#8240;",
 174                         $p.chr(138) => "&#352;",
 175                         $p.chr(139) => "&#8249;",
 176                         $p.chr(140) => "&#338;",
 177                         $p.chr(141) => ' ', # pas affecte
 178                         $p.chr(142) => "&#381;",
 179                         $p.chr(143) => ' ', # pas affecte
 180                         $p.chr(144) => ' ', # pas affecte
 181                         $p.chr(145) => "&#8216;",
 182                         $p.chr(146) => "&#8217;",
 183                         $p.chr(147) => "&#8220;",
 184                         $p.chr(148) => "&#8221;",
 185                         $p.chr(149) => "&#8226;",
 186                         $p.chr(150) => "&#8211;",
 187                         $p.chr(151) => "&#8212;",
 188                         $p.chr(152) => "&#732;",
 189                         $p.chr(153) => "&#8482;",
 190                         $p.chr(154) => "&#353;",
 191                         $p.chr(155) => "&#8250;",
 192                         $p.chr(156) => "&#339;",
 193                         $p.chr(157) => ' ', # pas affecte
 194                         $p.chr(158) => "&#382;",
 195                         $p.chr(159) => "&#376;",
 196                 );
 197                 if ($charset_cible!='unicode'){
 198                         foreach($trans[$charset][$charset_cible] as $k=>$c)
 199                                 $trans[$charset][$charset_cible][$k] = unicode2charset($c, $charset_cible);
 200                 }
 201         }
 202
 203         return @str_replace(array_keys($trans[$charset][$charset_cible]),
 204                            array_values($trans[$charset][$charset_cible]),$texte);
 205 }
 206
 207
 208 //
 209 // Transformer les &eacute; en &#123;
 210 // $secure = true pour *ne pas convertir* les caracteres malins &lt; &amp; etc.
 211 //
 212 // http://doc.spip.org/@html2unicode
 213 function html2unicode($texte, $secure=false) {
 214         if (strpos($texte,'&') === false) return $texte;
 215         static $trans = array();
 216         if (!$trans) {
 217                 global $CHARSET;
 218                 load_charset('html');
 219                 foreach ($CHARSET['html'] as $key => $val) {
 220                         $trans["&$key;"] = $val;
 221                 }
 222         }
 223
 224         if ($secure)
 225                 return str_replace(array_keys($trans),array_values($trans),$texte);
 226         else
 227                 return str_replace(array('&amp;', '&quot;', '&lt;', '&gt;'),array('&', '"', '<', '>'),
 228                   str_replace(array_keys($trans),array_values($trans),$texte)
 229                 );
 230 }
 231
 232 //
 233 // Transformer les &eacute; en &#123;
 234 //
 235 // http://doc.spip.org/@mathml2unicode
 236 function mathml2unicode($texte) {
 237         static $trans;
 238         if (!$trans) {
 239                 global $CHARSET;
 240                 load_charset('mathml');
 241
 242                 foreach ($CHARSET['mathml'] as $key => $val)
 243                         $trans["&$key;"] = $val;
 244         }
 245
 246         return str_replace(array_keys($trans),array_values($trans),$texte);
 247 }
 248
 249
 250 //
 251 // Transforme une chaine en entites unicode &#129;
 252 //
 253 // Note: l'argument $forcer est obsolete : il visait a ne pas
 254 // convertir les accents iso-8859-1
 255 // http://doc.spip.org/@charset2unicode
 256 function charset2unicode($texte, $charset='AUTO' /* $forcer: obsolete*/) {
 257         static $trans;
 258
 259         if ($charset == 'AUTO')
 260                 $charset = $GLOBALS['meta']['charset'];
 261
 262         if ($charset == '') $charset = 'iso-8859-1';
 263         $charset = strtolower($charset);
 264
 265         switch ($charset) {
 266         case 'utf-8':
 267         case 'utf8':
 268                 return utf_8_to_unicode($texte);
 269
 270         case 'iso-8859-1':
 271                 $texte = corriger_caracteres_windows($texte, 'iso-8859-1');
 272                 // pas de break; ici, on suit sur default:
 273
 274         default:
 275                 // mbstring presente ?
 276                 if (init_mb_string()) {
 277                         if ($order = mb_detect_order() # mb_string connait-il $charset?
 278                         AND mb_detect_order($charset)) {
 279                                 $s = mb_convert_encoding($texte, 'utf-8', $charset);
 280                                 if ($s && $s != $texte) return utf_8_to_unicode($s);
 281                         }
 282                         mb_detect_order($order); # remettre comme precedemment
 283                 }
 284
 285                 // Sinon, peut-etre connaissons-nous ce charset ?
 286                 if (!isset($trans[$charset])) {
 287                         global $CHARSET;
 288                         if ($cset = load_charset($charset)
 289                         AND is_array($CHARSET[$cset]))
 290                                 foreach ($CHARSET[$cset] as $key => $val) {
 291                                         $trans[$charset][chr($key)] = '&#'.$val.';';
 292                         }
 293                 }
 294                 if (count($trans[$charset]))
 295                         return str_replace(array_keys($trans[$charset]),array_values($trans[$charset]),$texte);
 296
 297                 // Sinon demander a iconv (malgre le fait qu'il coupe quand un
 298                 // caractere n'appartient pas au charset, mais c'est un probleme
 299                 // surtout en utf-8, gere ci-dessus)
 300                 if (test_iconv()) {
 301                         $s = iconv($charset, 'utf-32le', $texte);
 302                         if ($s) return utf_32_to_unicode($s);
 303                 }
 304
 305                 // Au pire ne rien faire
 306                 spip_log("erreur charset '$charset' non supporte");
 307                 return $texte;
 308         }
 309 }
 310
 311 //
 312 // Transforme les entites unicode &#129; dans le charset specifie
 313 // Attention on ne transforme pas les entites < &#128; car si elles
 314 // ont ete encodees ainsi c'est a dessein
 315 // http://doc.spip.org/@unicode2charset
 316 function unicode2charset($texte, $charset='AUTO') {
 317         static $CHARSET_REVERSE;
 318         static $trans = array();
 319
 320         if ($charset == 'AUTO')
 321                 $charset = $GLOBALS['meta']['charset'];
 322
 323         switch($charset) {
 324         case 'utf-8':
 325                 return unicode_to_utf_8($texte);
 326                 break;
 327
 328         default:
 329                 $charset = load_charset($charset);
 330
 331                 if (!is_array($CHARSET_REVERSE[$charset])) {
 332                         $CHARSET_REVERSE[$charset] = array_flip($GLOBALS['CHARSET'][$charset]);
 333                 }
 334
 335                 if (!isset($trans[$charset])){
 336                         $trans[$charset]=array();
 337                         $t = &$trans[$charset];
 338                         for($e=128;$e<255;$e++){
 339                                 $h = dechex($e);
 340                                 if ($s = isset($CHARSET_REVERSE[$charset][$e])){
 341                                         $s = $CHARSET_REVERSE[$charset][$e];
 342                                         $t['&#'.$e.';'] = $t['&#0'.$e.';'] = $t['&#00'.$e.';'] = chr($s);
 343                                         $t['&#x'.$h.';'] = $t['&#x0'.$h.';'] = $t['&#x00'.$h.';'] = chr($s);
 344                                 }
 345                                 else{
 346                                         $t['&#'.$e.';'] = $t['&#0'.$e.';'] = $t['&#00'.$e.';'] = chr($e);
 347                                         $t['&#x'.$h.';'] = $t['&#x0'.$h.';'] = $t['&#x00'.$h.';'] = chr($e);
 348                                 }
 349                         }
 350                 }
 351                 $texte = str_replace(array_keys($trans[$charset]),array_values($trans[$charset]),$texte);
 352                 return $texte;
 353         }
 354 }
 355
 356
 357 // Importer un texte depuis un charset externe vers le charset du site
 358 // (les caracteres non resolus sont transformes en &#123;)
 359 // http://doc.spip.org/@importer_charset
 360 function importer_charset($texte, $charset = 'AUTO') {
 361         // on traite le cas le plus frequent iso-8859-1 vers utf directement pour aller plus vite !
 362         if (($charset == 'iso-8859-1') && ($GLOBALS['meta']['charset']=='utf-8') && function_exists('utf8_encode')){
 363                 $texte = corriger_caracteres_windows($texte, 'iso-8859-1','unicode');
 364                 $texte = utf8_encode($texte);
 365                 return $texte;
 366         }
 367         return unicode2charset(charset2unicode($texte, $charset));
 368 }
 369
 370 // UTF-8
 371 // http://doc.spip.org/@utf_8_to_unicode
 372 function utf_8_to_unicode($source) {
 373
 374         // mb_string : methode rapide
 375         if (init_mb_string()) {
 376                 $convmap = array(0x7F, 0xFFFFFF, 0x0, 0xFFFFFF);
 377                 return mb_encode_numericentity($source, $convmap, 'UTF-8');
 378         }
 379
 380         // Sinon methode pas a pas
 381         static $decrement;
 382         static $shift;
 383
 384         // Cf. php.net, par Ronen. Adapte pour compatibilite < php4
 385         if (!is_array($decrement)) {
 386                 // array used to figure what number to decrement from character order value
 387                 // according to number of characters used to map unicode to ascii by utf-8
 388                 $decrement[4] = 240;
 389                 $decrement[3] = 224;
 390                 $decrement[2] = 192;
 391                 $decrement[1] = 0;
 392                 // the number of bits to shift each charNum by
 393                 $shift[1][0] = 0;
 394                 $shift[2][0] = 6;
 395                 $shift[2][1] = 0;
 396                 $shift[3][0] = 12;
 397                 $shift[3][1] = 6;
 398                 $shift[3][2] = 0;
 399                 $shift[4][0] = 18;
 400                 $shift[4][1] = 12;
 401                 $shift[4][2] = 6;
 402                 $shift[4][3] = 0;
 403         }
 404
 405         $pos = 0;
 406         $len = strlen ($source);
 407         $encodedString = '';
 408         while ($pos < $len) {
 409                 $char = '';
 410                 $ischar = false;
 411                 $asciiPos = ord (substr ($source, $pos, 1));
 412                 if (($asciiPos >= 240) && ($asciiPos <= 255)) {
 413                         // 4 chars representing one unicode character
 414                         $thisLetter = substr ($source, $pos, 4);
 415                         $pos += 4;
 416                 }
 417                 else if (($asciiPos >= 224) && ($asciiPos <= 239)) {
 418                         // 3 chars representing one unicode character
 419                         $thisLetter = substr ($source, $pos, 3);
 420                         $pos += 3;
 421                 }
 422                 else if (($asciiPos >= 192) && ($asciiPos <= 223)) {
 423                         // 2 chars representing one unicode character
 424                         $thisLetter = substr ($source, $pos, 2);
 425                         $pos += 2;
 426                 }
 427                 else {
 428                         // 1 char (lower ascii)
 429                         $thisLetter = substr ($source, $pos, 1);
 430                         $pos += 1;
 431                         $char = $thisLetter;
 432                         $ischar = true;
 433                 }
 434
 435                 if ($ischar)
 436                         $encodedString .= $char;
 437                 else {  // process the string representing the letter to a unicode entity
 438                         $thisLen = strlen ($thisLetter);
 439                         $thisPos = 0;
 440                         $decimalCode = 0;
 441                         while ($thisPos < $thisLen) {
 442                                 $thisCharOrd = ord (substr ($thisLetter, $thisPos, 1));
 443                                 if ($thisPos == 0) {
 444                                         $charNum = intval ($thisCharOrd - $decrement[$thisLen]);
 445                                         $decimalCode += ($charNum << $shift[$thisLen][$thisPos]);
 446                                 } else {
 447                                         $charNum = intval ($thisCharOrd - 128);
 448                                         $decimalCode += ($charNum << $shift[$thisLen][$thisPos]);
 449                                 }
 450                                 $thisPos++;
 451                         }
 452                         $encodedLetter = "&#". preg_replace('/^0+/', '', $decimalCode) . ';';
 453                         $encodedString .= $encodedLetter;
 454                 }
 455         }
 456         return $encodedString;
 457 }
 458
 459 // UTF-32 ne sert plus que si on passe par iconv, c'est-a-dire quand
 460 // mb_string est absente ou ne connait pas notre charset
 461 // mais on l'optimise quand meme par mb_string
 462 // => tout ca sera osolete quand on sera surs d'avoir mb_string
 463 // http://doc.spip.org/@utf_32_to_unicode
 464 function utf_32_to_unicode($source) {
 465
 466         // mb_string : methode rapide
 467         if (init_mb_string()) {
 468                 $convmap = array(0x7F, 0xFFFFFF, 0x0, 0xFFFFFF);
 469                 $source = mb_encode_numericentity($source, $convmap, 'UTF-32LE');
 470                 return str_replace(chr(0), '', $source);
 471         }
 472
 473         // Sinon methode lente
 474         $texte = '';
 475         while ($source) {
 476                 $words = unpack("V*", substr($source, 0, 1024));
 477                 $source = substr($source, 1024);
 478                 foreach ($words as $word) {
 479                         if ($word < 128)
 480                                 $texte .= chr($word);
 481                         // ignorer le BOM - http://www.unicode.org/faq/utf_bom.html
 482                         else if ($word != 65279)
 483                                 $texte .= '&#'.$word.';';
 484                 }
 485         }
 486         return $texte;
 487
 488 }
 489
 490 // Ce bloc provient de php.net, auteur Ronen
 491 // http://doc.spip.org/@caractere_utf_8
 492 function caractere_utf_8($num) {
 493         if($num<128)
 494                 return chr($num);
 495         if($num<2048)
 496                 return chr(($num>>6)+192).chr(($num&63)+128);
 497         if($num<65536)
 498                 return chr(($num>>12)+224).chr((($num>>6)&63)+128).chr(($num&63)+128);
 499         if($num<1114112)
 500                 return chr($num>>18+240).chr((($num>>12)&63)+128).chr(($num>>6)&63+128). chr($num&63+128);
 501         return '';
 502 }
 503
 504 // http://doc.spip.org/@unicode_to_utf_8
 505 function unicode_to_utf_8($texte) {
 506
 507         // 1. Entites &#128; et suivantes
 508         $vu = array();
 509         if (preg_match_all(',&#0*([1-9][0-9][0-9]+);,S',
 510         $texte, $regs, PREG_SET_ORDER))
 511         foreach ($regs as $reg) {
 512                 if ($reg[1]>127 AND !isset($vu[$reg[0]]))
 513                         $vu[$reg[0]] = caractere_utf_8($reg[1]);
 514         }
 515         //$texte = str_replace(array_keys($vu), array_values($vu), $texte);
 516
 517         // 2. Entites > &#xFF;
 518         //$vu = array();
 519         if (preg_match_all(',&#x0*([1-9a-f][0-9a-f][0-9a-f]+);,iS',
 520         $texte, $regs, PREG_SET_ORDER))
 521         foreach ($regs as $reg) {
 522                 if (!isset($vu[$reg[0]]))
 523                         $vu[$reg[0]] = caractere_utf_8(hexdec($reg[1]));
 524         }
 525         return str_replace(array_keys($vu), array_values($vu), $texte);
 526
 527 }
 528
 529 // convertit les &#264; en \u0108
 530 // http://doc.spip.org/@unicode_to_javascript
 531 function unicode_to_javascript($texte) {
 532         $vu = array();
 533         while (preg_match(',&#0*([0-9]+);,S', $texte, $regs) AND !isset($vu[$regs[1]])) {
 534                 $num = $regs[1];
 535                 $vu[$num] = true;
 536                 $s = '\u'.sprintf("%04x", $num);
 537                 $texte = str_replace($regs[0], $s, $texte);
 538         }
 539         return $texte;
 540 }
 541
 542 // convertit les %uxxxx (envoyes par javascript)
 543 // http://doc.spip.org/@javascript_to_unicode
 544 function javascript_to_unicode ($texte) {
 545         while (preg_match(",%u([0-9A-F][0-9A-F][0-9A-F][0-9A-F]),", $texte, $regs))
 546                 $texte = str_replace($regs[0],"&#".hexdec($regs[1]).";", $texte);
 547         return $texte;
 548 }
 549 // convertit les %E9 (envoyes par le browser) en chaine du charset du site (binaire)
 550 // http://doc.spip.org/@javascript_to_binary
 551 function javascript_to_binary ($texte) {
 552         while (preg_match(",%([0-9A-F][0-9A-F]),", $texte, $regs))
 553                 $texte = str_replace($regs[0],chr(hexdec($regs[1])), $texte);
 554         return $texte;
 555 }
 556
 557
 558 // http://doc.spip.org/@translitteration_rapide
 559 function translitteration_rapide($texte, $charset='AUTO', $complexe='') {
 560         static $trans;
 561         if ($charset == 'AUTO')
 562                 $charset = $GLOBALS['meta']['charset'];
 563         if (!strlen($texte))
 564                 return $texte;
 565
 566         $table_translit ='translit'.$complexe;
 567
 568         // 2. Translitterer grace a la table predefinie
 569         if (!$trans[$complexe]) {
 570                 global $CHARSET;
 571                 load_charset($table_translit);
 572                 foreach ($CHARSET[$table_translit] as $key => $val)
 573                         $trans[$complexe][caractere_utf_8($key)] = $val;
 574         }
 575
 576         return str_replace(array_keys($trans[$complexe]),array_values($trans[$complexe]),$texte);
 577 }
 578
 579 //
 580 // Translitteration charset => ascii (pour l'indexation)
 581 // Attention les caracteres non reconnus sont renvoyes en utf-8
 582 //
 583 // http://doc.spip.org/@translitteration
 584 function translitteration($texte, $charset='AUTO', $complexe='') {
 585         // 0. Supprimer les caracteres illegaux
 586         include_spip('inc/filtres');
 587         $texte = corriger_caracteres($texte);
 588
 589         // 1. Passer le charset et les &eacute en utf-8
 590         $texte = unicode_to_utf_8(html2unicode(charset2unicode($texte, $charset, true)));
 591
 592         return translitteration_rapide($texte,$charset,$complexe);
 593 }
 594
 595 // &agrave; est retourne sous la forme "a`" et pas "a"
 596 // mais si $chiffre=true, on retourne "a8" (vietnamien)
 597 // http://doc.spip.org/@translitteration_complexe
 598 function translitteration_complexe($texte, $chiffres=false) {
 599         $texte = translitteration($texte,'AUTO','complexe');
 600
 601         if ($chiffres) {
 602                 $texte = preg_replace("/[aeiuoyd]['`?~.^+(-]{1,2}/eS",
 603                         "translitteration_chiffree('\\0')", $texte);
 604         }
 605
 606         return $texte;
 607 }
 608 // http://doc.spip.org/@translitteration_chiffree
 609 function translitteration_chiffree($car) {
 610         return strtr($car, "'`?~.^+(-", "123456789");
 611 }
 612
 613
 614 // Reconnaitre le BOM utf-8 (0xEFBBBF)
 615 // http://doc.spip.org/@bom_utf8
 616 function bom_utf8($texte) {
 617         return (substr($texte, 0,3) == chr(0xEF).chr(0xBB).chr(0xBF));
 618 }
 619 // Verifie qu'un document est en utf-8 valide
 620 // http://us2.php.net/manual/fr/function.mb-detect-encoding.php#50087
 621 // http://w3.org/International/questions/qa-forms-utf-8.html
 622 // note: preg_replace permet de contourner un "stack overflow" sur PCRE
 623 // http://doc.spip.org/@is_utf8
 624 function is_utf8($string) {
 625         return !strlen(
 626         preg_replace(
 627           ',[\x09\x0A\x0D\x20-\x7E]'            # ASCII
 628         . '|[\xC2-\xDF][\x80-\xBF]'             # non-overlong 2-byte
 629         . '|\xE0[\xA0-\xBF][\x80-\xBF]'         # excluding overlongs
 630         . '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'  # straight 3-byte
 631         . '|\xED[\x80-\x9F][\x80-\xBF]'         # excluding surrogates
 632         . '|\xF0[\x90-\xBF][\x80-\xBF]{2}'      # planes 1-3
 633         . '|[\xF1-\xF3][\x80-\xBF]{3}'          # planes 4-15
 634         . '|\xF4[\x80-\x8F][\x80-\xBF]{2}'      # plane 16
 635         . ',sS',
 636         '', $string));
 637 }
 638 // http://doc.spip.org/@is_ascii
 639 function is_ascii($string) {
 640         return !strlen(
 641         preg_replace(
 642         ',[\x09\x0A\x0D\x20-\x7E],sS',
 643         '', $string));
 644 }
 645
 646 // Transcode une page (attrapee sur le web, ou un squelette) en essayant
 647 // par tous les moyens de deviner son charset (y compris headers HTTP)
 648 // http://doc.spip.org/@transcoder_page
 649 function transcoder_page($texte, $headers='') {
 650
 651         // Si tout est < 128 pas la peine d'aller plus loin
 652         if (is_ascii($texte)) {
 653                 #spip_log('charset: ascii');
 654                 return $texte;
 655         }
 656
 657         // Reconnaitre le BOM utf-8 (0xEFBBBF)
 658         if (bom_utf8($texte)) {
 659                 $charset = 'utf-8';
 660                 $texte = substr($texte,3);
 661         }
 662
 663         // charset precise par le contenu (xml)
 664         else if (preg_match(
 665         ',<[?]xml[^>]*encoding[^>]*=[^>]*([-_a-z0-9]+?),UimsS', $texte, $regs))
 666                 $charset = trim(strtolower($regs[1]));
 667         // charset precise par le contenu (html)
 668         else if (preg_match(
 669         ',<(meta|html|body)[^>]*charset[^>]*=[^>]*([-_a-z0-9]+?),UimsS',
 670         $texte, $regs)
 671         # eviter #CHARSET des squelettes
 672         AND (($tmp = trim(strtolower($regs[2]))) != 'charset'))
 673                 $charset = $tmp;
 674         // charset de la reponse http
 675         else if (preg_match(',charset=([-_a-z0-9]+),i', $headers, $regs))
 676                 $charset = trim(strtolower($regs[1]));
 677         else $charset = '';
 678         // normaliser les noms du shif-jis japonais
 679         if (preg_match(',^(x|shift)[_-]s?jis$,i', $charset))
 680                 $charset = 'shift-jis';
 681
 682         if ($charset) {
 683                 spip_log("charset: $charset");
 684         } else {
 685                 // valeur par defaut
 686                 if (is_utf8($texte))
 687                         $charset = 'utf-8';
 688                 else
 689                         $charset = 'iso-8859-1';
 690                 spip_log("charset probable: $charset");
 691         }
 692
 693         return importer_charset($texte, $charset);
 694 }
 695
 696
 697 //
 698 // Gerer les outils mb_string
 699 //
 700 // http://doc.spip.org/@spip_substr
 701 function spip_substr($c, $start=0, $length = NULL) {
 702         // Si ce n'est pas utf-8, utiliser substr
 703         if ($GLOBALS['meta']['charset'] != 'utf-8') {
 704                 if ($length)
 705                         return substr($c, $start, $length);
 706                 else
 707                         substr($c, $start);
 708         }
 709
 710         // Si utf-8, voir si on dispose de mb_string
 711         if (init_mb_string()) {
 712                 if ($length)
 713                         return mb_substr($c, $start, $length);
 714                 else
 715                         return mb_substr($c, $start);
 716         }
 717
 718         // Version manuelle (cf. ci-dessous)
 719         return spip_substr_manuelle($c, $start, $length);
 720 }
 721
 722 // version manuelle de substr utf8, pour php vieux et/ou mal installe
 723 // http://doc.spip.org/@spip_substr_manuelle
 724 function spip_substr_manuelle($c, $start, $length = NULL) {
 725
 726         // Cas pathologique
 727         if ($length === 0)
 728                 return '';
 729
 730         // S'il y a un demarrage, on se positionne
 731         if ($start > 0)
 732                 $c = substr($c, strlen(spip_substr_manuelle($c, 0, $start)));
 733         elseif ($start < 0)
 734                 return spip_substr_manuelle($c, spip_strlen($c)+$start, $length);
 735
 736         if (!$length)
 737                 return $c;
 738
 739         if ($length > 0) {
 740                 // on prend n fois la longueur desiree, pour etre surs d'avoir tout
 741                 // (un caractere utf-8 prenant au maximum n bytes)
 742                 $n = 0; while (preg_match(',[\x80-\xBF]{'.(++$n).'},', $c));
 743                 $c = substr($c, 0, $n*$length);
 744                 // puis, tant qu'on est trop long, on coupe...
 745                 while (($l = spip_strlen($c)) > $length)
 746                         $c = substr($c, 0, $length - $l);
 747                 return $c;
 748         }
 749
 750         // $length < 0
 751         return spip_substr_manuelle($c, 0, spip_strlen($c)+$length);
 752 }
 753
 754 // http://doc.spip.org/@spip_strlen
 755 function spip_strlen($c) {
 756         // Si ce n'est pas utf-8, utiliser strlen
 757         if ($GLOBALS['meta']['charset'] != 'utf-8')
 758                 return strlen($c);
 759
 760         // Sinon, utiliser mb_strlen() si disponible
 761         if (init_mb_string())
 762                 return mb_strlen($c);
 763
 764         // Methode manuelle : on supprime les bytes 10......,
 765         // on compte donc les ascii (0.......) et les demarrages
 766         // de caracteres utf-8 (11......)
 767         return strlen(preg_replace(',[\x80-\xBF],S', '', $c));
 768 }
 769
 770 // Initialisation
 771 $GLOBALS['CHARSET'] = Array();
 772
 773 // noter a l'occasion dans la meta pcre_u notre capacite a utiliser le flag /u
 774 // dans les preg_replace pour ne pas casser certaines lettres accentuees :
 775 // en utf-8 chr(195).chr(160) = a` alors qu'en iso-latin chr(160) = nbsp
 776 if (!isset($GLOBALS['meta']['pcre_u'])
 777   OR (isset($_GET['var_mode']) AND !isset($_GET['var_profile']))) {
 778         include_spip('inc/meta');
 779         ecrire_meta('pcre_u',
 780                 $u = ($GLOBALS['meta']['charset'] == 'utf-8'
 781                 AND test_pcre_unicode())
 782                         ? 'u' :''
 783         );
 784 }
 785
 786 ?>