www/ecrire/inc/charsets.php

   1 <?php
   2
   3 /***************************************************************************\
   4  *  SPIP, Systeme de publication pour l'internet                           *
   5  *                                                                         *
   6  *  Copyright (c) 2001-2012                                                *
   7  *  Arnaud Martin, Antoine Pitrou, Philippe Riviere, Emmanuel Saint-James  *
   8  *                                                                         *
   9  *  Ce programme est un logiciel libre distribue sous licence GNU/GPL.     *
  10  *  Pour plus de details voir le fichier COPYING.txt ou l'aide en ligne.   *
  11 \***************************************************************************/
  12
  13 /**
  14  * Gestion des charsets et des conversions
  15  *
  16  * Ce fichier contient les fonctions relatives à la gestion de charsets,
  17  * à la conversion de textes dans différents charsets et
  18  * propose des fonctions émulant la librairie mb si elle est absente
  19  *
  20  * @package SPIP\Texte\Charsets
  21 **/
  22
// Security guard: stop loading this file when the _ECRIRE_INC_VERSION
// constant is not defined (presumably a direct access outside SPIP).
if (!defined('_ECRIRE_INC_VERSION')) return;
  25
  26
  27 /**
  28  * Charge en mémoire la liste des caractères d'un charset
  29  *
  30  * Charsets supportes en natif : voir les tables dans ecrire/charsets/
  31  * Les autres charsets sont supportes via mbstring()
  32  *
  33  * @param string $charset
  34  *     Charset à charger
  35  *     Par défaut (AUTO), utilise le charset du site
  36  * @return string|bool
  37  *     Nom du charset
  38  *     false si le charset n'est pas décrit dans le répertoire charsets/
  39 **/
  40 function load_charset ($charset = 'AUTO') {
  41         if ($charset == 'AUTO')
  42                 $charset = $GLOBALS['meta']['charset'];
  43         $charset = trim(strtolower($charset));
  44         if (isset($GLOBALS['CHARSET'][$charset]))
  45                 return $charset;
  46
  47         if ($charset == 'utf-8') {
  48                 $GLOBALS['CHARSET'][$charset] = array();
  49                 return $charset;
  50         }
  51
  52         // Quelques synonymes
  53         if ($charset == '') $charset = 'iso-8859-1';
  54         else if ($charset == 'windows-1250') $charset = 'cp1250';
  55         else if ($charset == 'windows-1251') $charset = 'cp1251';
  56         else if ($charset == 'windows-1256') $charset = 'cp1256';
  57
  58         if (find_in_path($charset . '.php', 'charsets/', true)) {
  59                 return $charset;
  60         } else {
  61                 spip_log("Erreur: pas de fichier de conversion 'charsets/$charset'");
  62                 $GLOBALS['CHARSET'][$charset] = array();
  63                 return false;
  64         }
  65 }
  66
  67
  68 /**
  69  * Verifier qu'on peut utiliser mb_string
  70  *
  71  * @return bool
  72  *     true si toutes les fonctions mb nécessaires sont présentes
  73 **/
  74 function init_mb_string() {
  75         static $mb;
  76
  77         // verifier que tout est present (fonctions mb_string pour php >= 4.0.6)
  78         // et que le charset interne est connu de mb_string
  79         if (!$mb) {
  80                 if (function_exists('mb_internal_encoding')
  81                 AND function_exists('mb_detect_order')
  82                 AND function_exists('mb_substr')
  83                 AND function_exists('mb_strlen')
  84                 AND function_exists('mb_encode_mimeheader')
  85                 AND function_exists('mb_encode_numericentity')
  86                 AND function_exists('mb_decode_numericentity')
  87                 AND mb_detect_order($GLOBALS['meta']['charset'])
  88                 ) {
  89                         mb_internal_encoding('utf-8');
  90                         $mb = 1;
  91                 } else
  92                         $mb = -1;
  93         }
  94
  95         return ($mb == 1);
  96 }
  97
  98 /**
  99  * Test le fonctionnement correct d'iconv
 100  *
 101  * Celui-ci coupe sur certaines versions la chaine
 102  * quand un caractère n'appartient pas au charset
 103  *
 104  * @link http://php.net/manual/fr/function.iconv.php
 105  *
 106  * @return bool
 107  *     true si iconv fonctionne correctement
 108 **/
 109 function test_iconv() {
 110         static $iconv_ok;
 111
 112         if (!$iconv_ok) {
 113                 if (!function_exists('iconv'))
 114                         $iconv_ok = -1;
 115                 else {
 116                         if (utf_32_to_unicode(@iconv('utf-8', 'utf-32', 'chaine de test')) == 'chaine de test')
 117                                 $iconv_ok = 1;
 118                         else
 119                                 $iconv_ok = -1;
 120                 }
 121         }
 122         return ($iconv_ok == 1);
 123 }
 124
 125
 126 /**
 127  * Test de fonctionnement du support UTF-8 dans PCRE
 128  *
 129  * Contournement bug Debian Woody
 130  *
 131  * @return bool
 132  *     true si PCRE supporte l'UTF-8 correctement
 133 **/
 134 function test_pcre_unicode() {
 135         static $pcre_ok = 0;
 136
 137         if (!$pcre_ok) {
 138                 $s = " ".chr(195).chr(169)."t".chr(195).chr(169)." ";
 139                 if (preg_match(',\W...\W,u', $s)) $pcre_ok = 1;
 140                 else $pcre_ok = -1;
 141         }
 142         return $pcre_ok == 1;
 143 }
 144
 145 /**
 146  * Renvoie une plage de caractères alphanumeriques unicodes (incomplet...)
 147  *
 148  * Retourne pour une expression rationnelle une plage
 149  * de caractères alphanumériques à utiliser entre crochets [$plage]
 150  *
 151  * @internal
 152  *    N'est pas utilisé
 153  *    Servait à inc/ortho passé dans le grenier
 154  * @return string
 155  *    Plage de caractères
 156 **/
 157 function pcre_lettres_unicode() {
 158         static $plage_unicode;
 159
 160         if (!$plage_unicode) {
 161                 if (test_pcre_unicode()) {
 162                         // cf. http://www.unicode.org/charts/
 163                         $plage_unicode = '\w' // iso-latin
 164                                 . '\x{100}-\x{24f}' // europeen etendu
 165                                 . '\x{300}-\x{1cff}' // des tas de trucs
 166                         ;
 167                 }
 168                 else {
 169                         // fallback a trois sous
 170                         $plage_unicode = '\w';
 171                 }
 172         }
 173         return $plage_unicode;
 174 }
 175
 176
 177 /**
 178  * Renvoie une plage de caractères de ponctuation unicode de 0x2000 a 0x206F
 179  *
 180  * Retourne pour une expression rationnelle une plage
 181  * de caractères de ponctuation à utiliser entre crochets [$plage]
 182  * (i.e. de 226-128-128 a 226-129-176)
 183  *
 184  * @internal
 185  *    N'est pas utilisé
 186  *    Servait à inc/ortho passé dans le grenier
 187  * @return string
 188  *    Plage de caractères
 189 **/
 190 function plage_punct_unicode() {
 191         return '\xE2(\x80[\x80-\xBF]|\x81[\x80-\xAF])';
 192 }
 193
 194 /**
 195  * Corriger des caractères non-conformes : 128-159
 196  *
 197  * Cf. charsets/iso-8859-1.php (qu'on recopie ici pour aller plus vite)
 198  * On peut passer un charset cible en parametre pour accelerer le passage iso-8859-1 -> autre charset
 199  *
 200  * @param string $texte
 201  *     Le texte à corriger
 202  * @param string $charset
 203  *     Charset d'origine du texte
 204  *     Par défaut (AUTO) utilise le charset du site
 205  * @param string $charset_cible
 206  *     Charset de destination (unicode par défaut)
 207  * @return string
 208  *     Texte corrigé
 209 **/
 210 function corriger_caracteres_windows($texte, $charset='AUTO', $charset_cible='unicode') {
 211         static $trans;
 212
 213         if (is_array($texte)) {
 214                 return array_map('corriger_caracteres_windows', $texte);
 215         }
 216
 217         if ($charset=='AUTO') $charset = $GLOBALS['meta']['charset'];
 218         if ($charset == 'utf-8') {
 219                 $p = chr(194);
 220                 if (strpos($texte,$p)==false)
 221                         return $texte;
 222         } else if ($charset == 'iso-8859-1') {
 223                 $p = '';
 224         } else
 225                 return $texte;
 226
 227         if (!isset($trans[$charset][$charset_cible])) {
 228                 $trans[$charset][$charset_cible] = array(
 229                         $p.chr(128) => "&#8364;",
 230                         $p.chr(129) => ' ', # pas affecte
 231                         $p.chr(130) => "&#8218;",
 232                         $p.chr(131) => "&#402;",
 233                         $p.chr(132) => "&#8222;",
 234                         $p.chr(133) => "&#8230;",
 235                         $p.chr(134) => "&#8224;",
 236                         $p.chr(135) => "&#8225;",
 237                         $p.chr(136) => "&#710;",
 238                         $p.chr(137) => "&#8240;",
 239                         $p.chr(138) => "&#352;",
 240                         $p.chr(139) => "&#8249;",
 241                         $p.chr(140) => "&#338;",
 242                         $p.chr(141) => ' ', # pas affecte
 243                         $p.chr(142) => "&#381;",
 244                         $p.chr(143) => ' ', # pas affecte
 245                         $p.chr(144) => ' ', # pas affecte
 246                         $p.chr(145) => "&#8216;",
 247                         $p.chr(146) => "&#8217;",
 248                         $p.chr(147) => "&#8220;",
 249                         $p.chr(148) => "&#8221;",
 250                         $p.chr(149) => "&#8226;",
 251                         $p.chr(150) => "&#8211;",
 252                         $p.chr(151) => "&#8212;",
 253                         $p.chr(152) => "&#732;",
 254                         $p.chr(153) => "&#8482;",
 255                         $p.chr(154) => "&#353;",
 256                         $p.chr(155) => "&#8250;",
 257                         $p.chr(156) => "&#339;",
 258                         $p.chr(157) => ' ', # pas affecte
 259                         $p.chr(158) => "&#382;",
 260                         $p.chr(159) => "&#376;",
 261                 );
 262                 if ($charset_cible!='unicode'){
 263                         foreach($trans[$charset][$charset_cible] as $k=>$c)
 264                                 $trans[$charset][$charset_cible][$k] = unicode2charset($c, $charset_cible);
 265                 }
 266         }
 267
 268         return @str_replace(array_keys($trans[$charset][$charset_cible]),
 269                            array_values($trans[$charset][$charset_cible]),$texte);
 270 }
 271
 272
 273
 274 /**
 275  * Transforme les entités HTML en unicode
 276  *
 277  * Transforme les &eacute; en &#123;
 278  *
 279  * @param string $texte
 280  *     Texte à convertir
 281  * @param bool $secure
 282  *     true pour *ne pas convertir* les caracteres malins &lt; &amp; etc.
 283  * @return string
 284  *     Texte converti
 285 **/
 286 function html2unicode($texte, $secure=false) {
 287         if (strpos($texte,'&') === false) return $texte;
 288         static $trans = array();
 289         if (!$trans) {
 290                 global $CHARSET;
 291                 load_charset('html');
 292                 foreach ($CHARSET['html'] as $key => $val) {
 293                         $trans["&$key;"] = $val;
 294                 }
 295         }
 296
 297         if ($secure)
 298                 return str_replace(array_keys($trans),array_values($trans),$texte);
 299         else
 300                 return str_replace(array('&amp;', '&quot;', '&lt;', '&gt;'),array('&', '"', '<', '>'),
 301                   str_replace(array_keys($trans),array_values($trans),$texte)
 302                 );
 303 }
 304
 305
 306 /**
 307  * Transforme les entités mathématiques (MathML) en unicode
 308  *
 309  * Transforme &angle; en &#x2220; ainsi que toutes autres entités mathématiques
 310  *
 311  * @param string $texte
 312  *     Texte à convertir
 313  * @return string
 314  *     Texte converti
 315 **/
 316 function mathml2unicode($texte) {
 317         static $trans;
 318         if (!$trans) {
 319                 global $CHARSET;
 320                 load_charset('mathml');
 321
 322                 foreach ($CHARSET['mathml'] as $key => $val)
 323                         $trans["&$key;"] = $val;
 324         }
 325
 326         return str_replace(array_keys($trans),array_values($trans),$texte);
 327 }
 328
 329
 330 /**
 331  * Transforme une chaine en entites unicode &#129;
 332  *
 333  * Utilise la librairie mb si elle est présente.
 334  *
 335  * @internal
 336  *     Note: l'argument $forcer est obsolete : il visait a ne pas
 337  *     convertir les accents iso-8859-1
 338  *
 339  * @param string $texte
 340  *     Texte à convertir
 341  * @param string $charset
 342  *     Charset actuel du texte
 343  *     Par défaut (AUTO), le charset est celui du site.
 344  * @return string
 345  *     Texte converti en unicode
 346 **/
 347 function charset2unicode($texte, $charset='AUTO' /* $forcer: obsolete*/) {
 348         static $trans;
 349
 350         if ($charset == 'AUTO')
 351                 $charset = $GLOBALS['meta']['charset'];
 352
 353         if ($charset == '') $charset = 'iso-8859-1';
 354         $charset = strtolower($charset);
 355
 356         switch ($charset) {
 357         case 'utf-8':
 358         case 'utf8':
 359                 return utf_8_to_unicode($texte);
 360
 361         case 'iso-8859-1':
 362                 $texte = corriger_caracteres_windows($texte, 'iso-8859-1');
 363                 // pas de break; ici, on suit sur default:
 364
 365         default:
 366                 // mbstring presente ?
 367                 if (init_mb_string()) {
 368                         if ($order = mb_detect_order() # mb_string connait-il $charset?
 369                         AND mb_detect_order($charset)) {
 370                                 $s = mb_convert_encoding($texte, 'utf-8', $charset);
 371                                 if ($s && $s != $texte) return utf_8_to_unicode($s);
 372                         }
 373                         mb_detect_order($order); # remettre comme precedemment
 374                 }
 375
 376                 // Sinon, peut-etre connaissons-nous ce charset ?
 377                 if (!isset($trans[$charset])) {
 378                         global $CHARSET;
 379                         if ($cset = load_charset($charset)
 380                         AND is_array($CHARSET[$cset]))
 381                                 foreach ($CHARSET[$cset] as $key => $val) {
 382                                         $trans[$charset][chr($key)] = '&#'.$val.';';
 383                         }
 384                 }
 385                 if (count($trans[$charset]))
 386                         return str_replace(array_keys($trans[$charset]),array_values($trans[$charset]),$texte);
 387
 388                 // Sinon demander a iconv (malgre le fait qu'il coupe quand un
 389                 // caractere n'appartient pas au charset, mais c'est un probleme
 390                 // surtout en utf-8, gere ci-dessus)
 391                 if (test_iconv()) {
 392                         $s = iconv($charset, 'utf-32le', $texte);
 393                         if ($s) return utf_32_to_unicode($s);
 394                 }
 395
 396                 // Au pire ne rien faire
 397                 spip_log("erreur charset '$charset' non supporte");
 398                 return $texte;
 399         }
 400 }
 401
 402
 403 /**
 404  * Transforme les entites unicode &#129; dans le charset specifie
 405  *
 406  * Attention on ne transforme pas les entites < &#128; car si elles
 407  * ont ete encodees ainsi c'est a dessein
 408  *
 409  * @param string $texte
 410  *     Texte unicode à transformer
 411  * @param string $charset
 412  *     Charset à appliquer au texte
 413  *     Par défaut (AUTO), le charset sera celui du site.
 414  * @return string
 415  *     Texte transformé dans le charset souhaité
 416 **/
 417 function unicode2charset($texte, $charset='AUTO') {
 418         static $CHARSET_REVERSE;
 419         static $trans = array();
 420
 421         if ($charset == 'AUTO')
 422                 $charset = $GLOBALS['meta']['charset'];
 423
 424         switch($charset) {
 425         case 'utf-8':
 426                 return unicode_to_utf_8($texte);
 427                 break;
 428
 429         default:
 430                 $charset = load_charset($charset);
 431
 432                 if (!is_array($CHARSET_REVERSE[$charset])) {
 433                         $CHARSET_REVERSE[$charset] = array_flip($GLOBALS['CHARSET'][$charset]);
 434                 }
 435
 436                 if (!isset($trans[$charset])){
 437                         $trans[$charset]=array();
 438                         $t = &$trans[$charset];
 439                         for($e=128;$e<255;$e++){
 440                                 $h = dechex($e);
 441                                 if ($s = isset($CHARSET_REVERSE[$charset][$e])){
 442                                         $s = $CHARSET_REVERSE[$charset][$e];
 443                                         $t['&#'.$e.';'] = $t['&#0'.$e.';'] = $t['&#00'.$e.';'] = chr($s);
 444                                         $t['&#x'.$h.';'] = $t['&#x0'.$h.';'] = $t['&#x00'.$h.';'] = chr($s);
 445                                 }
 446                                 else{
 447                                         $t['&#'.$e.';'] = $t['&#0'.$e.';'] = $t['&#00'.$e.';'] = chr($e);
 448                                         $t['&#x'.$h.';'] = $t['&#x0'.$h.';'] = $t['&#x00'.$h.';'] = chr($e);
 449                                 }
 450                         }
 451                 }
 452                 $texte = str_replace(array_keys($trans[$charset]),array_values($trans[$charset]),$texte);
 453                 return $texte;
 454         }
 455 }
 456
 457
 458 /**
 459  * Importer un texte depuis un charset externe vers le charset du site
 460  *
 461  * Les caracteres non resolus sont transformes en &#123;
 462  *
 463  * @param string $texte
 464  *     Texte unicode à importer
 465  * @param string $charset
 466  *     Charset d'origine du texte
 467  *     Par défaut (AUTO), le charset d'origine est celui du site.
 468  * @return string
 469  *     Texte transformé dans le charset site
 470 **/
 471 function importer_charset($texte, $charset = 'AUTO') {
 472         static $trans = array();
 473         // on traite le cas le plus frequent iso-8859-1 vers utf directement pour aller plus vite !
 474         if (($charset == 'iso-8859-1') && ($GLOBALS['meta']['charset']=='utf-8')){
 475                 $texte = corriger_caracteres_windows($texte, 'iso-8859-1',$GLOBALS['meta']['charset']);
 476                 if (init_mb_string()) {
 477                         if ($order = mb_detect_order() # mb_string connait-il $charset?
 478                         AND mb_detect_order($charset)) {
 479                                 $s = mb_convert_encoding($texte, 'utf-8', $charset);
 480                         }
 481                         mb_detect_order($order); # remettre comme precedemment
 482                         return $s;
 483                 }
 484                 // Sinon, peut-etre connaissons-nous ce charset ?
 485                 if (!isset($trans[$charset])) {
 486                         global $CHARSET;
 487                         if ($cset = load_charset($charset)
 488                         AND is_array($CHARSET[$cset]))
 489                                 foreach ($CHARSET[$cset] as $key => $val) {
 490                                         $trans[$charset][chr($key)] = unicode2charset('&#'.$val.';');
 491                         }
 492                 }
 493                 if (count($trans[$charset]))
 494                         return str_replace(array_keys($trans[$charset]),array_values($trans[$charset]),$texte);
 495                 return $texte;
 496         }
 497         return unicode2charset(charset2unicode($texte, $charset));
 498 }
 499
 500
 501 /**
 502  * Transforme un texte UTF-8 en unicode
 503  *
 504  * Utilise la librairie mb si présente
 505  *
 506  * @param string $source
 507  *    Texte UTF-8 à transformer
 508  * @return string
 509  *    Texte transformé en unicode
 510 **/
 511 function utf_8_to_unicode($source) {
 512
 513         // mb_string : methode rapide
 514         if (init_mb_string()) {
 515                 $convmap = array(0x7F, 0xFFFFFF, 0x0, 0xFFFFFF);
 516                 return mb_encode_numericentity($source, $convmap, 'UTF-8');
 517         }
 518
 519         // Sinon methode pas a pas
 520         static $decrement;
 521         static $shift;
 522
 523         // Cf. php.net, par Ronen. Adapte pour compatibilite < php4
 524         if (!is_array($decrement)) {
 525                 // array used to figure what number to decrement from character order value
 526                 // according to number of characters used to map unicode to ascii by utf-8
 527                 $decrement[4] = 240;
 528                 $decrement[3] = 224;
 529                 $decrement[2] = 192;
 530                 $decrement[1] = 0;
 531                 // the number of bits to shift each charNum by
 532                 $shift[1][0] = 0;
 533                 $shift[2][0] = 6;
 534                 $shift[2][1] = 0;
 535                 $shift[3][0] = 12;
 536                 $shift[3][1] = 6;
 537                 $shift[3][2] = 0;
 538                 $shift[4][0] = 18;
 539                 $shift[4][1] = 12;
 540                 $shift[4][2] = 6;
 541                 $shift[4][3] = 0;
 542         }
 543
 544         $pos = 0;
 545         $len = strlen ($source);
 546         $encodedString = '';
 547         while ($pos < $len) {
 548                 $char = '';
 549                 $ischar = false;
 550                 $asciiPos = ord (substr ($source, $pos, 1));
 551                 if (($asciiPos >= 240) && ($asciiPos <= 255)) {
 552                         // 4 chars representing one unicode character
 553                         $thisLetter = substr ($source, $pos, 4);
 554                         $pos += 4;
 555                 }
 556                 else if (($asciiPos >= 224) && ($asciiPos <= 239)) {
 557                         // 3 chars representing one unicode character
 558                         $thisLetter = substr ($source, $pos, 3);
 559                         $pos += 3;
 560                 }
 561                 else if (($asciiPos >= 192) && ($asciiPos <= 223)) {
 562                         // 2 chars representing one unicode character
 563                         $thisLetter = substr ($source, $pos, 2);
 564                         $pos += 2;
 565                 }
 566                 else {
 567                         // 1 char (lower ascii)
 568                         $thisLetter = substr ($source, $pos, 1);
 569                         $pos += 1;
 570                         $char = $thisLetter;
 571                         $ischar = true;
 572                 }
 573
 574                 if ($ischar)
 575                         $encodedString .= $char;
 576                 else {  // process the string representing the letter to a unicode entity
 577                         $thisLen = strlen ($thisLetter);
 578                         $thisPos = 0;
 579                         $decimalCode = 0;
 580                         while ($thisPos < $thisLen) {
 581                                 $thisCharOrd = ord (substr ($thisLetter, $thisPos, 1));
 582                                 if ($thisPos == 0) {
 583                                         $charNum = intval ($thisCharOrd - $decrement[$thisLen]);
 584                                         $decimalCode += ($charNum << $shift[$thisLen][$thisPos]);
 585                                 } else {
 586                                         $charNum = intval ($thisCharOrd - 128);
 587                                         $decimalCode += ($charNum << $shift[$thisLen][$thisPos]);
 588                                 }
 589                                 $thisPos++;
 590                         }
 591                         $encodedLetter = "&#". preg_replace('/^0+/', '', $decimalCode) . ';';
 592                         $encodedString .= $encodedLetter;
 593                 }
 594         }
 595         return $encodedString;
 596 }
 597
 598 /**
 599  * Transforme un texte UTF-32 en unicode
 600  *
 601  * UTF-32 ne sert plus que si on passe par iconv, c'est-a-dire quand
 602  * mb_string est absente ou ne connait pas notre charset.
 603  *
 604  * Mais on l'optimise quand meme par mb_string
 605  * => tout ca sera osolete quand on sera surs d'avoir mb_string
 606  *
 607  * @param string $source
 608  *    Texte UTF-8 à transformer
 609  * @return string
 610  *    Texte transformé en unicode
 611 **/
 612 function utf_32_to_unicode($source) {
 613
 614         // mb_string : methode rapide
 615         if (init_mb_string()) {
 616                 $convmap = array(0x7F, 0xFFFFFF, 0x0, 0xFFFFFF);
 617                 $source = mb_encode_numericentity($source, $convmap, 'UTF-32LE');
 618                 return str_replace(chr(0), '', $source);
 619         }
 620
 621         // Sinon methode lente
 622         $texte = '';
 623         while ($source) {
 624                 $words = unpack("V*", substr($source, 0, 1024));
 625                 $source = substr($source, 1024);
 626                 foreach ($words as $word) {
 627                         if ($word < 128)
 628                                 $texte .= chr($word);
 629                         // ignorer le BOM - http://www.unicode.org/faq/utf_bom.html
 630                         else if ($word != 65279)
 631                                 $texte .= '&#'.$word.';';
 632                 }
 633         }
 634         return $texte;
 635
 636 }
 637
 638
 639 /**
 640  * Transforme un numéro unicode en caractère utf-8
 641  *
 642  * Ce bloc provient de php.net
 643  * @author Ronen
 644  *
 645  * @param int $num
 646  *    Numéro de l'entité unicode
 647  * @return char
 648  *    Caractère utf8 si trouvé, '' sinon
 649 **/
 650 function caractere_utf_8($num) {
 651         if($num<128)
 652                 return chr($num);
 653         if($num<2048)
 654                 return chr(($num>>6)+192).chr(($num&63)+128);
 655         if($num<65536)
 656                 return chr(($num>>12)+224).chr((($num>>6)&63)+128).chr(($num&63)+128);
 657         if($num<1114112)
 658                 return chr($num>>18+240).chr((($num>>12)&63)+128).chr(($num>>6)&63+128). chr($num&63+128);
 659         return '';
 660 }
 661
 662 /**
 663  * Convertit un texte unicode en utf-8
 664  *
 665  * @param string $texte
 666  *     Texte à convertir
 667  * @return string
 668  *     Texte converti
 669 **/
 670 function unicode_to_utf_8($texte) {
 671
 672         // 1. Entites &#128; et suivantes
 673         $vu = array();
 674         if (preg_match_all(',&#0*([1-9][0-9][0-9]+);,S',
 675         $texte, $regs, PREG_SET_ORDER))
 676         foreach ($regs as $reg) {
 677                 if ($reg[1]>127 AND !isset($vu[$reg[0]]))
 678                         $vu[$reg[0]] = caractere_utf_8($reg[1]);
 679         }
 680         //$texte = str_replace(array_keys($vu), array_values($vu), $texte);
 681
 682         // 2. Entites > &#xFF;
 683         //$vu = array();
 684         if (preg_match_all(',&#x0*([1-9a-f][0-9a-f][0-9a-f]+);,iS',
 685         $texte, $regs, PREG_SET_ORDER))
 686         foreach ($regs as $reg) {
 687                 if (!isset($vu[$reg[0]]))
 688                         $vu[$reg[0]] = caractere_utf_8(hexdec($reg[1]));
 689         }
 690         return str_replace(array_keys($vu), array_values($vu), $texte);
 691
 692 }
 693
 694 /**
 695  * Convertit les unicode &#264; en javascript \u0108
 696  *
 697  * @param string $texte
 698  *     Texte à convertir
 699  * @return string
 700  *     Texte converti
 701 **/
 702 function unicode_to_javascript($texte) {
 703         $vu = array();
 704         while (preg_match(',&#0*([0-9]+);,S', $texte, $regs) AND !isset($vu[$regs[1]])) {
 705                 $num = $regs[1];
 706                 $vu[$num] = true;
 707                 $s = '\u'.sprintf("%04x", $num);
 708                 $texte = str_replace($regs[0], $s, $texte);
 709         }
 710         return $texte;
 711 }
 712
 713 /**
 714  * Convertit les %uxxxx (envoyés par javascript) en &#yyy unicode
 715  *
 716  * @param string $texte
 717  *     Texte à convertir
 718  * @return string
 719  *     Texte converti
 720 **/
 721 function javascript_to_unicode ($texte) {
 722         while (preg_match(",%u([0-9A-F][0-9A-F][0-9A-F][0-9A-F]),", $texte, $regs))
 723                 $texte = str_replace($regs[0],"&#".hexdec($regs[1]).";", $texte);
 724         return $texte;
 725 }
 726
 727 /**
 728  * Convertit les %E9 (envoyés par le browser) en chaîne du charset du site (binaire)
 729  *
 730  * @param string $texte
 731  *     Texte à convertir
 732  * @return string
 733  *     Texte converti
 734 **/
 735 function javascript_to_binary ($texte) {
 736         while (preg_match(",%([0-9A-F][0-9A-F]),", $texte, $regs))
 737                 $texte = str_replace($regs[0],chr(hexdec($regs[1])), $texte);
 738         return $texte;
 739 }
 740
 741
 742 // http://doc.spip.org/@translitteration_rapide
 743 function translitteration_rapide($texte, $charset='AUTO', $complexe='') {
 744         static $trans;
 745         if ($charset == 'AUTO')
 746                 $charset = $GLOBALS['meta']['charset'];
 747         if (!strlen($texte))
 748                 return $texte;
 749
 750         $table_translit ='translit'.$complexe;
 751
 752         // 2. Translitterer grace a la table predefinie
 753         if (!$trans[$complexe]) {
 754                 global $CHARSET;
 755                 load_charset($table_translit);
 756                 foreach ($CHARSET[$table_translit] as $key => $val)
 757                         $trans[$complexe][caractere_utf_8($key)] = $val;
 758         }
 759
 760         return str_replace(array_keys($trans[$complexe]),array_values($trans[$complexe]),$texte);
 761 }
 762
 763 //
 764 // Translitteration charset => ascii (pour l'indexation)
 765 // Attention les caracteres non reconnus sont renvoyes en utf-8
 766 //
 767 // http://doc.spip.org/@translitteration
 768 function translitteration($texte, $charset='AUTO', $complexe='') {
 769         // 0. Supprimer les caracteres illegaux
 770         include_spip('inc/filtres');
 771         $texte = corriger_caracteres($texte);
 772
 773         // 1. Passer le charset et les &eacute en utf-8
 774         $texte = unicode_to_utf_8(html2unicode(charset2unicode($texte, $charset, true)));
 775
 776         return translitteration_rapide($texte,$charset,$complexe);
 777 }
 778
 779 // &agrave; est retourne sous la forme "a`" et pas "a"
 780 // mais si $chiffre=true, on retourne "a8" (vietnamien)
 781 // http://doc.spip.org/@translitteration_complexe
 782 function translitteration_complexe($texte, $chiffres=false) {
 783         $texte = translitteration($texte,'AUTO','complexe');
 784
 785         if ($chiffres) {
 786                 $texte = preg_replace("/[aeiuoyd]['`?~.^+(-]{1,2}/eS",
 787                         "translitteration_chiffree('\\0')", $texte);
 788         }
 789
 790         return $texte;
 791 }
 792 // http://doc.spip.org/@translitteration_chiffree
 793 function translitteration_chiffree($car) {
 794         return strtr($car, "'`?~.^+(-", "123456789");
 795 }
 796
 797
 798 /**
 799  * Reconnaitre le BOM utf-8 (0xEFBBBF)
 800  *
 801  * @param string $texte
 802  *    Texte dont on vérifie la présence du BOM
 803  * @return bool
 804  *    true s'il a un BOM
 805 **/
 806 function bom_utf8($texte) {
 807         return (substr($texte, 0,3) == chr(0xEF).chr(0xBB).chr(0xBF));
 808 }
 809
 810 /**
 811  * Vérifie qu'une chaîne est en utf-8 valide
 812  *
 813  * Note: preg_replace permet de contourner un "stack overflow" sur PCRE
 814  *
 815  * @link http://us2.php.net/manual/fr/function.mb-detect-encoding.php#50087
 816  * @link http://w3.org/International/questions/qa-forms-utf-8.html
 817  *
 818  * @param string $string
 819  *     Texte dont on vérifie qu'il est de l'utf-8
 820  * @return bool
 821  *     true si c'est le cas
 822 **/
 823 function is_utf8($string) {
 824         return !strlen(
 825         preg_replace(
 826           ',[\x09\x0A\x0D\x20-\x7E]'            # ASCII
 827         . '|[\xC2-\xDF][\x80-\xBF]'             # non-overlong 2-byte
 828         . '|\xE0[\xA0-\xBF][\x80-\xBF]'         # excluding overlongs
 829         . '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'  # straight 3-byte
 830         . '|\xED[\x80-\x9F][\x80-\xBF]'         # excluding surrogates
 831         . '|\xF0[\x90-\xBF][\x80-\xBF]{2}'      # planes 1-3
 832         . '|[\xF1-\xF3][\x80-\xBF]{3}'          # planes 4-15
 833         . '|\xF4[\x80-\x8F][\x80-\xBF]{2}'      # plane 16
 834         . ',sS',
 835         '', $string));
 836 }
 837
 838 /**
 839  * Vérifie qu'une chaîne est en ascii valide
 840  *
 841  * @param string $string
 842  *     Texte dont on vérifie qu'il est de l'ascii
 843  * @return bool
 844  *     true si c'est le cas
 845 **/
 846 function is_ascii($string) {
 847         return !strlen(
 848         preg_replace(
 849         ',[\x09\x0A\x0D\x20-\x7E],sS',
 850         '', $string));
 851 }
 852
 853 // Transcode une page (attrapee sur le web, ou un squelette) en essayant
 854 // par tous les moyens de deviner son charset (y compris headers HTTP)
 855 // http://doc.spip.org/@transcoder_page
 856 function transcoder_page($texte, $headers='') {
 857
 858         // Si tout est < 128 pas la peine d'aller plus loin
 859         if (is_ascii($texte)) {
 860                 #spip_log('charset: ascii');
 861                 return $texte;
 862         }
 863
 864         // Reconnaitre le BOM utf-8 (0xEFBBBF)
 865         if (bom_utf8($texte)) {
 866                 $charset = 'utf-8';
 867                 $texte = substr($texte,3);
 868         }
 869
 870         // charset precise par le contenu (xml)
 871         else if (preg_match(
 872         ',<[?]xml[^>]*encoding[^>]*=[^>]*([-_a-z0-9]+?),UimsS', $texte, $regs))
 873                 $charset = trim(strtolower($regs[1]));
 874         // charset precise par le contenu (html)
 875         else if (preg_match(
 876         ',<(meta|html|body)[^>]*charset[^>]*=[^>]*([-_a-z0-9]+?),UimsS',
 877         $texte, $regs)
 878         # eviter #CHARSET des squelettes
 879         AND (($tmp = trim(strtolower($regs[2]))) != 'charset'))
 880                 $charset = $tmp;
 881         // charset de la reponse http
 882         else if (preg_match(',charset=([-_a-z0-9]+),i', $headers, $regs))
 883                 $charset = trim(strtolower($regs[1]));
 884         else $charset = '';
 885         // normaliser les noms du shif-jis japonais
 886         if (preg_match(',^(x|shift)[_-]s?jis$,i', $charset))
 887                 $charset = 'shift-jis';
 888
 889         if ($charset) {
 890                 spip_log("charset: $charset");
 891         } else {
 892                 // valeur par defaut
 893                 if (is_utf8($texte))
 894                         $charset = 'utf-8';
 895                 else
 896                         $charset = 'iso-8859-1';
 897                 spip_log("charset probable: $charset");
 898         }
 899
 900         return importer_charset($texte, $charset);
 901 }
 902
 903
 904 //
 905 // Gerer les outils mb_string
 906 //
 907
 908 /**
 909  * Coupe un texte selon substr()
 910  *
 911  * Coupe une chaîne en utilisant les outils mb* lorsque le site est en utf8
 912  *
 913  * @link http://fr.php.net/manual/fr/function.mb-substr.php
 914  * @link http://www.php.net/manual/fr/function.substr.php
 915  *
 916  * @param string $c         Le texte
 917  * @param int $start        Début
 918  * @param null|int $length  Longueur ou fin
 919  * @return string
 920  *     Le texte coupé
 921 **/
 922 function spip_substr($c, $start=0, $length = NULL) {
 923         // Si ce n'est pas utf-8, utiliser substr
 924         if ($GLOBALS['meta']['charset'] != 'utf-8') {
 925                 if ($length)
 926                         return substr($c, $start, $length);
 927                 else
 928                         substr($c, $start);
 929         }
 930
 931         // Si utf-8, voir si on dispose de mb_string
 932         if (init_mb_string()) {
 933                 if ($length)
 934                         return mb_substr($c, $start, $length);
 935                 else
 936                         return mb_substr($c, $start);
 937         }
 938
 939         // Version manuelle (cf. ci-dessous)
 940         return spip_substr_manuelle($c, $start, $length);
 941 }
 942
 943
 944 /**
 945  * Coupe un texte comme mb_substr()
 946  *
 947  * Version manuelle de substr utf8, pour php vieux et/ou mal installe
 948  *
 949  * @link http://fr.php.net/manual/fr/function.mb-substr.php
 950  * @used-by spip_substr()
 951  *
 952  * @param string $c         Le texte
 953  * @param int $start        Début
 954  * @param null|int $length  Longueur ou fin
 955  * @return string
 956  *     Le texte coupé
 957 **/
 958 function spip_substr_manuelle($c, $start, $length = NULL) {
 959
 960         // Cas pathologique
 961         if ($length === 0)
 962                 return '';
 963
 964         // S'il y a un demarrage, on se positionne
 965         if ($start > 0)
 966                 $c = substr($c, strlen(spip_substr_manuelle($c, 0, $start)));
 967         elseif ($start < 0)
 968                 return spip_substr_manuelle($c, spip_strlen($c)+$start, $length);
 969
 970         if (!$length)
 971                 return $c;
 972
 973         if ($length > 0) {
 974                 // on prend n fois la longueur desiree, pour etre surs d'avoir tout
 975                 // (un caractere utf-8 prenant au maximum n bytes)
 976                 $n = 0; while (preg_match(',[\x80-\xBF]{'.(++$n).'},', $c));
 977                 $c = substr($c, 0, $n*$length);
 978                 // puis, tant qu'on est trop long, on coupe...
 979                 while (($l = spip_strlen($c)) > $length)
 980                         $c = substr($c, 0, $length - $l);
 981                 return $c;
 982         }
 983
 984         // $length < 0
 985         return spip_substr_manuelle($c, 0, spip_strlen($c)+$length);
 986 }
 987
 988 /**
 989  * Rend majuscule le premier caractère d'une chaîne utf-8
 990  *
 991  * Version utf-8 d'ucfirst
 992  *
 993  * @param string $c
 994  *     La chaîne à transformer
 995  * @return string
 996  *     La chaîne avec une majuscule sur le premier mot
 997  */
 998 function spip_ucfirst($c){
 999         // Si ce n'est pas utf-8, utiliser ucfirst
1000         if ($GLOBALS['meta']['charset'] != 'utf-8')
1001                 return ucfirst($c);
1002         // Si on n'a pas mb_* on utilise ucfirst
1003         if (!init_mb_string())
1004                 return ucfirst($c);
1005
1006         $lettre1 = mb_strtoupper(spip_substr($c,0,1));
1007         return $lettre1.spip_substr($c,1);
1008 }
1009
/**
 * Return the length of a utf-8 string
 *
 * utf-8 flavour of strlen. Windows line breaks (\r\n) count for one
 * character only.
 *
 * @param string $c
 *     The string to measure
 * @return int
 *     Length of the string
 */
function spip_strlen($c) {
        // Collapse \r\n so that a line break is not counted twice
        $texte = str_replace("\r\n", "\n", $c);

        if ($GLOBALS['meta']['charset'] == 'utf-8') {
                // mb_strlen() when available
                if (init_mb_string()) {
                        return mb_strlen($texte);
                }

                // Manual method: drop the 10...... bytes, so that only
                // ascii bytes (0.......) and the leading bytes of utf-8
                // characters (11......) are counted
                $texte = preg_replace(',[\x80-\xBF],S', '', $texte);
        }

        // Single-byte charset, or continuation bytes already removed
        return strlen($texte);
}
1037
// Initialisation: start with an empty table of loaded charsets
$GLOBALS['CHARSET'] = array();

// Record in the 'pcre_u' meta whether the /u flag can be used in the
// preg_replace calls, so that some accented letters do not get broken:
// in utf-8 chr(195).chr(160) = a` whereas in iso-latin chr(160) = nbsp.
// The test is run when the meta is missing, or again when var_mode is
// passed in the URL without var_profile.
if (!isset($GLOBALS['meta']['pcre_u'])
  || (isset($_GET['var_mode']) && !isset($_GET['var_profile']))) {
        include_spip('inc/meta');
        if ($GLOBALS['meta']['charset'] == 'utf-8' && test_pcre_unicode()) {
                $u = 'u';
        } else {
                $u = '';
        }
        ecrire_meta('pcre_u', $u);
}
1053
1054 ?>