www/ecrire/inc/charsets.php

   1 <?php
   2
   3 /***************************************************************************\
   4  *  SPIP, Systeme de publication pour l'internet                           *
   5  *                                                                         *
   6  *  Copyright (c) 2001-2017                                                *
   7  *  Arnaud Martin, Antoine Pitrou, Philippe Riviere, Emmanuel Saint-James  *
   8  *                                                                         *
   9  *  Ce programme est un logiciel libre distribue sous licence GNU/GPL.     *
  10  *  Pour plus de details voir le fichier COPYING.txt ou l'aide en ligne.   *
  11 \***************************************************************************/
  12
  13 /**
  14  * Gestion des charsets et des conversions
  15  *
  16  * Ce fichier contient les fonctions relatives à la gestion de charsets,
  17  * à la conversion de textes dans différents charsets et
  18  * propose des fonctions émulant la librairie mb si elle est absente
  19  *
  20  * @package SPIP\Core\Texte\Charsets
  21  **/
  22
  23 // securité
  24 if (!defined('_ECRIRE_INC_VERSION')) {
  25         return;
  26 }
  27
  28 // se faciliter la lecture du charset
  29 include_spip('inc/config');
  30
  31 /**
  32  * Charge en mémoire la liste des caractères d'un charset
  33  *
  34  * Charsets supportés en natif : voir les tables dans ecrire/charsets/
  35  * Les autres charsets sont supportés via mbstring()
  36  *
  37  * @param string $charset
  38  *     Charset à charger.
  39  *     Par défaut (AUTO), utilise le charset du site
  40  * @return string|bool
  41  *     - Nom du charset
  42  *     - false si le charset n'est pas décrit dans le répertoire charsets/
  43  **/
  44 function load_charset($charset = 'AUTO') {
  45         if ($charset == 'AUTO') {
  46                 $charset = $GLOBALS['meta']['charset'];
  47         }
  48         $charset = trim(strtolower($charset));
  49         if (isset($GLOBALS['CHARSET'][$charset])) {
  50                 return $charset;
  51         }
  52
  53         if ($charset == 'utf-8') {
  54                 $GLOBALS['CHARSET'][$charset] = array();
  55
  56                 return $charset;
  57         }
  58
  59         // Quelques synonymes
  60         if ($charset == '') {
  61                 $charset = 'iso-8859-1';
  62         } else {
  63                 if ($charset == 'windows-1250') {
  64                         $charset = 'cp1250';
  65                 } else {
  66                         if ($charset == 'windows-1251') {
  67                                 $charset = 'cp1251';
  68                         } else {
  69                                 if ($charset == 'windows-1256') {
  70                                         $charset = 'cp1256';
  71                                 }
  72                         }
  73                 }
  74         }
  75
  76         if (find_in_path($charset . '.php', 'charsets/', true)) {
  77                 return $charset;
  78         } else {
  79                 spip_log("Erreur: pas de fichier de conversion 'charsets/$charset'");
  80                 $GLOBALS['CHARSET'][$charset] = array();
  81
  82                 return false;
  83         }
  84 }
  85
  86
  87 /**
  88  * Vérifier qu'on peut utiliser mb_string
  89  *
  90  * @return bool
  91  *     true si toutes les fonctions mb nécessaires sont présentes
  92  **/
  93 function init_mb_string() {
  94         static $mb;
  95
  96         // verifier que tout est present (fonctions mb_string pour php >= 4.0.6)
  97         // et que le charset interne est connu de mb_string
  98         if (!$mb) {
  99                 if (function_exists('mb_internal_encoding')
 100                         and function_exists('mb_detect_order')
 101                         and function_exists('mb_substr')
 102                         and function_exists('mb_strlen')
 103                         and function_exists('mb_strtolower')
 104                         and function_exists('mb_strtoupper')
 105                         and function_exists('mb_encode_mimeheader')
 106                         and function_exists('mb_encode_numericentity')
 107                         and function_exists('mb_decode_numericentity')
 108                         and mb_detect_order(lire_config('charset', _DEFAULT_CHARSET))
 109                 ) {
 110                         mb_internal_encoding('utf-8');
 111                         $mb = 1;
 112                 } else {
 113                         $mb = -1;
 114                 }
 115         }
 116
 117         return ($mb == 1);
 118 }
 119
 120 /**
 121  * Test le fonctionnement correct d'iconv
 122  *
 123  * Celui-ci coupe sur certaines versions la chaine
 124  * quand un caractère n'appartient pas au charset
 125  *
 126  * @link http://php.net/manual/fr/function.iconv.php
 127  *
 128  * @return bool
 129  *     true si iconv fonctionne correctement
 130  **/
 131 function test_iconv() {
 132         static $iconv_ok;
 133
 134         if (!$iconv_ok) {
 135                 if (!function_exists('iconv')) {
 136                         $iconv_ok = -1;
 137                 } else {
 138                         if (utf_32_to_unicode(@iconv('utf-8', 'utf-32', 'chaine de test')) == 'chaine de test') {
 139                                 $iconv_ok = 1;
 140                         } else {
 141                                 $iconv_ok = -1;
 142                         }
 143                 }
 144         }
 145
 146         return ($iconv_ok == 1);
 147 }
 148
 149
 150 /**
 151  * Test de fonctionnement du support UTF-8 dans PCRE
 152  *
 153  * Contournement bug Debian Woody
 154  *
 155  * @return bool
 156  *     true si PCRE supporte l'UTF-8 correctement
 157  **/
 158 function test_pcre_unicode() {
 159         static $pcre_ok = 0;
 160
 161         if (!$pcre_ok) {
 162                 $s = " " . chr(195) . chr(169) . "t" . chr(195) . chr(169) . " ";
 163                 if (preg_match(',\W...\W,u', $s)) {
 164                         $pcre_ok = 1;
 165                 } else {
 166                         $pcre_ok = -1;
 167                 }
 168         }
 169
 170         return $pcre_ok == 1;
 171 }
 172
 173 /**
 174  * Renvoie une plage de caractères alphanumeriques unicodes (incomplet...)
 175  *
 176  * Retourne pour une expression rationnelle une plage
 177  * de caractères alphanumériques à utiliser entre crochets [$plage]
 178  *
 179  * @internal
 180  *    N'est pas utilisé
 181  *    Servait à inc/ortho passé dans le grenier
 182  * @return string
 183  *    Plage de caractères
 184  **/
 185 function pcre_lettres_unicode() {
 186         static $plage_unicode;
 187
 188         if (!$plage_unicode) {
 189                 if (test_pcre_unicode()) {
 190                         // cf. http://www.unicode.org/charts/
 191                         $plage_unicode = '\w' // iso-latin
 192                                 . '\x{100}-\x{24f}' // europeen etendu
 193                                 . '\x{300}-\x{1cff}' // des tas de trucs
 194                         ;
 195                 } else {
 196                         // fallback a trois sous
 197                         $plage_unicode = '\w';
 198                 }
 199         }
 200
 201         return $plage_unicode;
 202 }
 203
 204
 205 /**
 206  * Renvoie une plage de caractères de ponctuation unicode de 0x2000 a 0x206F
 207  *
 208  * Retourne pour une expression rationnelle une plage
 209  * de caractères de ponctuation à utiliser entre crochets [$plage]
 210  * (i.e. de 226-128-128 a 226-129-176)
 211  *
 212  * @internal
 213  *    N'est pas utilisé
 214  *    Servait à inc/ortho passé dans le grenier
 215  * @return string
 216  *    Plage de caractères
 217  **/
 218 function plage_punct_unicode() {
 219         return '\xE2(\x80[\x80-\xBF]|\x81[\x80-\xAF])';
 220 }
 221
 222 /**
 223  * Corriger des caractères non-conformes : 128-159
 224  *
 225  * Cf. charsets/iso-8859-1.php (qu'on recopie ici pour aller plus vite)
 226  * On peut passer un charset cible en parametre pour accelerer le passage iso-8859-1 -> autre charset
 227  *
 228  * @param string|array $texte
 229  *     Le texte à corriger
 230  * @param string $charset
 231  *     Charset d'origine du texte
 232  *     Par défaut (AUTO) utilise le charset du site
 233  * @param string $charset_cible
 234  *     Charset de destination (unicode par défaut)
 235  * @return string|array
 236  *     Texte corrigé
 237  **/
 238 function corriger_caracteres_windows($texte, $charset = 'AUTO', $charset_cible = 'unicode') {
 239         static $trans;
 240
 241         if (is_array($texte)) {
 242                 return array_map('corriger_caracteres_windows', $texte);
 243         }
 244
 245         if ($charset == 'AUTO') {
 246                 $charset = lire_config('charset', _DEFAULT_CHARSET);
 247         }
 248         if ($charset == 'utf-8') {
 249                 $p = chr(194);
 250                 if (strpos($texte, $p) == false) {
 251                         return $texte;
 252                 }
 253         } else {
 254                 if ($charset == 'iso-8859-1') {
 255                         $p = '';
 256                 } else {
 257                         return $texte;
 258                 }
 259         }
 260
 261         if (!isset($trans[$charset][$charset_cible])) {
 262                 $trans[$charset][$charset_cible] = array(
 263                         $p . chr(128) => "&#8364;",
 264                         $p . chr(129) => ' ', # pas affecte
 265                         $p . chr(130) => "&#8218;",
 266                         $p . chr(131) => "&#402;",
 267                         $p . chr(132) => "&#8222;",
 268                         $p . chr(133) => "&#8230;",
 269                         $p . chr(134) => "&#8224;",
 270                         $p . chr(135) => "&#8225;",
 271                         $p . chr(136) => "&#710;",
 272                         $p . chr(137) => "&#8240;",
 273                         $p . chr(138) => "&#352;",
 274                         $p . chr(139) => "&#8249;",
 275                         $p . chr(140) => "&#338;",
 276                         $p . chr(141) => ' ', # pas affecte
 277                         $p . chr(142) => "&#381;",
 278                         $p . chr(143) => ' ', # pas affecte
 279                         $p . chr(144) => ' ', # pas affecte
 280                         $p . chr(145) => "&#8216;",
 281                         $p . chr(146) => "&#8217;",
 282                         $p . chr(147) => "&#8220;",
 283                         $p . chr(148) => "&#8221;",
 284                         $p . chr(149) => "&#8226;",
 285                         $p . chr(150) => "&#8211;",
 286                         $p . chr(151) => "&#8212;",
 287                         $p . chr(152) => "&#732;",
 288                         $p . chr(153) => "&#8482;",
 289                         $p . chr(154) => "&#353;",
 290                         $p . chr(155) => "&#8250;",
 291                         $p . chr(156) => "&#339;",
 292                         $p . chr(157) => ' ', # pas affecte
 293                         $p . chr(158) => "&#382;",
 294                         $p . chr(159) => "&#376;",
 295                 );
 296                 if ($charset_cible != 'unicode') {
 297                         foreach ($trans[$charset][$charset_cible] as $k => $c) {
 298                                 $trans[$charset][$charset_cible][$k] = unicode2charset($c, $charset_cible);
 299                         }
 300                 }
 301         }
 302
 303         return @str_replace(array_keys($trans[$charset][$charset_cible]),
 304                 array_values($trans[$charset][$charset_cible]), $texte);
 305 }
 306
 307
 308 /**
 309  * Transforme les entités HTML en unicode
 310  *
 311  * Transforme les &eacute; en &#123;
 312  *
 313  * @param string $texte
 314  *     Texte à convertir
 315  * @param bool $secure
 316  *     true pour *ne pas convertir* les caracteres malins &lt; &amp; etc.
 317  * @return string
 318  *     Texte converti
 319  **/
 320 function html2unicode($texte, $secure = false) {
 321         if (strpos($texte, '&') === false) {
 322                 return $texte;
 323         }
 324         static $trans = array();
 325         if (!$trans) {
 326                 load_charset('html');
 327                 foreach ($GLOBALS['CHARSET']['html'] as $key => $val) {
 328                         $trans["&$key;"] = $val;
 329                 }
 330         }
 331
 332         if ($secure) {
 333                 return str_replace(array_keys($trans), array_values($trans), $texte);
 334         } else {
 335                 return str_replace(array('&amp;', '&quot;', '&lt;', '&gt;'), array('&', '"', '<', '>'),
 336                         str_replace(array_keys($trans), array_values($trans), $texte)
 337                 );
 338         }
 339 }
 340
 341
 342 /**
 343  * Transforme les entités mathématiques (MathML) en unicode
 344  *
 345  * Transforme &angle; en &#x2220; ainsi que toutes autres entités mathématiques
 346  *
 347  * @param string $texte
 348  *     Texte à convertir
 349  * @return string
 350  *     Texte converti
 351  **/
 352 function mathml2unicode($texte) {
 353         static $trans;
 354         if (!$trans) {
 355                 load_charset('mathml');
 356
 357                 foreach ($GLOBALS['CHARSET']['mathml'] as $key => $val) {
 358                         $trans["&$key;"] = $val;
 359                 }
 360         }
 361
 362         return str_replace(array_keys($trans), array_values($trans), $texte);
 363 }
 364
 365
 366 /**
 367  * Transforme une chaine en entites unicode &#129;
 368  *
 369  * Utilise la librairie mb si elle est présente.
 370  *
 371  * @internal
 372  *     Note: l'argument $forcer est obsolete : il visait a ne pas
 373  *     convertir les accents iso-8859-1
 374  *
 375  * @param string $texte
 376  *     Texte à convertir
 377  * @param string $charset
 378  *     Charset actuel du texte
 379  *     Par défaut (AUTO), le charset est celui du site.
 380  * @return string
 381  *     Texte converti en unicode
 382  **/
 383 function charset2unicode($texte, $charset = 'AUTO' /* $forcer: obsolete*/) {
 384         static $trans;
 385
 386         if ($charset == 'AUTO') {
 387                 $charset = lire_config('charset', _DEFAULT_CHARSET);
 388         }
 389
 390         if ($charset == '') {
 391                 $charset = 'iso-8859-1';
 392         }
 393         $charset = strtolower($charset);
 394
 395         switch ($charset) {
 396                 case 'utf-8':
 397                 case 'utf8':
 398                         return utf_8_to_unicode($texte);
 399
 400                 case 'iso-8859-1':
 401                         $texte = corriger_caracteres_windows($texte, 'iso-8859-1');
 402                 // pas de break; ici, on suit sur default:
 403
 404                 default:
 405                         // mbstring presente ?
 406                         if (init_mb_string()) {
 407                                 if ($order = mb_detect_order() # mb_string connait-il $charset?
 408                                         and mb_detect_order($charset)
 409                                 ) {
 410                                         $s = mb_convert_encoding($texte, 'utf-8', $charset);
 411                                         if ($s && $s != $texte) {
 412                                                 return utf_8_to_unicode($s);
 413                                         }
 414                                 }
 415                                 mb_detect_order($order); # remettre comme precedemment
 416                         }
 417
 418                         // Sinon, peut-etre connaissons-nous ce charset ?
 419                         if (!isset($trans[$charset])) {
 420                                 if ($cset = load_charset($charset)
 421                                         and is_array($GLOBALS['CHARSET'][$cset])
 422                                 ) {
 423                                         foreach ($GLOBALS['CHARSET'][$cset] as $key => $val) {
 424                                                 $trans[$charset][chr($key)] = '&#' . $val . ';';
 425                                         }
 426                                 }
 427                         }
 428                         if (count($trans[$charset])) {
 429                                 return str_replace(array_keys($trans[$charset]), array_values($trans[$charset]), $texte);
 430                         }
 431
 432                         // Sinon demander a iconv (malgre le fait qu'il coupe quand un
 433                         // caractere n'appartient pas au charset, mais c'est un probleme
 434                         // surtout en utf-8, gere ci-dessus)
 435                         if (test_iconv()) {
 436                                 $s = iconv($charset, 'utf-32le', $texte);
 437                                 if ($s) {
 438                                         return utf_32_to_unicode($s);
 439                                 }
 440                         }
 441
 442                         // Au pire ne rien faire
 443                         spip_log("erreur charset '$charset' non supporte");
 444
 445                         return $texte;
 446         }
 447 }
 448
 449
 450 /**
 451  * Transforme les entites unicode &#129; dans le charset specifie
 452  *
 453  * Attention on ne transforme pas les entites < &#128; car si elles
 454  * ont ete encodees ainsi c'est a dessein
 455  *
 456  * @param string $texte
 457  *     Texte unicode à transformer
 458  * @param string $charset
 459  *     Charset à appliquer au texte
 460  *     Par défaut (AUTO), le charset sera celui du site.
 461  * @return string
 462  *     Texte transformé dans le charset souhaité
 463  **/
 464 function unicode2charset($texte, $charset = 'AUTO') {
 465         static $CHARSET_REVERSE;
 466         static $trans = array();
 467
 468         if ($charset == 'AUTO') {
 469                 $charset = lire_config('charset', _DEFAULT_CHARSET);
 470         }
 471
 472         switch ($charset) {
 473                 case 'utf-8':
 474                         return unicode_to_utf_8($texte);
 475                         break;
 476
 477                 default:
 478                         $charset = load_charset($charset);
 479
 480                         if (!is_array($CHARSET_REVERSE[$charset])) {
 481                                 $CHARSET_REVERSE[$charset] = array_flip($GLOBALS['CHARSET'][$charset]);
 482                         }
 483
 484                         if (!isset($trans[$charset])) {
 485                                 $trans[$charset] = array();
 486                                 $t = &$trans[$charset];
 487                                 for ($e = 128; $e < 255; $e++) {
 488                                         $h = dechex($e);
 489                                         if ($s = isset($CHARSET_REVERSE[$charset][$e])) {
 490                                                 $s = $CHARSET_REVERSE[$charset][$e];
 491                                                 $t['&#' . $e . ';'] = $t['&#0' . $e . ';'] = $t['&#00' . $e . ';'] = chr($s);
 492                                                 $t['&#x' . $h . ';'] = $t['&#x0' . $h . ';'] = $t['&#x00' . $h . ';'] = chr($s);
 493                                         } else {
 494                                                 $t['&#' . $e . ';'] = $t['&#0' . $e . ';'] = $t['&#00' . $e . ';'] = chr($e);
 495                                                 $t['&#x' . $h . ';'] = $t['&#x0' . $h . ';'] = $t['&#x00' . $h . ';'] = chr($e);
 496                                         }
 497                                 }
 498                         }
 499                         $texte = str_replace(array_keys($trans[$charset]), array_values($trans[$charset]), $texte);
 500
 501                         return $texte;
 502         }
 503 }
 504
 505
 506 /**
 507  * Importer un texte depuis un charset externe vers le charset du site
 508  *
 509  * Les caractères non resolus sont transformés en `&#123`;
 510  *
 511  * @param string $texte
 512  *     Texte unicode à importer
 513  * @param string $charset
 514  *     Charset d'origine du texte
 515  *     Par défaut (AUTO), le charset d'origine est celui du site.
 516  * @return string
 517  *     Texte transformé dans le charset site
 518  **/
 519 function importer_charset($texte, $charset = 'AUTO') {
 520         static $trans = array();
 521         // on traite le cas le plus frequent iso-8859-1 vers utf directement pour aller plus vite !
 522         if (($charset == 'iso-8859-1') && ($GLOBALS['meta']['charset'] == 'utf-8')) {
 523                 $texte = corriger_caracteres_windows($texte, 'iso-8859-1', $GLOBALS['meta']['charset']);
 524                 if (init_mb_string()) {
 525                         if ($order = mb_detect_order() # mb_string connait-il $charset?
 526                                 and mb_detect_order($charset)
 527                         ) {
 528                                 $s = mb_convert_encoding($texte, 'utf-8', $charset);
 529                         }
 530                         mb_detect_order($order); # remettre comme precedemment
 531                         return $s;
 532                 }
 533                 // Sinon, peut-etre connaissons-nous ce charset ?
 534                 if (!isset($trans[$charset])) {
 535                         if ($cset = load_charset($charset)
 536                                 and is_array($GLOBALS['CHARSET'][$cset])
 537                         ) {
 538                                 foreach ($GLOBALS['CHARSET'][$cset] as $key => $val) {
 539                                         $trans[$charset][chr($key)] = unicode2charset('&#' . $val . ';');
 540                                 }
 541                         }
 542                 }
 543                 if (count($trans[$charset])) {
 544                         return str_replace(array_keys($trans[$charset]), array_values($trans[$charset]), $texte);
 545                 }
 546
 547                 return $texte;
 548         }
 549
 550         return unicode2charset(charset2unicode($texte, $charset));
 551 }
 552
 553
 554 /**
 555  * Transforme un texte UTF-8 en unicode
 556  *
 557  * Utilise la librairie mb si présente
 558  *
 559  * @param string $source
 560  *    Texte UTF-8 à transformer
 561  * @return string
 562  *    Texte transformé en unicode
 563  **/
 564 function utf_8_to_unicode($source) {
 565
 566         // mb_string : methode rapide
 567         if (init_mb_string()) {
 568                 $convmap = array(0x7F, 0xFFFFFF, 0x0, 0xFFFFFF);
 569
 570                 return mb_encode_numericentity($source, $convmap, 'UTF-8');
 571         }
 572
 573         // Sinon methode pas a pas
 574         static $decrement;
 575         static $shift;
 576
 577         // Cf. php.net, par Ronen. Adapte pour compatibilite < php4
 578         if (!is_array($decrement)) {
 579                 // array used to figure what number to decrement from character order value
 580                 // according to number of characters used to map unicode to ascii by utf-8
 581                 $decrement[4] = 240;
 582                 $decrement[3] = 224;
 583                 $decrement[2] = 192;
 584                 $decrement[1] = 0;
 585                 // the number of bits to shift each charNum by
 586                 $shift[1][0] = 0;
 587                 $shift[2][0] = 6;
 588                 $shift[2][1] = 0;
 589                 $shift[3][0] = 12;
 590                 $shift[3][1] = 6;
 591                 $shift[3][2] = 0;
 592                 $shift[4][0] = 18;
 593                 $shift[4][1] = 12;
 594                 $shift[4][2] = 6;
 595                 $shift[4][3] = 0;
 596         }
 597
 598         $pos = 0;
 599         $len = strlen($source);
 600         $encodedString = '';
 601         while ($pos < $len) {
 602                 $char = '';
 603                 $ischar = false;
 604                 $asciiPos = ord(substr($source, $pos, 1));
 605                 if (($asciiPos >= 240) && ($asciiPos <= 255)) {
 606                         // 4 chars representing one unicode character
 607                         $thisLetter = substr($source, $pos, 4);
 608                         $pos += 4;
 609                 } else {
 610                         if (($asciiPos >= 224) && ($asciiPos <= 239)) {
 611                                 // 3 chars representing one unicode character
 612                                 $thisLetter = substr($source, $pos, 3);
 613                                 $pos += 3;
 614                         } else {
 615                                 if (($asciiPos >= 192) && ($asciiPos <= 223)) {
 616                                         // 2 chars representing one unicode character
 617                                         $thisLetter = substr($source, $pos, 2);
 618                                         $pos += 2;
 619                                 } else {
 620                                         // 1 char (lower ascii)
 621                                         $thisLetter = substr($source, $pos, 1);
 622                                         $pos += 1;
 623                                         $char = $thisLetter;
 624                                         $ischar = true;
 625                                 }
 626                         }
 627                 }
 628
 629                 if ($ischar) {
 630                         $encodedString .= $char;
 631                 } else {  // process the string representing the letter to a unicode entity
 632                         $thisLen = strlen($thisLetter);
 633                         $thisPos = 0;
 634                         $decimalCode = 0;
 635                         while ($thisPos < $thisLen) {
 636                                 $thisCharOrd = ord(substr($thisLetter, $thisPos, 1));
 637                                 if ($thisPos == 0) {
 638                                         $charNum = intval($thisCharOrd - $decrement[$thisLen]);
 639                                         $decimalCode += ($charNum << $shift[$thisLen][$thisPos]);
 640                                 } else {
 641                                         $charNum = intval($thisCharOrd - 128);
 642                                         $decimalCode += ($charNum << $shift[$thisLen][$thisPos]);
 643                                 }
 644                                 $thisPos++;
 645                         }
 646                         $encodedLetter = "&#" . preg_replace('/^0+/', '', $decimalCode) . ';';
 647                         $encodedString .= $encodedLetter;
 648                 }
 649         }
 650
 651         return $encodedString;
 652 }
 653
 654 /**
 655  * Transforme un texte UTF-32 en unicode
 656  *
 657  * UTF-32 ne sert plus que si on passe par iconv, c'est-a-dire quand
 658  * mb_string est absente ou ne connait pas notre charset.
 659  *
 660  * Mais on l'optimise quand meme par mb_string
 661  * => tout ca sera osolete quand on sera surs d'avoir mb_string
 662  *
 663  * @param string $source
 664  *    Texte UTF-8 à transformer
 665  * @return string
 666  *    Texte transformé en unicode
 667  **/
 668 function utf_32_to_unicode($source) {
 669
 670         // mb_string : methode rapide
 671         if (init_mb_string()) {
 672                 $convmap = array(0x7F, 0xFFFFFF, 0x0, 0xFFFFFF);
 673                 $source = mb_encode_numericentity($source, $convmap, 'UTF-32LE');
 674
 675                 return str_replace(chr(0), '', $source);
 676         }
 677
 678         // Sinon methode lente
 679         $texte = '';
 680         while ($source) {
 681                 $words = unpack("V*", substr($source, 0, 1024));
 682                 $source = substr($source, 1024);
 683                 foreach ($words as $word) {
 684                         if ($word < 128) {
 685                                 $texte .= chr($word);
 686                         } // ignorer le BOM - http://www.unicode.org/faq/utf_bom.html
 687                         else {
 688                                 if ($word != 65279) {
 689                                         $texte .= '&#' . $word . ';';
 690                                 }
 691                         }
 692                 }
 693         }
 694
 695         return $texte;
 696
 697 }
 698
 699
 700 /**
 701  * Transforme un numéro unicode en caractère utf-8
 702  *
 703  * Ce bloc provient de php.net
 704  *
 705  * @author Ronen
 706  *
 707  * @param int $num
 708  *    Numéro de l'entité unicode
 709  * @return char
 710  *    Caractère utf8 si trouvé, '' sinon
 711  **/
 712 function caractere_utf_8($num) {
 713         $num = intval($num);
 714         if ($num < 128) {
 715                 return chr($num);
 716         }
 717         if ($num < 2048) {
 718                 return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
 719         }
 720         if ($num < 65536) {
 721                 return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
 722         }
 723         if ($num < 1114112) {
 724                 return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
 725         }
 726
 727         return '';
 728 }
 729
 730 /**
 731  * Convertit un texte unicode en utf-8
 732  *
 733  * @param string $texte
 734  *     Texte à convertir
 735  * @return string
 736  *     Texte converti
 737  **/
 738 function unicode_to_utf_8($texte) {
 739
 740         // 1. Entites &#128; et suivantes
 741         $vu = array();
 742         if (preg_match_all(',&#0*([1-9][0-9][0-9]+);,S',
 743                 $texte, $regs, PREG_SET_ORDER)) {
 744                 foreach ($regs as $reg) {
 745                         if ($reg[1] > 127 and !isset($vu[$reg[0]])) {
 746                                 $vu[$reg[0]] = caractere_utf_8($reg[1]);
 747                         }
 748                 }
 749         }
 750         //$texte = str_replace(array_keys($vu), array_values($vu), $texte);
 751
 752         // 2. Entites > &#xFF;
 753         //$vu = array();
 754         if (preg_match_all(',&#x0*([1-9a-f][0-9a-f][0-9a-f]+);,iS',
 755                 $texte, $regs, PREG_SET_ORDER)) {
 756                 foreach ($regs as $reg) {
 757                         if (!isset($vu[$reg[0]])) {
 758                                 $vu[$reg[0]] = caractere_utf_8(hexdec($reg[1]));
 759                         }
 760                 }
 761         }
 762
 763         return str_replace(array_keys($vu), array_values($vu), $texte);
 764
 765 }
 766
 767 /**
 768  * Convertit les unicode &#264; en javascript \u0108
 769  *
 770  * @param string $texte
 771  *     Texte à convertir
 772  * @return string
 773  *     Texte converti
 774  **/
 775 function unicode_to_javascript($texte) {
 776         $vu = array();
 777         while (preg_match(',&#0*([0-9]+);,S', $texte, $regs) and !isset($vu[$regs[1]])) {
 778                 $num = $regs[1];
 779                 $vu[$num] = true;
 780                 $s = '\u' . sprintf("%04x", $num);
 781                 $texte = str_replace($regs[0], $s, $texte);
 782         }
 783
 784         return $texte;
 785 }
 786
 787 /**
 788  * Convertit les %uxxxx (envoyés par javascript) en &#yyy unicode
 789  *
 790  * @param string $texte
 791  *     Texte à convertir
 792  * @return string
 793  *     Texte converti
 794  **/
 795 function javascript_to_unicode($texte) {
 796         while (preg_match(",%u([0-9A-F][0-9A-F][0-9A-F][0-9A-F]),", $texte, $regs)) {
 797                 $texte = str_replace($regs[0], "&#" . hexdec($regs[1]) . ";", $texte);
 798         }
 799
 800         return $texte;
 801 }
 802
 803 /**
 804  * Convertit les %E9 (envoyés par le browser) en chaîne du charset du site (binaire)
 805  *
 806  * @param string $texte
 807  *     Texte à convertir
 808  * @return string
 809  *     Texte converti
 810  **/
 811 function javascript_to_binary($texte) {
 812         while (preg_match(",%([0-9A-F][0-9A-F]),", $texte, $regs)) {
 813                 $texte = str_replace($regs[0], chr(hexdec($regs[1])), $texte);
 814         }
 815
 816         return $texte;
 817 }
 818
 819
 820 /**
 821  * Substition rapide de chaque graphème selon le charset sélectionné.
 822  *
 823  * @uses caractere_utf_8()
 824  *
 825  * @global array $CHARSET
 826  * @staticvar array $trans
 827  *
 828  * @param string $texte
 829  * @param string $charset
 830  * @param string $complexe
 831  * @return string
 832  */
 833 function translitteration_rapide($texte, $charset = 'AUTO', $complexe = '') {
 834         static $trans;
 835         if ($charset == 'AUTO') {
 836                 $charset = $GLOBALS['meta']['charset'];
 837         }
 838         if (!strlen($texte)) {
 839                 return $texte;
 840         }
 841
 842         $table_translit = 'translit' . $complexe;
 843
 844         // 2. Translitterer grace a la table predefinie
 845         if (!$trans[$complexe]) {
 846                 load_charset($table_translit);
 847                 foreach ($GLOBALS['CHARSET'][$table_translit] as $key => $val) {
 848                         $trans[$complexe][caractere_utf_8($key)] = $val;
 849                 }
 850         }
 851
 852         return str_replace(array_keys($trans[$complexe]), array_values($trans[$complexe]), $texte);
 853 }
 854
 855 /**
 856  * Translittération charset => ascii (pour l'indexation)
 857  *
 858  * Permet, entre autres, d’enlever les accents,
 859  * car la table ASCII non étendue ne les comporte pas.
 860  *
 861  * Attention les caractères non reconnus sont renvoyés en utf-8
 862  *
 863  * @uses corriger_caracteres()
 864  * @uses unicode_to_utf_8()
 865  * @uses html2unicode()
 866  * @uses charset2unicode()
 867  * @uses translitteration_rapide()
 868  *
 869  * @param string $texte
 870  * @param string $charset
 871  * @param string $complexe
 872  * @return string
 873  */
 874 function translitteration($texte, $charset = 'AUTO', $complexe = '') {
 875         // 0. Supprimer les caracteres illegaux
 876         include_spip('inc/filtres');
 877         $texte = corriger_caracteres($texte);
 878
 879         // 1. Passer le charset et les &eacute en utf-8
 880         $texte = unicode_to_utf_8(html2unicode(charset2unicode($texte, $charset, true)));
 881
 882         return translitteration_rapide($texte, $charset, $complexe);
 883 }
 884
 885 /**
 886  * Translittération complexe
 887  *
 888  * `&agrave;` est retourné sous la forme ``a` `` et pas `à`
 889  * mais si `$chiffre=true`, on retourne `a8` (vietnamien)
 890  *
 891  * @uses translitteration()
 892  * @param string $texte
 893  * @param bool $chiffres
 894  * @return string
 895  */
 896 function translitteration_complexe($texte, $chiffres = false) {
 897         $texte = translitteration($texte, 'AUTO', 'complexe');
 898
 899         if ($chiffres) {
 900                 $texte = preg_replace("/[aeiuoyd]['`?~.^+(-]{1,2}/eS",
 901                         "translitteration_chiffree('\\0')", $texte);
 902         }
 903
 904         return $texte;
 905 }
 906
 907 /**
 908  * Translittération chiffrée
 909  *
 910  * Remplace des caractères dans une chaîne par des chiffres
 911  *
 912  * @param string $car
 913  * @return string
 914  */
 915 function translitteration_chiffree($car) {
 916         return strtr($car, "'`?~.^+(-", "123456789");
 917 }
 918
 919
 920 /**
 921  * Reconnaitre le BOM utf-8 (0xEFBBBF)
 922  *
 923  * @param string $texte
 924  *    Texte dont on vérifie la présence du BOM
 925  * @return bool
 926  *    true s'il a un BOM
 927  **/
 928 function bom_utf8($texte) {
 929         return (substr($texte, 0, 3) == chr(0xEF) . chr(0xBB) . chr(0xBF));
 930 }
 931
 932 /**
 933  * Vérifie qu'une chaîne est en utf-8 valide
 934  *
 935  * Note: preg_replace permet de contourner un "stack overflow" sur PCRE
 936  *
 937  * @link http://us2.php.net/manual/fr/function.mb-detect-encoding.php#50087
 938  * @link http://w3.org/International/questions/qa-forms-utf-8.html
 939  *
 940  * @param string $string
 941  *     Texte dont on vérifie qu'il est de l'utf-8
 942  * @return bool
 943  *     true si c'est le cas
 944  **/
 945 function is_utf8($string) {
 946         return !strlen(
 947                 preg_replace(
 948                         ',[\x09\x0A\x0D\x20-\x7E]'            # ASCII
 949                         . '|[\xC2-\xDF][\x80-\xBF]'             # non-overlong 2-byte
 950                         . '|\xE0[\xA0-\xBF][\x80-\xBF]'         # excluding overlongs
 951                         . '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'  # straight 3-byte
 952                         . '|\xED[\x80-\x9F][\x80-\xBF]'         # excluding surrogates
 953                         . '|\xF0[\x90-\xBF][\x80-\xBF]{2}'      # planes 1-3
 954                         . '|[\xF1-\xF3][\x80-\xBF]{3}'          # planes 4-15
 955                         . '|\xF4[\x80-\x8F][\x80-\xBF]{2}'      # plane 16
 956                         . ',sS',
 957                         '', $string));
 958 }
 959
 960 /**
 961  * Vérifie qu'une chaîne est en ascii valide
 962  *
 963  * @param string $string
 964  *     Texte dont on vérifie qu'il est de l'ascii
 965  * @return bool
 966  *     true si c'est le cas
 967  **/
 968 function is_ascii($string) {
 969         return !strlen(
 970                 preg_replace(
 971                         ',[\x09\x0A\x0D\x20-\x7E],sS',
 972                         '', $string));
 973 }
 974
 975 /**
 976  * Transcode une page vers le charset du site
 977  *
 978  * Transcode une page (attrapée sur le web, ou un squelette) vers le
 979  * charset du site en essayant par tous les moyens de deviner son charset
 980  * (y compris dans les headers HTTP)
 981  *
 982  * @param string $texte
 983  *     Page à transcoder, dont on souhaite découvrir son charset
 984  * @param string $headers
 985  *     Éventuels headers HTTP liés à cette page
 986  * @return string
 987  *     Texte transcodé dans le charset du site
 988  **/
 989 function transcoder_page($texte, $headers = '') {
 990
 991         // Si tout est < 128 pas la peine d'aller plus loin
 992         if (is_ascii($texte)) {
 993                 #spip_log('charset: ascii');
 994                 return $texte;
 995         }
 996
 997         // Reconnaitre le BOM utf-8 (0xEFBBBF)
 998         if (bom_utf8($texte)) {
 999                 $charset = 'utf-8';
1000                 $texte = substr($texte, 3);
1001         } // charset precise par le contenu (xml)
1002         else {
1003                 if (preg_match(
1004                         ',<[?]xml[^>]*encoding[^>]*=[^>]*([-_a-z0-9]+?),UimsS', $texte, $regs)) {
1005                         $charset = trim(strtolower($regs[1]));
1006                 } // charset precise par le contenu (html)
1007                 else {
1008                         if (preg_match(
1009                                         ',<(meta|html|body)[^>]*charset[^>]*=[^>]*([-_a-z0-9]+?),UimsS',
1010                                         $texte, $regs)
1011                                 # eviter #CHARSET des squelettes
1012                                 and (($tmp = trim(strtolower($regs[2]))) != 'charset')
1013                         ) {
1014                                 $charset = $tmp;
1015                         } // charset de la reponse http
1016                         else {
1017                                 if (preg_match(',charset=([-_a-z0-9]+),i', $headers, $regs)) {
1018                                         $charset = trim(strtolower($regs[1]));
1019                                 } else {
1020                                         $charset = '';
1021                                 }
1022                         }
1023                 }
1024         }
1025         // normaliser les noms du shif-jis japonais
1026         if (preg_match(',^(x|shift)[_-]s?jis$,i', $charset)) {
1027                 $charset = 'shift-jis';
1028         }
1029
1030         if ($charset) {
1031                 spip_log("charset: $charset");
1032         } else {
1033                 // valeur par defaut
1034                 if (is_utf8($texte)) {
1035                         $charset = 'utf-8';
1036                 } else {
1037                         $charset = 'iso-8859-1';
1038                 }
1039                 spip_log("charset probable: $charset");
1040         }
1041
1042         return importer_charset($texte, $charset);
1043 }
1044
1045
1046 //
1047 // Gerer les outils mb_string
1048 //
1049
/**
 * Cuts a text like substr()
 *
 * Cuts a string with the mb* tools when the site is in utf-8.
 * When the site is not in utf-8, the native substr() is always used.
 *
 * In the substr() and mb_substr() branches, a falsy `$length` (null or 0)
 * means "up to the end of the string".
 *
 * @link http://fr.php.net/manual/fr/function.mb-substr.php
 * @link http://www.php.net/manual/fr/function.substr.php
 * @uses spip_substr_manuelle() if the php mb functions are missing
 *
 * @param string $c The text
 * @param int $start Start
 * @param null|int $length Length or end
 * @return string
 *     The cut text
 **/
function spip_substr($c, $start = 0, $length = null) {
        // Not utf-8: byte-wise substr() is the right tool
        if ($GLOBALS['meta']['charset'] != 'utf-8') {
                return $length ? substr($c, $start, $length) : substr($c, $start);
        }

        // utf-8: use mb_string when available
        if (init_mb_string()) {
                return $length ? mb_substr($c, $start, $length) : mb_substr($c, $start);
        }

        // Manual version
        return spip_substr_manuelle($c, $start, $length);
}
1087
1088
/**
 * Cuts a text like mb_substr()
 *
 * Manual version of a utf-8 substr, for old and/or badly installed php.
 * Works on bytes with substr() and relies on spip_strlen() to count
 * characters.
 *
 * @link http://fr.php.net/manual/fr/function.mb-substr.php
 *
 * @param string $c The text
 * @param int $start Start
 * @param null|int $length Length or end
 * @return string
 *     The cut text
 **/
function spip_substr_manuelle($c, $start, $length = null) {

        // Pathological case: an explicit length of 0 gives an empty string
        if ($length === 0) {
                return '';
        }

        // If there is a start offset, move to it
        if ($start > 0) {
                // Byte size of the first $start characters, then drop those bytes
                $c = substr($c, strlen(spip_substr_manuelle($c, 0, $start)));
        } elseif ($start < 0) {
                // Negative start: convert it to an offset from the beginning
                return spip_substr_manuelle($c, spip_strlen($c) + $start, $length);
        }

        // No length given: everything after the start is returned
        if (!$length) {
                return $c;
        }

        if ($length > 0) {
                // take n times the wanted length, to be sure to have everything
                // (a utf-8 character taking at most n bytes);
                // n is the first run length of bytes 0x80-0xBF not found in the text
                $n = 0;
                while (preg_match(',[\x80-\xBF]{' . (++$n) . '},', $c)) {
                        ;
                }
                $c = substr($c, 0, $n * $length);
                // then, as long as it is too long, cut...
                // ($length - $l is negative: that many bytes are removed from the end)
                while (($l = spip_strlen($c)) > $length) {
                        $c = substr($c, 0, $length - $l);
                }

                return $c;
        }

        // $length < 0: convert it to a positive length counted from the start
        return spip_substr_manuelle($c, 0, spip_strlen($c) + $length);
}
1139
/**
 * Uppercases the first character of a utf-8 string
 *
 * utf-8 version of ucfirst. Falls back to the native ucfirst() when the
 * mb_* functions are not available or when the site is not in utf-8.
 *
 * @param string $c
 *     The string to transform
 * @return string
 *     The string with its first character in uppercase
 */
function spip_ucfirst($c) {
        // mb_* available and utf-8 site: work on characters, not on bytes
        if (init_mb_string() and $GLOBALS['meta']['charset'] == 'utf-8') {
                $initiale = mb_strtoupper(spip_substr($c, 0, 1));

                return $initiale . spip_substr($c, 1);
        }

        return ucfirst($c);
}
1160
/**
 * Lowercases a utf-8 string
 *
 * utf-8 version of strtolower. Falls back to the native strtolower() when
 * the mb_* functions are not available or when the site is not in utf-8.
 *
 * @param string $c
 *     The string to transform
 * @return string
 *     The string in lowercase
 */
function spip_strtolower($c) {
        // mb_* available and utf-8 site: use the multibyte version
        if (init_mb_string() and $GLOBALS['meta']['charset'] == 'utf-8') {
                return mb_strtolower($c);
        }

        return strtolower($c);
}
1179
/**
 * Returns the length of a utf-8 string
 *
 * utf-8 version of strlen. A "\r\n" line break counts for one character.
 *
 * @param string $c
 *     The string to measure
 * @return int
 *     Length of the string
 */
function spip_strlen($c) {
        // Turn "\r\n" into "\n" so that a line break is not counted twice
        $c = str_replace("\r\n", "\n", $c);

        if ($GLOBALS['meta']['charset'] == 'utf-8') {
                // Use mb_strlen() when available
                if (init_mb_string()) {
                        return mb_strlen($c);
                }

                // Manual method: remove the 10...... bytes, which leaves
                // the ascii bytes (0.......) and the first bytes of utf-8
                // characters (11......) to be counted
                $c = preg_replace(',[\x80-\xBF],S', '', $c);
        }

        // Not utf-8 (or continuation bytes already removed): count bytes
        return strlen($c);
}
1209
// Initialisation: start with an empty list of loaded charset tables
$GLOBALS['CHARSET'] = array();

// Record in the 'pcre_u' meta our ability to use the /u flag
// in preg_replace, so as not to break some accented letters:
// in utf-8 chr(195).chr(160) = a` whereas in iso-latin chr(160) = nbsp.
// The meta is written when it is not set yet, or when var_mode is present
// in the query string without var_profile.
if (!isset($GLOBALS['meta']['pcre_u'])
        or (isset($_GET['var_mode']) and !isset($_GET['var_profile']))
) {
        include_spip('inc/meta');
        // 'u' only if the configured charset is utf-8 and test_pcre_unicode() succeeds
        ecrire_meta('pcre_u',
                $u = (lire_config('charset', _DEFAULT_CHARSET) == 'utf-8'
                        and test_pcre_unicode())
                        ? 'u' : ''
        );
}
1226
1227
/**
 * Turns a utf-8 string into utf-8 without "planes"
 *
 * This makes it possible to give it to MySQL "utf8", which is not a
 * complete utf-8. The alternative would be to use utf8mb4.
 *
 * Every 4-byte sequence found in the string is replaced by the result
 * of utf_8_to_unicode() for that sequence.
 *
 * @uses utf_8_to_unicode()
 *
 * @param string $x
 *     The string to transform
 * @return string
 *     The string with the utf-8 characters of the high "planes" escaped
 *     as unicode entities: &#128169;
 */
function utf8_noplanes($x) {
        $motif_4_octets = '/(
      \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
   | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
   |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
)/xS';

        $trouves = array();

        // No 4-byte character: the string can be used as is
        if (!preg_match_all($motif_4_octets, $x, $trouves, PREG_PATTERN_ORDER)) {
                return $x;
        }

        foreach ($trouves[0] as $caractere) {
                $x = str_replace($caractere, utf_8_to_unicode($caractere), $x);
        }

        return $x;
}