[SPIP] v3.2.1-->v3.2.3
[lhc/web/www.git] / www / plugins-dist / safehtml / lib / safehtml / classes / safehtml.php
1 <?php
2
3 /**
4 * SafeHTML Parser
5 *
6 * @note
7 * Attention : Quelques modifications pour PHP 5.5 et 7
8 *
9 * @package SafeHTML
10 * @author Roman Ivanov <thingol@mail.ru>
11 * @author Miguel Vazquez Gocobachi <demrit@mx.gnu.org>
12 * @copyright 2004-2009 Roman Ivanov, Miguel Vazquez Gocobachi
13 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
14 * @version 1.3.10
15 * @link https://wackowiki.org/doc/Dev/Projects/SafeHTML
16 */
17
18
19 if (!defined('_ECRIRE_INC_VERSION')) return;
20
21 require_once(XML_HTMLSAX3 . 'HTMLSax3.php');
22
/**
 * SafeHTML Parser
 *
 * This parser strips down all potentially dangerous content within HTML:
 * <ul>
 * <li>opening tag without its closing tag</li>
 * <li>closing tag without its opening tag</li>
 * <li>any of these tags: "base", "basefont", "head", "html", "body", "applet",
 * "object", "iframe", "frame", "frameset", "script", "layer", "ilayer", "embed",
 * "bgsound", "link", "meta", "style", "title", "blink", "xml" etc.</li>
 * <li>any of these attributes: on*, data*, dynsrc</li>
 * <li>javascript:/vbscript:/about: etc. protocols</li>
 * <li>expression/behavior etc. in styles</li>
 * <li>any other active content</li>
 * </ul>
 * It also tries to convert the markup to valid XHTML, but htmltidy is a far
 * better solution for that task.
 *
 * <b>Example:</b>
 * <pre>
 * $parser = new SafeHTML();
 * $result = $parser->parse($doc);
 * </pre>
 *
 * The handler methods (openHandler, closeHandler, dataHandler, escapeHandler)
 * are public because the SAX parser calls them back; parse() is the intended
 * entry point.
 *
 * @category HTML
 * @package SafeHTML
 * @author Roman Ivanov <thingol@mail.ru>
 * @copyright 1997-2005 Roman Ivanov
 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
 * @version Release: @package_version@
 * @link http://pear.php.net/package/SafeHTML
 */
class SafeHTML
{
    /**
     * Storage for resulting HTML output
     *
     * @var string
     */
    protected $xhtml = '';

    /**
     * Number of currently open occurrences of each tag, keyed by tag name
     *
     * @var array
     */
    protected $counter = array();

    /**
     * Stack of unclosed tags
     *
     * @var array
     */
    protected $stack = array();

    /**
     * Array of counters for tags that must be deleted with all content
     *
     * @var array
     */
    protected $dcCounter = array();

    /**
     * Stack of unclosed tags that must be deleted with all content
     *
     * @var array
     */
    protected $dcStack = array();

    /**
     * Stores level of list (ol/ul) nesting
     *
     * @var int
     */
    protected $listScope = 0;

    /**
     * Stack of unclosed list items; each entry is the list nesting level
     * at which the <li> was opened
     *
     * @var array
     */
    protected $liStack = array();

    /**
     * Array of prepared regular expressions for protocols (schemas) matching
     *
     * Built once in the constructor from $blackProtocols.
     *
     * @var array
     */
    protected $protoRegexps = array();

    /**
     * Array of prepared regular expressions for CSS matching
     *
     * Built once in the constructor from $cssKeywords.
     *
     * @var array
     */
    protected $cssRegexps = array();

    /**
     * Should we perform UTF7 repacking or not?
     *
     * This repacking might replace completely normal strings such as "+31-" by illegal sequences,
     * which cause the document to be truncated on saving to MySQL
     *
     * @var boolean
     */
    public $repackUTF7 = true;

    /**
     * Tags from $deleteTags that the caller explicitly allows
     *
     * @var array
     */
    protected $allowTags = array();

    /**
     * List of single tags ("<tag />")
     *
     * @var array
     */
    public $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', );

    /**
     * List of dangerous tags (such tags will be deleted)
     *
     * @var array
     */
    public $deleteTags = array(
        'applet', 'base', 'basefont', 'bgsound', 'blink', 'body',
        'embed', 'frame', 'frameset', 'head', 'html', 'ilayer',
        'iframe', 'layer', 'link', 'meta', 'object', 'style',
        'title', 'script',
    );

    /**
     * List of dangerous tags (such tags will be deleted, and all content
     * inside this tags will be also removed)
     *
     * @var array
     */
    public $deleteTagsContent = array('script', 'style', 'title', 'xml', );

    /**
     * Type of protocols filtering ('white' or 'black')
     *
     * @var string
     */
    public $protocolFiltering = 'white';

    /**
     * List of "dangerous" protocols (used for blacklist-filtering)
     *
     * @var array
     */
    public $blackProtocols = array(
        'about', 'chrome', 'data', 'disk', 'hcp',
        'help', 'javascript', 'livescript', 'lynxcgi', 'lynxexec',
        'ms-help', 'ms-its', 'mhtml', 'mocha', 'opera',
        'res', 'resource', 'shell', 'vbscript', 'view-source',
        'vnd.ms.radio', 'wysiwyg',
    );

    /**
     * List of "safe" protocols (used for whitelist-filtering)
     *
     * @var array
     */
    public $whiteProtocols = array(
        'ed2k', 'file', 'ftp', 'gopher', 'http', 'https',
        'irc', 'mailto', 'news', 'nntp', 'telnet', 'webcal',
        'xmpp', 'callto',
    );

    /**
     * List of attributes that can contain protocols
     *
     * @var array
     */
    public $protocolAttributes = array(
        'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src', 'formaction',
    );

    /**
     * List of dangerous CSS keywords
     *
     * Whole style="" attribute will be removed, if parser will find one of
     * these keywords
     *
     * @var array
     */
    public $cssKeywords = array(
        'absolute', 'behavior', 'behaviour', 'content', 'expression',
        'fixed', 'include-source', 'moz-binding',
    );

    /**
     * List of tags that can have no "closing tag"
     *
     * @var array
     * @deprecated XHTML does not allow such tags
     */
    public $noClose = array();

    /**
     * List of block-level tags that terminates paragraph
     *
     * Paragraph will be closed when this tags opened
     *
     * @var array
     */
    public $closeParagraph = array(
        'address', 'article', 'aside', 'audio', 'blockquote', 'canvas',
        'center', 'dd', 'dir', 'div', 'dl', 'dt',
        'figure', 'figcaption', 'footer', 'h1', 'h2', 'h3',
        'h4', 'h5', 'h6', 'header', 'hr', 'isindex',
        'listing', 'main', 'marquee', 'menu', 'multicol', 'nav',
        'ol', 'output', 'p', 'plaintext', 'pre', 'section',
        'table', 'ul', 'video', 'xmp',
    );

    /**
     * List of table tags, all table tags outside a table will be removed
     *
     * @var array
     */
    public $tableTags = array(
        'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
        'thead', 'tr',
    );

    /**
     * List of list tags
     *
     * @var array
     */
    public $listTags = array('dir', 'menu', 'ol', 'ul', 'dl', );

    /**
     * List of dangerous attributes (dropped on every tag except <a>)
     *
     * @var array
     */
    public $attributes = array('dynsrc', 'id', 'name', );

    /**
     * List of allowed "namespaced" attributes
     *
     * @var array
     */
    public $attributesNS = array('xml:lang', );

    /**
     * Prepares the protocol and CSS regular expressions.
     *
     * The expressions are built from $blackProtocols and $cssKeywords as they
     * are at construction time; changing those lists afterwards has no effect
     * on the prepared patterns.
     */
    public function __construct()
    {
        // Whitespace and control characters are tolerated between every letter
        // of a scheme, so "java\nscript:" is still recognised.
        $gap = "[\s\x01-\x1F]*";

        foreach ($this->blackProtocols as $proto) {
            $pattern = '/' . $gap;
            foreach (str_split($proto) as $char) {
                // preg_quote keeps "." in "vnd.ms.radio" a literal dot.
                $pattern .= preg_quote($char, '/') . $gap;
            }
            $this->protoRegexps[] = $pattern . ':/i';
        }

        foreach ($this->cssKeywords as $css) {
            $this->cssRegexps[] = '/' . preg_quote($css, '/') . '/i';
        }
    }

    /**
     * Handles the writing of attributes - called from $this->openHandler()
     *
     * Appends every attribute that survives filtering to the output buffer.
     * Dropped are: on* and data* attributes, the $attributes list (except on
     * <a>), names that are neither alphanumeric nor listed in $attributesNS,
     * style values containing a CSS keyword or black-listed protocol, and
     * protocol attributes whose scheme is rejected by the active filter.
     *
     * @param array $attrs array of attributes $name => $value
     * @param string|null $tag lower-cased name of the tag being written
     * @return boolean always true
     */
    protected function writeAttrs($attrs, $tag = null)
    {
        if (!is_array($attrs)) {
            return true;
        }

        foreach ($attrs as $name => $value) {
            $name = strtolower($name);

            // Prefix match: this also removes names such as "datetime".
            if (strpos($name, 'on') === 0 || strpos($name, 'data') === 0) {
                continue;
            }

            if ($tag !== 'a' && in_array($name, $this->attributes, true)) {
                continue;
            }

            if (!preg_match('/^[a-z0-9]+$/i', $name)
                && !in_array($name, $this->attributesNS, true)
            ) {
                continue;
            }

            // Minimised attribute (<input disabled>): XHTML wants name="name".
            if ($value === true || $value === null) {
                $value = $name;
            }

            if ($name === 'style') {
                $value = $this->normalizeStyle($value);

                if ($value === null
                    || $this->matchesAny($this->cssRegexps, $value)
                    || $this->matchesAny($this->protoRegexps, $value)
                ) {
                    continue;
                }
            }

            if (in_array($name, $this->protocolAttributes, true)) {
                // The scheme is checked on a decoded copy; the original value
                // is what gets written out.
                $decoded = $this->decodeEntities($value);

                if (strpos($decoded, ':') !== false && !$this->isProtocolAllowed($decoded)) {
                    continue;
                }
            }

            $value = str_replace('"', '&quot;', $value);
            $this->xhtml .= ' ' . $name . '="' . $value . '"';
        }

        return true;
    }

    /**
     * Canonicalises a style="" value before it is matched against the filters.
     *
     * Removes backslashes and CSS comments (repeatedly, so that removing one
     * comment cannot assemble another) and normalises "&" to "&amp;".
     *
     * @param string $value raw style attribute value
     * @return string|null normalised value, or null if PCRE failed
     */
    private function normalizeStyle($value)
    {
        // removes insignificant backslashes
        $value = str_replace('\\', '', (string) $value);

        // removes CSS comments
        do {
            $before = $value;
            $value = preg_replace('!/\*.*?\*/!s', '', $before);

            if ($value === null) {
                // Cannot vouch for a value we failed to inspect.
                return null;
            }
        } while ($value !== $before);

        // replace all & with &amp; without double-encoding existing ones
        $value = str_replace('&amp;', '&', $value);

        return str_replace('&', '&amp;', $value);
    }

    /**
     * Tells whether at least one of the patterns matches the subject.
     *
     * @param array $patterns list of PCRE patterns
     * @param string $subject text to test
     * @return boolean
     */
    private function matchesAny(array $patterns, $subject)
    {
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $subject)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Decodes character references so that an obfuscated scheme becomes visible.
     *
     * Numeric references are accepted with or without the trailing semicolon.
     * Named references (e.g. "&colon;", "&Tab;") are decoded as well, because
     * browsers resolve them inside attribute values before interpreting a URL.
     *
     * @param string $value raw attribute value
     * @return string decoded copy, used for inspection only
     */
    private function decodeEntities($value)
    {
        $value = (string) $value;

        $decoded = preg_replace_callback(
            '/&#(\d+);?/',
            static function ($matches) {
                return self::entityByte($matches[1], 10);
            },
            $value
        );
        $decoded = preg_replace_callback(
            '/&#x([0-9a-f]+);?/i',
            static function ($matches) {
                return self::entityByte($matches[1], 16);
            },
            (string) $decoded
        );

        return html_entity_decode((string) $decoded, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    }

    /**
     * Converts the digits of a numeric character reference to a single byte.
     *
     * Code points above 255 are folded modulo 256, as chr() does; this can only
     * make the protocol check stricter. Literals too long to fit a 32-bit
     * integer yield an empty string instead of being handed to chr().
     *
     * @param string $digits digits of the reference, without "&#", "x" or ";"
     * @param int $base 10 or 16
     * @return string one byte, or '' for an oversized literal
     */
    private static function entityByte($digits, $base)
    {
        $digits = ltrim($digits, '0');
        $maxLength = ($base === 16) ? 7 : 9;

        if (strlen($digits) > $maxLength) {
            return '';
        }

        return chr(intval($digits, $base) & 0xFF);
    }

    /**
     * Applies the configured protocol filter to a decoded URL containing ":".
     *
     * @param string $url entity-decoded attribute value
     * @return boolean true when the attribute may be kept
     */
    private function isProtocolAllowed($url)
    {
        if ($this->protocolFiltering === 'black') {
            return !$this->matchesAny($this->protoRegexps, $url);
        }

        // URI schemes are case-insensitive and browsers ignore leading blanks.
        $parts = explode(':', $url, 2);
        $scheme = strtolower(trim($parts[0]));

        return in_array($scheme, $this->whiteProtocols, true);
    }

    /**
     * Opening tag handler - called from HTMLSax
     *
     * @param object &$parser HTML Parser
     * @param string $name tag name
     * @param array $attrs tag attributes
     *
     * @return boolean always true
     */
    public function openHandler(&$parser, $name, $attrs)
    {
        $name = strtolower($name);

        if (in_array($name, $this->deleteTagsContent, true)) {
            $this->dcStack[] = $name;
            $this->dcCounter[$name] = isset($this->dcCounter[$name])
                ? $this->dcCounter[$name] + 1 : 1;
        }

        // Inside a "delete with content" tag everything is swallowed.
        if (count($this->dcStack) !== 0) {
            return true;
        }

        if (in_array($name, $this->deleteTags, true)
            && !in_array($name, $this->allowTags, true)
        ) {
            return true;
        }

        if (!preg_match('/^[a-z0-9]+$/i', $name)) {
            // Things like <user@example.com> or <http://...> are shown as text.
            if (preg_match('!(?:\@|://)!i', $name)) {
                // Escape any "<" so the echoed name cannot start new markup.
                $this->xhtml .= '&lt;' . str_replace('<', '&lt;', $name) . '&gt;';
            }
            return true;
        }

        if (in_array($name, $this->singleTags, true)) {
            $this->xhtml .= '<' . $name;
            $this->writeAttrs($attrs, $name);
            $this->xhtml .= ' />';
            return true;
        }

        // TABLES: cannot open table elements when we are not inside table.
        // NOTE: the rule only applies once a <table> has been seen by this
        // instance; until then table tags pass through (kept as-is).
        if (isset($this->counter['table'])
            && $this->counter['table'] <= 0
            && in_array($name, $this->tableTags, true)
        ) {
            return true;
        }

        // PARAGRAPHS: close paragraph when closeParagraph tags opening
        if (in_array($name, $this->closeParagraph, true)
            && in_array('p', $this->stack, true)
        ) {
            $this->closeHandler($parser, 'p');
        }

        // LISTS: we should close <li> if <li> of the same level opening
        if ($name === 'li' && count($this->liStack)
            && $this->listScope == $this->liStack[count($this->liStack) - 1]
        ) {
            $this->closeHandler($parser, 'li');
        }

        // LISTS: we want to know on what nesting level of lists we are
        if (in_array($name, $this->listTags, true)) {
            ++$this->listScope;
        }

        if ($name === 'li') {
            $this->liStack[] = $this->listScope;
        }

        $this->xhtml .= '<' . $name;
        $this->writeAttrs($attrs, $name);
        $this->xhtml .= '>';
        $this->stack[] = $name;
        $this->counter[$name] = isset($this->counter[$name])
            ? ($this->counter[$name] + 1) : 1;

        return true;
    }

    /**
     * Closing tag handler - called from HTMLSax
     *
     * Closes the named tag together with every tag opened after it. A closing
     * tag with no matching open tag is ignored.
     *
     * @param object &$parser HTML parser
     * @param string $name tag name
     *
     * @return boolean always true
     */
    public function closeHandler(&$parser, $name)
    {
        $name = strtolower($name);

        if (isset($this->dcCounter[$name])
            && $this->dcCounter[$name] > 0
            && in_array($name, $this->deleteTagsContent, true)
        ) {
            // Unwind the "deleted content" stack down to the matching opener.
            // The null guard stops the loop if the stack runs dry.
            while (($tag = array_pop($this->dcStack)) !== null && $tag !== $name) {
                --$this->dcCounter[$tag];
            }

            --$this->dcCounter[$name];
        }

        if (count($this->dcStack) !== 0) {
            return true;
        }

        if (isset($this->counter[$name]) && $this->counter[$name] > 0) {
            while (($tag = array_pop($this->stack)) !== null && $tag !== $name) {
                $this->closeTag($tag);
            }

            $this->closeTag($name);
        }

        return true;
    }

    /**
     * Closes tag
     *
     * Writes the end tag (unless listed in $noClose) and updates the tag
     * counter and the list bookkeeping.
     *
     * @param string $tag tag name
     *
     * @return boolean always true
     */
    protected function closeTag($tag)
    {
        if (!in_array($tag, $this->noClose, true)) {
            $this->xhtml .= '</' . $tag . '>';
        }

        --$this->counter[$tag];

        if (in_array($tag, $this->listTags, true)) {
            --$this->listScope;
        }

        if ($tag === 'li') {
            array_pop($this->liStack);
        }

        return true;
    }

    /**
     * Character data handler - called from HTMLSax
     *
     * Text is copied to the output unless it sits inside a tag whose content
     * must be deleted.
     *
     * @param object &$parser HTML parser
     * @param string $data textual data
     *
     * @return boolean always true
     */
    public function dataHandler(&$parser, $data)
    {
        if (count($this->dcStack) === 0) {
            $this->xhtml .= $data;
        }

        return true;
    }

    /**
     * Escape handler - called from HTMLSax
     *
     * Comments and other escaped sections are discarded.
     *
     * @param object &$parser HTML parser
     * @param string $data comments or other type of data
     *
     * @return boolean always true
     */
    public function escapeHandler(&$parser, $data)
    {
        return true;
    }

    /**
     * Allow tags
     *
     * Example:
     * <pre>
     * $safe = new SafeHTML();
     * $safe->setAllowTags(array('body'));
     * </pre>
     *
     * A non-array argument is ignored.
     *
     * @param array $tags Tags to allow
     *
     * @return void
     */
    public function setAllowTags($tags = array())
    {
        if (is_array($tags)) {
            $this->allowTags = $tags;
        }
    }

    /**
     * Returns the allowed tags
     *
     * @return array
     */
    public function getAllowTags()
    {
        return $this->allowTags;
    }

    /**
     * Reset the allowed tags
     *
     * @return void
     */
    public function resetAllowTags()
    {
        $this->allowTags = array();
    }

    /**
     * Returns the XHTML document
     *
     * Any tag still open is closed first.
     *
     * @return string Processed (X)HTML document
     */
    public function getXHTML()
    {
        // Compare against null: a tag literally named "0" is falsy.
        while (($tag = array_pop($this->stack)) !== null) {
            $this->closeTag($tag);
        }

        return $this->xhtml;
    }

    /**
     * Clears current document data
     *
     * Resets the output buffer and all per-document parsing state, so that an
     * unterminated <script>/<style> in one document cannot swallow the next
     * document parsed with the same instance.
     *
     * @return boolean always true
     */
    public function clear()
    {
        $this->xhtml = '';
        $this->stack = array();
        $this->dcStack = array();
        $this->dcCounter = array();
        $this->liStack = array();
        $this->listScope = 0;

        // Zero the counters in place instead of dropping them: the table rule
        // in openHandler() distinguishes "never seen" from "currently zero".
        foreach ($this->counter as $tag => $count) {
            $this->counter[$tag] = 0;
        }

        return true;
    }

    /**
     * Main parsing function
     *
     * Requires the XML_HTMLSax3 class to be loaded.
     *
     * @param string $doc HTML document for processing
     *
     * @return string Processed (X)HTML document
     */
    public function parse($doc)
    {
        // Save all '<' symbols that cannot start a tag
        $doc = preg_replace('/<(?=[^a-zA-Z\/\!\?\%])/', '&lt;', $doc);

        // Web documents shouldn't contain the \x00 symbol
        $doc = str_replace("\x00", '', $doc);

        // Opera6 bug workaround
        $doc = str_replace("\xC0\xBC", '&lt;', $doc);

        if ($this->repackUTF7) {
            // UTF-7 encoding ASCII decode
            $doc = $this->repackUTF7($doc);
        }

        // Instantiate and set up the parser
        $parser = new XML_HTMLSax3();
        $parser->set_object($this);
        $parser->set_element_handler('openHandler', 'closeHandler');
        $parser->set_data_handler('dataHandler');
        $parser->set_escape_handler('escapeHandler');

        $parser->parse($doc);

        $result = $this->getXHTML();

        $this->clear();

        return $result;
    }

    /**
     * UTF-7 decoding function
     *
     * Rewrites every "+<base64>-" run through repackUTF7Callback().
     *
     * @param string $str HTML document for recode ASCII part of UTF-7 back to ASCII
     * @return string Decoded document (the input unchanged if PCRE fails)
     */
    protected function repackUTF7($str)
    {
        $repacked = preg_replace_callback(
            '!\+([0-9a-zA-Z/]+)\-!',
            array($this, 'repackUTF7Callback'),
            $str
        );

        return ($repacked === null) ? $str : $repacked;
    }

    /**
     * Additional UTF-7 decoding function
     *
     * @param array $str preg match; index 1 holds the base64 payload
     * @return string Recoded string
     */
    protected function repackUTF7Callback($str)
    {
        $str = base64_decode($str[1]);
        $str = preg_replace_callback(
            '/^((?:\x00.)*)((?:[^\x00].)+)/',
            array($this, 'repackUTF7Back'),
            $str
        );

        // Byte pairs with a zero high byte are collapsed to the low byte.
        return preg_replace('/\x00(.)/', '$1', $str);
    }

    /**
     * Additional UTF-7 encoding function
     *
     * @param array $str preg match; index 1 is the zero-high-byte prefix,
     *                   index 2 the remainder to re-encode as "+<base64>-"
     * @return string Recoded string
     */
    protected function repackUTF7Back($str)
    {
        return $str[1] . '+' . rtrim(base64_encode($str[2]), '=') . '-';
    }
}