[SPIP] v3.2.11 -> v3.2.12
[lhc/web/www.git] / www / plugins-dist / safehtml / lib / safehtml / classes / safehtml.php
1 <?php
2 /**
3 * SafeHTML Parser
4 *
5 * PHP version 7
6 *
7 * @category HTML
8 * @package SafeHTML
9 * @author Roman Ivanov <thingol@mail.ru>
10 * @author Miguel Vazquez Gocobachi <demrit@mx.gnu.org>
11 * @copyright 2004-2020 Roman Ivanov, Miguel Vazquez Gocobachi, WackoWiki Team
12 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
13 * @version 1.3.12
14 * @link https://wackowiki.org/doc/Dev/Projects/SafeHTML
15 */
16
17 /**
18 * This package requires HTMLSax3 package
19 */
20 require_once(XML_HTMLSAX3 . 'HTMLSax3.php');
21
22 /**
23 * HTML_Safe Parser
24 *
25 * This parser strips down all potentially dangerous content within HTML:
26 * <ul>
27 * <li>opening tag without its closing tag</li>
28 * <li>closing tag without its opening tag</li>
29 * <li>any of these tags: "base", "basefont", "head", "html", "body", "applet",
30 * "object", "iframe", "frame", "frameset", "script", "layer", "ilayer", "embed",
31 * "bgsound", "link", "meta", "style", "title", "blink", "xml" etc.</li>
32 * <li>any of these attributes: on*, data* (HTML5 data-* attributes are kept), dynsrc, id, name</li>
33 * <li>javascript:/vbscript:/about: etc. protocols</li>
34 * <li>expression/behavior etc. in styles</li>
35 * <li>any other active content</li>
36 * </ul>
37 * It also tries to convert code to XHTML valid, but htmltidy is far better
38 * solution for this task.
39 *
40 * <b>Example:</b>
41 * <pre>
42 * $parser = new SafeHTML;
43 * $result = $parser->parse($doc);
44 * </pre>
45 */
46
class SafeHTML
{
    /**
     * Storage for resulting HTML output
     *
     * @var string
     */
    protected $xhtml = '';

    /**
     * Number of currently open (emitted, not yet closed) elements per tag name
     *
     * @var array
     */
    protected $counter = [];

    /**
     * Stack of unclosed tags, in opening order
     *
     * @var array
     */
    protected $stack = [];

    /**
     * Array of counters for tags that must be deleted with all content
     *
     * @var array
     */
    protected $dcCounter = [];

    /**
     * Stack of unclosed tags that must be deleted with all content.
     * While it is non-empty nothing is written to the output.
     *
     * @var array
     */
    protected $dcStack = [];

    /**
     * Stores level of list (ol/ul) nesting
     *
     * @var int
     */
    protected $listScope = 0;

    /**
     * Stack of unclosed <li> tags; each entry is the list nesting level
     * at which the <li> was opened
     *
     * @var array
     */
    protected $liStack = [];

    /**
     * Array of prepared regular expressions for protocols (schemas) matching.
     * Built once in the constructor from $blackProtocols.
     *
     * @var array
     */
    protected $protoRegexps = [];

    /**
     * Array of prepared regular expressions for CSS matching.
     * Built once in the constructor from $cssKeywords.
     *
     * @var array
     */
    protected $cssRegexps = [];

    /**
     * Allowed tags (exceptions to $deleteTags)
     *
     * @var array
     */
    protected $allowTags = [];

    /**
     * List of single tags ("<tag />")
     *
     * @var array
     */
    public $singleTags = ['area', 'br', 'img', 'input', 'hr', 'wbr', ];

    /**
     * List of dangerous tags (such tags will be deleted)
     *
     * @var array
     */
    public $deleteTags = [
        'applet', 'base', 'basefont', 'bgsound', 'blink', 'body',
        'embed', 'frame', 'frameset', 'head', 'html', 'ilayer',
        'iframe', 'layer', 'link', 'meta', 'object', 'style',
        'title', 'script',
    ];

    /**
     * List of dangerous tags (such tags will be deleted, and all content
     * inside these tags will also be removed)
     *
     * @var array
     */
    public $deleteTagsContent = ['script', 'style', 'title', 'xml', ];

    /**
     * Type of protocols filtering ('white' or 'black')
     *
     * @var string
     */
    public $protocolFiltering = 'white';

    /**
     * List of "dangerous" protocols (used for blacklist-filtering)
     *
     * Read only by the constructor: changing it afterwards does not
     * rebuild the prepared regular expressions.
     *
     * @var array
     */
    public $blackProtocols = [
        'about', 'chrome', 'data', 'disk', 'hcp',
        'help', 'javascript', 'livescript', 'lynxcgi', 'lynxexec',
        'ms-help', 'ms-its', 'mhtml', 'mocha', 'opera',
        'res', 'resource', 'shell', 'vbscript', 'view-source',
        'vnd.ms.radio', 'wysiwyg',
    ];

    /**
     * List of "safe" protocols (used for whitelist-filtering)
     *
     * @var array
     */
    public $whiteProtocols = [
        'ed2k', 'file', 'ftp', 'gopher', 'http', 'https',
        'irc', 'mailto', 'news', 'nntp', 'telnet', 'webcal',
        'xmpp', 'callto',
    ];

    /**
     * List of attributes that can contain protocols
     *
     * @var array
     */
    public $protocolAttributes = [
        'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src',
    ];

    /**
     * List of dangerous CSS keywords
     *
     * The whole style="" attribute is removed if the parser finds one of
     * these keywords. Read only by the constructor (see $blackProtocols).
     *
     * @var array
     */
    public $cssKeywords = [
        'absolute', 'behavior', 'behaviour', 'content', 'expression',
        'fixed', 'include-source', 'moz-binding',
    ];

    /**
     * List of tags that can have no "closing tag"
     *
     * @var array
     * @deprecated XHTML does not allow such tags
     */
    public $noClose = [];

    /**
     * List of block-level tags that terminate a paragraph
     *
     * An open paragraph is closed when one of these tags is opened
     *
     * @var array
     */
    public $closeParagraph = [
        'address', 'article', 'aside', 'blockquote', 'details', 'div',
        'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'header', 'hgroup', 'hr', 'main', 'menu', 'nav',
        'ol', 'p', 'pre', 'section', 'table', 'ul',
    ];

    /**
     * List of table tags, all table tags outside a table will be removed
     *
     * @var array
     */
    public $tableTags = [
        'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
        'thead', 'tr',
    ];

    /**
     * List of list tags
     *
     * @var array
     */
    public $listTags = ['menu', 'ol', 'ul', 'dl', ];

    /**
     * List of dangerous attributes (always removed)
     *
     * @var array
     */
    public $attributes = ['dynsrc', 'id', 'name', ];

    /**
     * List of allowed "namespaced" attributes
     *
     * @var array
     */
    public $attributesNS = ['xml:lang', ];

    /**
     * Prepares the protocol and CSS regular expressions from
     * $blackProtocols and $cssKeywords.
     */
    public function __construct()
    {
        // Whitespace / control characters tolerated between the letters of a
        // scheme name, so that e.g. "java\tscript:" is still recognised.
        $gap = "[\s\x01-\x1F]*";

        foreach ($this->blackProtocols as $proto)
        {
            $pattern = '/' . $gap;

            foreach (str_split($proto) as $char)
            {
                // quote every character: "vnd.ms.radio" must match a literal dot
                $pattern .= preg_quote($char, '/') . $gap;
            }

            $this->protoRegexps[] = $pattern . ':/i';
        }

        foreach ($this->cssKeywords as $keyword)
        {
            $this->cssRegexps[] = '/' . preg_quote($keyword, '/') . '/i';
        }
    }

    /**
     * Tells whether the subject matches at least one of the given patterns
     *
     * @param array  $patterns list of PCRE patterns
     * @param string $subject  string to test
     *
     * @return boolean
     */
    private function matchesAny(array $patterns, $subject)
    {
        foreach ($patterns as $pattern)
        {
            if (preg_match($pattern, $subject))
            {
                return true;
            }
        }

        return false;
    }

    /**
     * Converts the digits of a numeric character reference to a UTF-8 string
     *
     * Out-of-range values and surrogates yield U+FFFD instead of wrapping
     * around modulo 256 or overflowing the integer conversion.
     *
     * @param string $digits digits of the reference (without "&#", "x" or ";")
     * @param int    $base   10 or 16
     *
     * @return string
     */
    private function charFromDigits($digits, $base)
    {
        $replacement = "\xEF\xBF\xBD"; // U+FFFD
        $digits      = ltrim($digits, '0');

        // 0x10FFFF needs at most 7 digits in either base; longer values are
        // invalid and must not reach the integer conversion
        if (strlen($digits) > 7)
        {
            return $replacement;
        }

        $codePoint = intval($digits, $base);

        if ($codePoint > 0x10FFFF || ($codePoint >= 0xD800 && $codePoint <= 0xDFFF))
        {
            return $replacement;
        }

        if ($codePoint < 0x80)
        {
            return chr($codePoint);
        }

        if ($codePoint < 0x800)
        {
            return chr(0xC0 | ($codePoint >> 6))
                . chr(0x80 | ($codePoint & 0x3F));
        }

        if ($codePoint < 0x10000)
        {
            return chr(0xE0 | ($codePoint >> 12))
                . chr(0x80 | (($codePoint >> 6) & 0x3F))
                . chr(0x80 | ($codePoint & 0x3F));
        }

        return chr(0xF0 | ($codePoint >> 18))
            . chr(0x80 | (($codePoint >> 12) & 0x3F))
            . chr(0x80 | (($codePoint >> 6) & 0x3F))
            . chr(0x80 | ($codePoint & 0x3F));
    }

    /**
     * Decodes decimal, then hexadecimal, numeric character references
     *
     * The result is only used to inspect the protocol of an attribute
     * value; it is never written to the output.
     *
     * @param string $value raw attribute value
     *
     * @return string
     */
    private function decodeNumericEntities($value)
    {
        $decoded = preg_replace_callback(
            '/&#(\d+);?/m',
            function ($matches) { return $this->charFromDigits($matches[1], 10); },
            $value
        );

        return preg_replace_callback(
            '/&#x([0-9a-f]+);?/mi',
            function ($matches) { return $this->charFromDigits($matches[1], 16); },
            $decoded
        );
    }

    /**
     * Handles the writing of attributes - called from $this->openHandler()
     *
     * Attributes that are considered dangerous are skipped silently; the
     * remaining ones are appended to the output as name="value".
     *
     * @param array $attrs array of attributes $name => $value
     *
     * @return boolean
     */
    protected function writeAttrs($attrs)
    {
        if (!is_array($attrs))
        {
            return true;
        }

        foreach ($attrs as $name => $value)
        {
            // numeric attribute names arrive as integer array keys
            $name = strtolower((string) $name);

            // event handlers: on*
            if (strpos($name, 'on') === 0)
            {
                continue;
            }

            // SPIP modification: keep HTML5 data-xx attributes
            if (in_array($name, $this->attributes, true))
            {
                continue;
            }

            // remove dataxx attributes but not the html5 data-xx ones
            if (strpos($name, 'data') === 0)
            {
                if (strpos($name, 'data-') !== 0 || !preg_match('/^[a-z0-9-]+$/i', $name))
                {
                    continue;
                }
            }
            elseif (!preg_match('/^[a-z0-9]+$/i', $name)
                && !in_array($name, $this->attributesNS, true)
            )
            {
                continue;
            }
            // end of SPIP modification

            // valueless attribute (<input disabled>) becomes disabled="disabled"
            if (($value === true) || ($value === null))
            {
                $value = $name;
            }

            if ($name === 'style')
            {
                // removes insignificant backslashes
                $value = str_replace('\\', '', $value);

                // removes CSS comments; repeated because removing one comment
                // can bring the two halves of another one together
                do
                {
                    $previous = $value;
                    $value    = preg_replace('!/\*.*?\*/!s', '', $previous);

                    if ($value === null)
                    {
                        // PCRE failure: the style cannot be vetted, drop it
                        continue 2;
                    }
                }
                while ($value !== $previous);

                // replace all & with &amp; (without double-encoding &amp;)
                $value = str_replace('&amp;', '&', $value);
                $value = str_replace('&', '&amp;', $value);

                if ($this->matchesAny($this->cssRegexps, $value)
                    || $this->matchesAny($this->protoRegexps, $value)
                )
                {
                    continue;
                }
            }

            $decoded = $this->decodeNumericEntities($value);

            if (in_array($name, $this->protocolAttributes, true)
                && (strpos($decoded, ':') !== false)
            )
            {
                if ($this->protocolFiltering === 'black')
                {
                    if ($this->matchesAny($this->protoRegexps, $decoded))
                    {
                        continue;
                    }
                }
                else
                {
                    // URI schemes are case-insensitive: "HTTP:" is "http:"
                    $scheme  = strtolower(explode(':', $decoded, 2)[0]);
                    $allowed = array_map('strtolower', $this->whiteProtocols);

                    if (!in_array($scheme, $allowed, true))
                    {
                        continue;
                    }
                }
            }

            $value = str_replace("\"", '&quot;', $value);
            $this->xhtml .= ' ' . $name . '="' . $value . '"';
        }

        return true;
    }

    /**
     * Opening tag handler - called from HTMLSax
     *
     * @param object &$parser HTML Parser
     * @param string $name    tag name
     * @param array  $attrs   tag attributes
     *
     * @return boolean
     */
    public function openHandler(&$parser, $name, $attrs)
    {
        $name = strtolower($name);

        if (in_array($name, $this->deleteTagsContent, true))
        {
            $this->dcStack[]        = $name;
            $this->dcCounter[$name] = ($this->dcCounter[$name] ?? 0) + 1;
        }

        // inside a tag whose whole content is deleted: emit nothing
        if (count($this->dcStack) !== 0)
        {
            return true;
        }

        if (in_array($name, $this->deleteTags, true)
            && !in_array($name, $this->allowTags, true)
        )
        {
            return true;
        }

        if (!preg_match('/^[a-z0-9]+$/i', $name))
        {
            // looks like <user@host> or <scheme://...>: show it as text
            if (preg_match('!(?:\@|://)!i', $name))
            {
                // the name is echoed as text, so a stray "<" in it is escaped too
                $this->xhtml .= '&lt;' . str_replace('<', '&lt;', $name) . '&gt;';
            }

            return true;
        }

        if (in_array($name, $this->singleTags, true))
        {
            $this->xhtml .= '<' . $name;
            $this->writeAttrs($attrs);
            $this->xhtml .= ' />';

            return true;
        }

        // TABLES: cannot open table elements when we are not inside table.
        // A missing counter means no table is open, exactly like a zero one.
        if (in_array($name, $this->tableTags, true)
            && (($this->counter['table'] ?? 0) <= 0)
        )
        {
            return true;
        }

        // PARAGRAPHS: close paragraph when closeParagraph tags opening
        if (in_array($name, $this->closeParagraph, true)
            && in_array('p', $this->stack, true)
        )
        {
            $this->closeHandler($parser, 'p');
        }

        // LISTS: we should close <li> if <li> of the same level opening
        if (($name === 'li') && count($this->liStack)
            && ($this->listScope == $this->liStack[count($this->liStack) - 1])
        )
        {
            $this->closeHandler($parser, 'li');
        }

        // LISTS: we want to know on what nesting level of lists we are
        if (in_array($name, $this->listTags, true))
        {
            ++$this->listScope;
        }

        if ($name === 'li')
        {
            $this->liStack[] = $this->listScope;
        }

        $this->xhtml .= '<' . $name;
        $this->writeAttrs($attrs);
        $this->xhtml .= '>';

        $this->stack[]        = $name;
        $this->counter[$name] = ($this->counter[$name] ?? 0) + 1;

        return true;
    }

    /**
     * Closing tag handler - called from HTMLSax
     *
     * Closing a tag also closes every tag opened after it that is still
     * open. A closing tag without a matching open tag is ignored.
     *
     * @param object &$parser HTML parser
     * @param string $name    tag name
     *
     * @return boolean
     */
    public function closeHandler(&$parser, $name)
    {
        $name = strtolower($name);

        if ((($this->dcCounter[$name] ?? 0) > 0)
            && in_array($name, $this->deleteTagsContent, true)
        )
        {
            // the null guard keeps the loop finite even on an empty stack
            while ((($tag = array_pop($this->dcStack)) !== null) && ($tag !== $name))
            {
                --$this->dcCounter[$tag];
            }

            --$this->dcCounter[$name];
        }

        if (count($this->dcStack) !== 0)
        {
            return true;
        }

        if (($this->counter[$name] ?? 0) > 0)
        {
            while ((($tag = array_pop($this->stack)) !== null) && ($tag !== $name))
            {
                $this->closeTag($tag);
            }

            $this->closeTag($name);
        }

        return true;
    }

    /**
     * Writes a closing tag and updates the tag / list bookkeeping
     *
     * @param string $tag tag name
     *
     * @return boolean
     */
    protected function closeTag($tag)
    {
        if (!in_array($tag, $this->noClose, true))
        {
            $this->xhtml .= '</' . $tag . '>';
        }

        --$this->counter[$tag];

        if (in_array($tag, $this->listTags, true))
        {
            --$this->listScope;
        }

        if ($tag === 'li')
        {
            array_pop($this->liStack);
        }

        return true;
    }

    /**
     * Character data handler - called from HTMLSax
     *
     * Data is copied to the output unless it is inside a tag whose
     * content is deleted.
     *
     * @param object &$parser HTML parser
     * @param string $data    textual data
     *
     * @return boolean
     */
    public function dataHandler(&$parser, $data)
    {
        if (count($this->dcStack) === 0)
        {
            $this->xhtml .= $data;
        }

        return true;
    }

    /**
     * Escape handler - called from HTMLSax
     *
     * Escaped data (comments and the like) is discarded.
     *
     * @param object &$parser HTML parser
     * @param string $data    comments or other type of data
     *
     * @return boolean
     */
    public function escapeHandler(&$parser, $data)
    {
        return true;
    }

    /**
     * Allow tags
     *
     * Example:
     * <pre>
     * $safe = new SafeHTML;
     * $safe->setAllowTags(['body']);
     * </pre>
     *
     * @param array $tags Tags to allow; a non-array argument is ignored
     *
     * @return void
     */
    public function setAllowTags($tags = [])
    {
        if (is_array($tags))
        {
            $this->allowTags = $tags;
        }
    }

    /**
     * Returns the allowed tags
     *
     * @return array
     */
    public function getAllowTags()
    {
        return $this->allowTags;
    }

    /**
     * Reset the allowed tags
     *
     * @return void
     */
    public function resetAllowTags()
    {
        $this->allowTags = [];
    }

    /**
     * Closes every tag that is still open and returns the XHTML document
     *
     * @return string Processed (X)HTML document
     */
    public function getXHTML()
    {
        // compare against null: a tag named "0" is falsy but still a tag
        while (($tag = array_pop($this->stack)) !== null)
        {
            $this->closeTag($tag);
        }

        return $this->xhtml;
    }

    /**
     * Clears current document data
     *
     * Resets the output AND all parsing state, so that leftovers of one
     * document (e.g. an unclosed <script>) cannot silence the next one
     * when the instance is reused.
     *
     * @return boolean
     */
    public function clear()
    {
        $this->xhtml     = '';
        $this->counter   = [];
        $this->stack     = [];
        $this->dcCounter = [];
        $this->dcStack   = [];
        $this->listScope = 0;
        $this->liStack   = [];

        return true;
    }

    /**
     * Main parsing function
     *
     * @param string $doc HTML document for processing
     *
     * @return string Processed (X)HTML document
     */
    public function parse($doc)
    {
        // Save all '<' symbols that cannot start a tag
        $doc = preg_replace('/<(?=[^a-zA-Z\/\!\?\%])/', '&lt;', $doc);

        // UTF7 pack
        $doc = $this->repackUTF7($doc);

        // Instantiate and set up the parser
        $parser = new XML_HTMLSax3;
        $parser->set_object($this);
        $parser->set_element_handler('openHandler', 'closeHandler');
        $parser->set_data_handler('dataHandler');
        $parser->set_escape_handler('escapeHandler');

        try
        {
            $parser->parse($doc);

            return $this->getXHTML();
        }
        finally
        {
            // leave the instance reusable even if the parser threw
            $this->clear();
        }
    }

    /**
     * UTF-7 decoding function
     *
     * Every "+<base64>-" sequence is handed to repackUTF7Callback().
     *
     * @param string $str HTML document for recode ASCII part of UTF-7 back to ASCII
     *
     * @return string Decoded document
     */
    public function repackUTF7($str)
    {
        return preg_replace_callback('!\+([0-9a-zA-Z/]+)\-!', [$this, 'repackUTF7Callback'], $str);
    }

    /**
     * Additional UTF-7 decoding function (preg_replace_callback callback)
     *
     * @param array $str match array: [0] whole "+...-" sequence, [1] base64 payload
     *
     * @return string Recoded string
     */
    public function repackUTF7Callback($str)
    {
        $decoded = base64_decode($str[1]);

        // An ASCII character is a NUL byte followed by the character once
        // decoded. Without any NUL byte nothing ASCII is hidden in the
        // sequence, so ordinary text such as "+33-" is left untouched.
        if (($decoded === false) || (strpos($decoded, "\x00") === false))
        {
            return $str[0] ?? ('+' . $str[1] . '-');
        }

        // "s" modifier: the second byte of a pair may be a newline
        $decoded = preg_replace_callback(
            '/^((?:\x00.)*)((?:[^\x00].)+)/s',
            [$this, 'repackUTF7Back'],
            $decoded
        );

        return preg_replace('/\x00(.)/s', '$1', $decoded);
    }

    /**
     * Additional UTF-7 encoding function (preg_replace_callback callback)
     *
     * Re-encodes the non-ASCII part of a decoded sequence back to UTF-7.
     *
     * @param array $str match array: [1] leading ASCII pairs, [2] non-ASCII pairs
     *
     * @return string Recoded string
     */
    public function repackUTF7Back($str)
    {
        return $str[1] . '+' . rtrim(base64_encode($str[2]), '=') . '-';
    }
}
755