9 * @author Roman Ivanov <thingol@mail.ru>
10 * @author Miguel Vazquez Gocobachi <demrit@mx.gnu.org>
11 * @copyright 2004-2020 Roman Ivanov, Miguel Vazquez Gocobachi, WackoWiki Team
12 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
14 * @link https://wackowiki.org/doc/Dev/Projects/SafeHTML
18 * This package requires HTMLSax3 package
20 require_once(XML_HTMLSAX3
. 'HTMLSax3.php');
25 * This parser strips down all potentially dangerous content within HTML:
27 * <li>opening tag without its closing tag</li>
28 * <li>closing tag without its opening tag</li>
29 * <li>any of these tags: "base", "basefont", "head", "html", "body", "applet",
30 * "object", "iframe", "frame", "frameset", "script", "layer", "ilayer", "embed",
31 * "bgsound", "link", "meta", "style", "title", "blink", "xml" etc.</li>
32 * <li>any of these attributes: on*, data*, dynsrc</li>
33 * <li>javascript:/vbscript:/about: etc. protocols</li>
34 * <li>expression/behavior etc. in styles</li>
35 * <li>any other active content</li>
37 * It also tries to convert the code to valid XHTML, but htmltidy is a far
38 * better solution for this task.
42 * $parser = new SafeHTML;
43 * $result = $parser->parse($doc);
50 * Storage for resulting HTML output
54 protected $xhtml = '';
57 * Array of counters for each tag
61 protected $counter = [];
64 * Stack of unclosed tags
68 protected $stack = [];
71 * Array of counters for tags that must be deleted with all content
75 protected $dcCounter = [];
78 * Stack of unclosed tags that must be deleted with all content
82 protected $dcStack = [];
85 * Stores level of list (ol/ul) nesting
89 protected $listScope = 0;
92 * Stack of unclosed list tags
96 protected $liStack = [];
99 * Array of prepared regular expressions for protocols (schemas) matching
103 protected $protoRegexps = [];
106 * Array of prepared regular expressions for CSS matching
110 protected $cssRegexps = [];
117 protected $allowTags = [];
121 * List of single tags ("<tag>")
125 public $singleTags = ['area', 'br', 'img', 'input', 'hr', 'wbr', ];
128 * List of dangerous tags (such tags will be deleted)
132 public $deleteTags = [
133 'applet', 'base', 'basefont', 'bgsound', 'blink', 'body',
134 'embed', 'frame', 'frameset', 'head', 'html', 'ilayer',
135 'iframe', 'layer', 'link', 'meta', 'object', 'style',
140 * List of dangerous tags (such tags will be deleted, and all content
141 * inside these tags will also be removed)
145 public $deleteTagsContent = ['script', 'style', 'title', 'xml', ];
148 * Type of protocols filtering ('white' or 'black')
152 public $protocolFiltering = 'white';
155 * List of "dangerous" protocols (used for blacklist-filtering)
159 public $blackProtocols = [
160 'about', 'chrome', 'data', 'disk', 'hcp',
161 'help', 'javascript', 'livescript', 'lynxcgi', 'lynxexec',
162 'ms-help', 'ms-its', 'mhtml', 'mocha', 'opera',
163 'res', 'resource', 'shell', 'vbscript', 'view-source',
164 'vnd.ms.radio', 'wysiwyg',
168 * List of "safe" protocols (used for whitelist-filtering)
172 public $whiteProtocols = [
173 'ed2k', 'file', 'ftp', 'gopher', 'http', 'https',
174 'irc', 'mailto', 'news', 'nntp', 'telnet', 'webcal',
179 * List of attributes that can contain protocols
183 public $protocolAttributes = [
184 'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src',
188 * List of dangerous CSS keywords
190 * Whole style="" attribute will be removed, if parser will find one of
195 public $cssKeywords = [
196 'absolute', 'behavior', 'behaviour', 'content', 'expression',
197 'fixed', 'include-source', 'moz-binding',
201 * List of tags that can have no "closing tag"
204 * @deprecated XHTML does not allow such tags
206 public $noClose = [];
209 * List of block-level tags that terminate a paragraph
211 * An open paragraph is closed when one of these tags is opened
215 public $closeParagraph = [
216 'address', 'article', 'aside', 'blockquote', 'details', 'div',
217 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
218 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
219 'header', 'hgroup', 'hr', 'main', 'menu', 'nav',
220 'ol', 'p', 'pre', 'section', 'table', 'ul',
224 * List of table tags, all table tags outside a table will be removed
228 public $tableTags = [
229 'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
238 public $listTags = ['menu', 'ol', 'ul', 'dl', ];
241 * List of dangerous attributes
245 public $attributes = ['dynsrc', 'id', 'name', ];
248 * List of allowed "namespaced" attributes
252 public $attributesNS = ['xml:lang', ];
/**
 * Prepares the regular expressions used while filtering attributes.
 *
 * One pattern is built for every entry of $blackProtocols (stored in
 * $protoRegexps) and one for every entry of $cssKeywords (stored in
 * $cssRegexps). The patterns are built once, from the property values
 * present at construction time.
 *
 * NOTE(review): the statement that terminates each protocol pattern is not
 * visible in the reviewed listing; ":/i" is used here because the class
 * description speaks of "javascript:/vbscript:/about:" protocols. Confirm
 * against the upstream source.
 */
public function __construct()
{
    // Whitespace and control characters may be interleaved with the letters
    // of a scheme name, so this class is allowed before and after each one.
    $filler = "[\s\x01-\x1F]*";

    foreach ($this->blackProtocols as $proto) {
        $preg = '/' . $filler;

        foreach (str_split($proto) as $char) {
            // Quote each character so that e.g. "." in "vnd.ms.radio" is
            // matched literally instead of as "any character".
            $preg .= preg_quote($char, '/') . $filler;
        }

        $this->protoRegexps[] = $preg . ':/i';
    }

    foreach ($this->cssKeywords as $css) {
        $this->cssRegexps[] = '/' . preg_quote($css, '/') . '/i';
    }
}
284 * Handles the writing of attributes - called from $this->openHandler()
286 * @param array $attrs array of attributes $name => $value
/**
 * Writes the attributes of the tag being opened to the output buffer.
 *
 * Called from openHandler(). An attribute is silently dropped when:
 *  - its name starts with "on" or is listed in $attributes;
 *  - its name starts with "data" but is not a well-formed "data-*" name;
 *  - its name contains characters other than letters and digits and is not
 *    listed in $attributesNS;
 *  - it is a style attribute matching one of $cssRegexps or $protoRegexps;
 *  - it is listed in $protocolAttributes, contains ":" and its protocol is
 *    rejected by the configured filtering mode.
 *
 * NOTE(review): the reviewed listing omitted brace-only and very short
 * lines; the "continue" statements, the default for value-less attributes
 * and the HTML entities in the replacement strings were reinstated from
 * context. Confirm against the upstream source.
 *
 * @param array $attrs attributes as $name => $value
 *
 * @return bool always true
 */
protected function writeAttrs($attrs)
{
    if (!is_array($attrs)) {
        return true;
    }

    foreach ($attrs as $name => $value) {
        $name = strtolower((string) $name);

        // Event handlers (on*) and explicitly listed attributes are removed.
        if (strpos($name, 'on') === 0 || in_array($name, $this->attributes, true)) {
            continue;
        }

        if (strpos($name, 'data') === 0) {
            // Remove "dataxx" attributes but keep HTML5 "data-xx" ones.
            if (strpos($name, 'data-') !== 0 || !preg_match('/^[a-z0-9-]+$/i', $name)) {
                continue;
            }
        } elseif (!preg_match('/^[a-z0-9]+$/i', $name)
            && !in_array($name, $this->attributesNS, true)
        ) {
            continue;
        }

        // Attribute given without a value: repeat the name as the value.
        if ($value === true || $value === null) {
            $value = $name;
        }

        if ($name === 'style') {
            // Remove insignificant backslashes.
            $value = str_replace('\\', '', $value);

            // Remove CSS comments; repeat because removing one comment can
            // join the surrounding text into a new one.
            do {
                $before = $value;
                $value  = (string) preg_replace('!/\*.*?\*/!s', '', $value);
            } while ($value !== $before);

            // Encode every "&" exactly once: decode existing entities first
            // so that "&amp;" does not become "&amp;amp;".
            $value = str_replace('&amp;', '&', $value);
            $value = str_replace('&', '&amp;', $value);

            if ($this->matchesAnyPattern($this->cssRegexps, $value)
                || $this->matchesAnyPattern($this->protoRegexps, $value)
            ) {
                continue;
            }
        }

        // Decode numeric character references so that an obfuscated scheme
        // such as "&#106;avascript:" is still recognised below. intval()
        // always yields an int (saturating on overflow), so an oversized
        // reference cannot make chr() fail.
        $tempval = preg_replace_callback(
            '/&#(\d+);?/m',
            static function ($matches) {
                return chr(intval($matches[1], 10));
            },
            $value
        );
        $tempval = (string) preg_replace_callback(
            '/&#x([0-9a-f]+);?/mi',
            static function ($matches) {
                return chr(intval($matches[1], 16));
            },
            (string) $tempval
        );

        if (in_array($name, $this->protocolAttributes, true)
            && strpos($tempval, ':') !== false
        ) {
            if ($this->protocolFiltering == 'black') {
                if ($this->matchesAnyPattern($this->protoRegexps, $tempval)) {
                    continue;
                }
            } else {
                // Whitelist mode: everything before the first ":" must be a
                // known-safe protocol.
                $parts = explode(':', $tempval);

                if (!in_array($parts[0], $this->whiteProtocols, true)) {
                    continue;
                }
            }
        }

        // The value is emitted inside double quotes, so escape them.
        $value = str_replace('"', '&quot;', $value);
        $this->xhtml .= ' ' . $name . '="' . $value . '"';
    }

    return true;
}

/**
 * Tells whether at least one of the given patterns matches the subject.
 *
 * @param array  $patterns list of complete PCRE patterns
 * @param string $subject  text to test
 *
 * @return bool
 */
private function matchesAnyPattern(array $patterns, $subject)
{
    foreach ($patterns as $pattern) {
        if (preg_match($pattern, $subject)) {
            return true;
        }
    }

    return false;
}
411 * Opening tag handler - called from HTMLSax
413 * @param object &$parser HTML Parser
414 * @param string $name tag name
415 * @param array $attrs tag attributes
/**
 * Opening tag handler - called from HTMLSax.
 *
 * Decides whether the tag is dropped, shown as text, written as a
 * self-closed single tag, or written as a regular opening tag and pushed
 * on the stack of unclosed tags.
 *
 * NOTE(review): the reviewed listing omitted brace-only and very short
 * lines; the early returns, the $listScope increment, the "li" condition
 * and the HTML entities in the pseudo-tag branch were reinstated from
 * context. Confirm against the upstream source.
 *
 * @param object &$parser HTML parser
 * @param string $name    tag name
 * @param array  $attrs   tag attributes
 *
 * @return bool always true
 */
public function openHandler(&$parser, $name, $attrs)
{
    $name = strtolower($name);

    // Tags whose whole content is removed are tracked separately.
    if (in_array($name, $this->deleteTagsContent, true)) {
        $this->dcStack[] = $name;
        $this->dcCounter[$name] = isset($this->dcCounter[$name])
            ? $this->dcCounter[$name] + 1
            : 1;
    }

    // Nothing is written while inside a content-deleted tag.
    if (count($this->dcStack) != 0) {
        return true;
    }

    if (in_array($name, $this->deleteTags, true)
        && !in_array($name, $this->allowTags, true)
    ) {
        return true;
    }

    if (!preg_match('/^[a-z0-9]+$/i', $name)) {
        // Not a real tag name. Things that look like an address or a URL
        // in angle brackets are kept, but as escaped text, never as markup.
        if (preg_match('!(?:\@|://)!i', $name)) {
            $this->xhtml .= '&lt;'
                . str_replace(['<', '>'], ['&lt;', '&gt;'], $name)
                . '&gt;';
        }

        return true;
    }

    if (in_array($name, $this->singleTags, true)) {
        $this->xhtml .= '<' . $name;
        $this->writeAttrs($attrs);
        $this->xhtml .= ' />';

        return true;
    }

    // TABLES: cannot open table elements when we are not inside a table.
    if (isset($this->counter['table'])
        && $this->counter['table'] <= 0
        && in_array($name, $this->tableTags, true)
    ) {
        return true;
    }

    // PARAGRAPHS: close the open paragraph when a block-level tag opens.
    if (in_array($name, $this->closeParagraph, true)
        && in_array('p', $this->stack, true)
    ) {
        $this->closeHandler($parser, 'p');
    }

    // LISTS: close the previous <li> when an <li> of the same level opens.
    if ($name == 'li'
        && count($this->liStack)
        && $this->listScope == $this->liStack[count($this->liStack) - 1]
    ) {
        $this->closeHandler($parser, 'li');
    }

    // LISTS: keep track of the current list nesting level.
    if (in_array($name, $this->listTags, true)) {
        ++$this->listScope;
    }

    if ($name == 'li') {
        $this->liStack[] = $this->listScope;
    }

    $this->xhtml .= '<' . $name;
    $this->writeAttrs($attrs);
    $this->xhtml .= '>';

    $this->stack[] = $name;
    $this->counter[$name] = isset($this->counter[$name])
        ? $this->counter[$name] + 1
        : 1;

    return true;
}
510 * Closing tag handler - called from HTMLSax
512 * @param object &$parser HTML parser
513 * @param string $name tag name
/**
 * Closing tag handler - called from HTMLSax.
 *
 * Unwinds the stack of content-deleted tags and/or the stack of regular
 * unclosed tags down to the tag being closed. A closing tag that has no
 * matching open tag is ignored.
 *
 * @param object &$parser HTML parser
 * @param string $name    tag name
 *
 * @return bool always true
 */
public function closeHandler(&$parser, $name)
{
    $name = strtolower($name);

    $closesDeletedTag = isset($this->dcCounter[$name])
        && ($this->dcCounter[$name] > 0)
        && in_array($name, $this->deleteTagsContent);

    if ($closesDeletedTag) {
        // Discard every content-deleted tag opened after this one.
        for ($tag = array_pop($this->dcStack); $name != $tag; $tag = array_pop($this->dcStack)) {
            --$this->dcCounter[$tag];
        }

        --$this->dcCounter[$name];
    }

    // Still inside a content-deleted tag: nothing to write.
    if (count($this->dcStack) != 0) {
        return true;
    }

    $isOpen = isset($this->counter[$name]) && ($this->counter[$name] > 0);

    if ($isOpen) {
        // Close every tag opened after this one, then the tag itself.
        for ($tag = array_pop($this->stack); $name != $tag; $tag = array_pop($this->stack)) {
            $this->closeTag($tag);
        }

        $this->closeTag($name);
    }

    return true;
}
555 * @param string $tag tag name
/**
 * Writes a closing tag and updates the bookkeeping for it.
 *
 * NOTE(review): the $listScope decrement and the "li" condition are not
 * visible in the reviewed listing and were reinstated as the counterpart
 * of what openHandler() does. Confirm against the upstream source.
 *
 * @param string $tag tag name
 *
 * @return bool always true
 */
protected function closeTag($tag)
{
    $mayBeClosed = !in_array($tag, $this->noClose);

    if ($mayBeClosed) {
        $this->xhtml .= '</' . $tag . '>';
    }

    --$this->counter[$tag];

    // Leaving a list lowers the nesting level.
    if (in_array($tag, $this->listTags)) {
        --$this->listScope;
    }

    // Leaving a list item forgets the level it was opened at.
    if ($tag == 'li') {
        array_pop($this->liStack);
    }

    return true;
}
582 * Character data handler - called from HTMLSax
584 * @param object &$parser HTML parser
585 * @param string $data textual data
/**
 * Character data handler - called from HTMLSax.
 *
 * Text is appended to the output unless it is inside a tag whose content
 * must be deleted.
 *
 * @param object &$parser HTML parser
 * @param string $data    textual data
 *
 * @return bool always true
 */
public function dataHandler(&$parser, $data)
{
    $insideDeletedContent = count($this->dcStack) != 0;

    if (!$insideDeletedContent) {
        $this->xhtml .= $data;
    }

    return true;
}
600 * Escape handler - called from HTMLSax
602 * @param object &$parser HTML parser
603 * @param string $data comments or other type of data
607 public function escapeHandler(&$parser, $data)
617 * $safe = new SafeHTML;
618 * $safe->setAllowTags(['body']);
621 * @param array $tags Tags to allow
/**
 * Sets the tags that are allowed even though they are listed in
 * $deleteTags.
 *
 * Non-array input is ignored. Names are stored lower-cased because
 * openHandler() compares them against lower-cased tag names; entries that
 * are not strings are skipped.
 *
 * @param array $tags Tags to allow
 *
 * @return void
 */
public function setAllowTags($tags = [])
{
    if (!is_array($tags)) {
        return;
    }

    $this->allowTags = array_map('strtolower', array_filter($tags, 'is_string'));
}
634 * Returns the allowed tags
/**
 * Returns the tags currently allowed in spite of $deleteTags.
 *
 * @return array
 */
public function getAllowTags()
{
    $allowed = $this->allowTags;

    return $allowed;
}
644 * Reset the allowed tags
/**
 * Empties the list of allowed tags.
 *
 * @return void
 */
public function resetAllowTags()
{
    $none = [];

    $this->allowTags = $none;
}
654 * Returns the XHTML document
656 * @return string Processed (X)HTML document
/**
 * Returns the XHTML document.
 *
 * Every tag still on the stack of unclosed tags is closed first, innermost
 * tag first.
 *
 * @return string Processed (X)HTML document
 */
public function getXHTML()
{
    // Compare against null explicitly: a plain truthiness test would stop
    // at a tag named "0" and leave the remaining tags unclosed.
    while (($tag = array_pop($this->stack)) !== null) {
        $this->closeTag($tag);
    }

    return $this->xhtml;
}
669 * Clears current document data
673 public function clear()
681 * Main parsing function
683 * @param string $doc HTML document for processing
685 * @return string Processed (X)HTML document
/**
 * Main parsing function.
 *
 * Runs the document through XML_HTMLSax3 with this object as the handler
 * and returns the filtered result.
 *
 * NOTE(review): the lines after getXHTML() are not visible in the reviewed
 * listing; calling clear() and returning the result were reinstated from
 * the documented return value. Confirm against the upstream source.
 *
 * @param string $doc HTML document for processing
 *
 * @return string Processed (X)HTML document
 */
public function parse($doc)
{
    // Start every document with clean bookkeeping. Without this, a tag
    // from $deleteTagsContent left unclosed by a previous document stays on
    // $dcStack and suppresses all output of the next one.
    $this->counter   = [];
    $this->stack     = [];
    $this->dcCounter = [];
    $this->dcStack   = [];
    $this->listScope = 0;
    $this->liStack   = [];

    // Save all '<' symbols that cannot start a tag by turning them into
    // the entity.
    $doc = preg_replace('/<(?=[^a-zA-Z\/\!\?\%])/', '&lt;', $doc);

    // Recode the ASCII part of UTF-7 sequences back to ASCII.
    $doc = $this->repackUTF7($doc);

    // Instantiate the parser and register this object as its handler.
    $parser = new XML_HTMLSax3();
    $parser->set_object($this);

    $parser->set_element_handler('openHandler', 'closeHandler');
    $parser->set_data_handler('dataHandler');
    $parser->set_escape_handler('escapeHandler');

    $parser->parse($doc);

    $result = $this->getXHTML();

    $this->clear();

    return $result;
}
717 * UTF-7 decoding function
719 * @param string $str HTML document for recode ASCII part of UTF-7 back to ASCII
720 * @return string Decoded document
/**
 * UTF-7 decoding function.
 *
 * Every "+...-" sequence made of base64 characters is handed to
 * repackUTF7Callback() and replaced by its result.
 *
 * @param string $str HTML document in which the ASCII part of UTF-7 is
 *                    recoded back to ASCII
 *
 * @return string Decoded document
 */
function repackUTF7($str)
{
    $shiftedSequence = '!\+([0-9a-zA-Z/]+)\-!';

    $recode = function ($match) {
        return $this->repackUTF7Callback($match);
    };

    return preg_replace_callback($shiftedSequence, $recode, $str);
}
729 * Additional UTF-7 decoding function
731 * @param string $str String for recode ASCII part of UTF-7 back to ASCII
732 * @return string Recoded string
/**
 * Additional UTF-7 decoding function.
 *
 * Used as a preg_replace_callback() callback by repackUTF7(), so the
 * argument is the array of matches; element 1 is the base64 payload.
 *
 * @param array $str Matches of one "+...-" sequence
 *
 * @return string Recoded string
 */
function repackUTF7Callback($str)
{
    $decoded = base64_decode($str[1]);

    // A leading run of "\x00 + byte" pairs followed by pairs whose first
    // byte is not \x00: the second part is re-encoded by repackUTF7Back().
    $reencode = function ($match) {
        return $this->repackUTF7Back($match);
    };
    $decoded = preg_replace_callback('/^((?:\x00.)*)((?:[^\x00].)+)/', $reencode, $decoded);

    // Drop the \x00 in front of each remaining byte.
    return preg_replace('/\x00(.)/', '$1', $decoded);
}
744 * Additional UTF-7 encoding function
746 * @param string $str String for recode ASCII part of UTF-7 back to ASCII
747 * @return string Recoded string
/**
 * Additional UTF-7 encoding function.
 *
 * Used as a preg_replace_callback() callback, so the argument is the array
 * of matches: element 1 is returned unchanged, element 2 is base64-encoded
 * (without "=" padding) and wrapped in "+" and "-".
 *
 * @param array $str Matches; elements 1 and 2 are read
 *
 * @return string Recoded string
 */
function repackUTF7Back($str)
{
    $untouched = $str[1];
    $encoded   = rtrim(base64_encode($str[2]), '=');

    return $untouched . '+' . $encoded . '-';
}