6959b1cdd621e0455662cc5d698d4634f0251af1
7 * Attention : Quelques modifications pour PHP 5.5 et 7
10 * @author Roman Ivanov <thingol@mail.ru>
11 * @author Miguel Vazquez Gocobachi <demrit@mx.gnu.org>
12 * @copyright 2004-2009 Roman Ivanov, Miguel Vazquez Gocobachi
13 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
15 * @link https://wackowiki.org/doc/Dev/Projects/SafeHTML
19 if (!defined('_ECRIRE_INC_VERSION')) return;
21 require_once(XML_HTMLSAX3
. 'HTMLSax3.php');
27 * This parser strips down all potentially dangerous content within HTML:
29 * <li>opening tag without its closing tag</li>
30 * <li>closing tag without its opening tag</li>
31 * <li>any of these tags: "base", "basefont", "head", "html", "body", "applet",
32 * "object", "iframe", "frame", "frameset", "script", "layer", "ilayer", "embed",
33 * "bgsound", "link", "meta", "style", "title", "blink", "xml" etc.</li>
34 * <li>any of these attributes: on*, data*, dynsrc</li>
35 * <li>javascript:/vbscript:/about: etc. protocols</li>
36 * <li>expression/behavior etc. in styles</li>
37 * <li>any other active content</li>
39 * It also tries to convert the markup to valid XHTML, but htmltidy is a far better
40 * solution for this task.
44 * $parser = new SafeHTML();
45 * $result = $parser->parse($doc);
50 * @author Roman Ivanov <thingol@mail.ru>
51 * @copyright 1997-2005 Roman Ivanov
52 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
53 * @version Release: @package_version@
54 * @link http://pear.php.net/package/SafeHTML
60 * Storage for resulting HTML output
64 protected $xhtml = '';
67 * Array of counters for each tag
71 protected $counter = array();
74 * Stack of unclosed tags
78 protected $stack = array();
81 * Array of counters for tags that must be deleted with all content
85 protected $dcCounter = array();
88 * Stack of unclosed tags that must be deleted with all content
92 protected $dcStack = array();
95 * Stores level of list (ol/ul) nesting
99 protected $listScope = 0;
102 * Stack of unclosed list tags
106 protected $liStack = array();
109 * Array of prepared regular expressions for protocols (schemas) matching
113 protected $protoRegexps = array();
116 * Array of prepared regular expressions for CSS matching
120 protected $cssRegexps = array();
123 * Should we perform UTF7 repacking or not?
125 * This repacking might replace completely normal strings such as "+31-" by illegal sequences,
126 * which cause the document to be truncated on saving to MySQL
131 var $repackUTF7 = true;
138 protected $allowTags = array();
142 * List of single tags ("<tag />")
146 public $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', );
149 * List of dangerous tags (such tags will be deleted)
153 public $deleteTags = array(
154 'applet', 'base', 'basefont', 'bgsound', 'blink', 'body',
155 'embed', 'frame', 'frameset', 'head', 'html', 'ilayer',
156 'iframe', 'layer', 'link', 'meta', 'object', 'style',
161 * List of dangerous tags (such tags will be deleted, and all content
162 * inside these tags will also be removed)
166 public $deleteTagsContent = array('script', 'style', 'title', 'xml', );
169 * Type of protocols filtering ('white' or 'black')
173 public $protocolFiltering = 'white';
176 * List of "dangerous" protocols (used for blacklist-filtering)
180 public $blackProtocols = array(
181 'about', 'chrome', 'data', 'disk', 'hcp',
182 'help', 'javascript', 'livescript', 'lynxcgi', 'lynxexec',
183 'ms-help', 'ms-its', 'mhtml', 'mocha', 'opera',
184 'res', 'resource', 'shell', 'vbscript', 'view-source',
185 'vnd.ms.radio', 'wysiwyg',
189 * List of "safe" protocols (used for whitelist-filtering)
193 public $whiteProtocols = array(
194 'ed2k', 'file', 'ftp', 'gopher', 'http', 'https',
195 'irc', 'mailto', 'news', 'nntp', 'telnet', 'webcal',
200 * List of attributes that can contain protocols
204 public $protocolAttributes = array(
205 'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src', 'formaction',
209 * List of dangerous CSS keywords
211 * The whole style="" attribute will be removed if the parser finds one of
216 public $cssKeywords = array(
217 'absolute', 'behavior', 'behaviour', 'content', 'expression',
218 'fixed', 'include-source', 'moz-binding',
222 * List of tags that can have no "closing tag"
225 * @deprecated XHTML does not allow such tags
227 public $noClose = array();
230 * List of block-level tags that terminate a paragraph
232 * An open paragraph is closed when one of these tags is opened
236 public $closeParagraph = array(
237 'address', 'article', 'aside', 'audio', 'blockquote', 'canvas',
238 'center', 'dd', 'dir', 'div', 'dl', 'dt',
239 'figure', 'figcaption', 'footer', 'h1', 'h2', 'h3',
240 'h4', 'h5', 'h6', 'header', 'hr', 'isindex',
241 'listing', 'main', 'marquee', 'menu', 'multicol', 'nav',
242 'ol', 'output', 'p', 'plaintext', 'pre', 'section',
243 'table', 'ul', 'video', 'xmp',
247 * List of table tags, all table tags outside a table will be removed
251 public $tableTags = array(
252 'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
261 public $listTags = array('dir', 'menu', 'ol', 'ul', 'dl', );
264 * List of dangerous attributes
268 public $attributes = array('dynsrc', 'id', 'name', );
271 * List of allowed "namespaced" attributes
275 public $attributesNS = array('xml:lang', );
282 public function __construct()
284 //making regular expressions based on Proto & CSS arrays
285 foreach ($this->blackProtocols
as $proto) {
286 $preg = "/[\s\x01-\x1F]*";
287 for ($i=0; $i<strlen($proto); $i++
) {
288 $preg .= $proto[$i] . "[\s\x01-\x1F]*";
291 $this->protoRegexps
[] = $preg;
294 foreach ($this->cssKeywords
as $css) {
295 $this->cssRegexps
[] = '/' . $css . '/i';
301 * Handles the writing of attributes - called from $this->openHandler()
303 * @param array $attrs array of attributes $name => $value
304 * @param string|null $tag
307 protected function writeAttrs ($attrs, $tag = null)
309 if (is_array($attrs)) {
310 foreach ($attrs as $name => $value) {
311 $name = strtolower($name);
313 if (strpos($name, 'on') === 0) {
317 if (strpos($name, 'data') === 0) {
321 if ($tag != 'a' and in_array($name, $this->attributes
)) {
325 if (!preg_match('/^[a-z0-9]+$/i', $name)) {
326 if (!in_array($name, $this->attributesNS
)) {
331 if (($value === true) ||
(is_null($value))) {
335 if ($name == 'style') {
336 // removes insignificant backslashes
337 $value = str_replace("\\", '', $value);
339 // removes CSS comments
341 $_value = preg_replace('!/\*.*?\*/!s', '', $value);
343 if ($_value == $value) {
350 // replace all & to &
351 $value = str_replace('&', '&', $value);
352 $value = str_replace('&', '&', $value);
354 foreach ($this->cssRegexps
as $css) {
355 if (preg_match($css, $value)) {
360 foreach ($this->protoRegexps
as $proto) {
361 if (preg_match($proto, $value)) {
367 $tempval = preg_replace_callback('/&#(\d+);?/m', function ($matches) { return chr($matches[1]); }, $value); //"'
368 $tempval = preg_replace_callback(
369 '/&#x([0-9a-f]+);?/mi',
370 function ($matches) { return chr(hexdec($matches[1])); },
374 if ((in_array($name, $this->protocolAttributes
))
375 && (strpos($tempval, ':') !== false)
377 if ($this->protocolFiltering
== 'black') {
378 foreach ($this->protoRegexps
as $proto) {
379 if (preg_match($proto, $tempval)) {
384 $_tempval = explode(':', $tempval);
385 $proto = $_tempval[0];
387 if (!in_array($proto, $this->whiteProtocols
)) {
393 $value = str_replace("\"", '"', $value);
394 $this->xhtml
.= ' ' . $name . '="' . $value . '"';
402 * Opening tag handler - called from HTMLSax
404 * @param object &$parser HTML Parser
405 * @param string $name tag name
406 * @param array $attrs tag attributes
410 public function openHandler(&$parser, $name, $attrs)
412 $name = strtolower($name);
414 if (in_array($name, $this->deleteTagsContent
)) {
415 array_push($this->dcStack
, $name);
416 $this->dcCounter
[$name] = isset($this->dcCounter
[$name])
417 ?
$this->dcCounter
[$name]+
1 : 1;
419 if (count($this->dcStack
) != 0) {
423 if (in_array($name, $this->deleteTags
)
424 && !in_array($name, $this->allowTags
)
429 if (!preg_match('/^[a-z0-9]+$/i', $name)) {
430 if (preg_match('!(?:\@|://)!i', $name)) {
431 $this->xhtml
.= '<' . $name . '>';
436 if (in_array($name, $this->singleTags
)) {
437 $this->xhtml
.= '<' . $name;
438 $this->writeAttrs($attrs, $name);
439 $this->xhtml
.= ' />';
443 // TABLES: cannot open table elements when we are not inside table
444 if ((isset($this->counter
['table']))
445 && ($this->counter
['table'] <= 0)
446 && (in_array($name, $this->tableTags
))
451 // PARAGRAPHS: close the open paragraph when a closeParagraph tag is opened
452 if ((in_array($name, $this->closeParagraph
))
453 && (in_array('p', $this->stack
))
455 $this->closeHandler($parser, 'p');
458 // LISTS: close the open <li> when another <li> of the same level is opened
459 if (($name == 'li') && count($this->liStack
)
460 && ($this->listScope
== $this->liStack
[count($this->liStack
) - 1])
462 $this->closeHandler($parser, 'li');
465 // LISTS: we want to know on what nesting level of lists we are
466 if (in_array($name, $this->listTags
)) {
471 array_push($this->liStack
, $this->listScope
);
474 $this->xhtml
.= '<' . $name;
475 $this->writeAttrs($attrs, $name);
477 array_push($this->stack
,$name);
478 $this->counter
[$name] = isset($this->counter
[$name])
479 ?
($this->counter
[$name] +
1) : 1;
485 * Closing tag handler - called from HTMLSax
487 * @param object &$parser HTML parser
488 * @param string $name tag name
492 public function closeHandler(&$parser, $name)
494 $name = strtolower($name);
496 if (isset($this->dcCounter
[$name])
497 && ($this->dcCounter
[$name] > 0)
498 && (in_array($name, $this->deleteTagsContent
))
500 while ($name != ($tag = array_pop($this->dcStack
))) {
501 --$this->dcCounter
[$tag];
504 --$this->dcCounter
[$name];
507 if (count($this->dcStack
) != 0) {
511 if ((isset($this->counter
[$name])) && ($this->counter
[$name] > 0)) {
512 while ($name != ($tag = array_pop($this->stack
))) {
513 $this->closeTag($tag);
516 $this->closeTag($name);
524 * @param string $tag tag name
528 protected function closeTag($tag)
530 if (!in_array($tag, $this->noClose
)) {
531 $this->xhtml
.= '</' . $tag . '>';
534 --$this->counter
[$tag];
536 if (in_array($tag, $this->listTags
)) {
541 array_pop($this->liStack
);
548 * Character data handler - called from HTMLSax
550 * @param object &$parser HTML parser
551 * @param string $data textual data
555 public function dataHandler(&$parser, $data)
557 if (count($this->dcStack
) == 0) {
558 $this->xhtml
.= $data;
565 * Escape handler - called from HTMLSax
567 * @param object &$parser HTML parser
568 * @param string $data comments or other type of data
572 public function escapeHandler(&$parser, $data)
582 * $safe = new HTML_Safe;
583 * $safe->setAllowTags(array('body'));
586 * @param array $tags Tags to allow
590 public function setAllowTags($tags = array())
592 if (is_array($tags)) {
593 $this->allowTags
= $tags;
598 * Returns the allowed tags
602 public function getAllowTags()
604 return $this->allowTags
;
608 * Reset the allowed tags
612 public function resetAllowTags()
614 $this->allowTags
= array();
618 * Returns the XHTML document
620 * @return string Processed (X)HTML document
622 public function getXHTML()
624 while ($tag = array_pop($this->stack
)) {
625 $this->closeTag($tag);
632 * Clears current document data
636 public function clear()
643 * Main parsing function
645 * @param string $doc HTML document for processing
647 * @return string Processed (X)HTML document
649 public function parse($doc)
653 // Save all '<' symbols
654 $doc = preg_replace('/<(?=[^a-zA-Z\/\!\?\%])/', '<', $doc);
656 // Web documents shouldn't contain the \x00 symbol
657 $doc = str_replace("\x00", '', $doc);
659 // Opera6 bug workaround
660 $doc = str_replace("\xC0\xBC", '<', $doc);
662 if ($this->repackUTF7
) {
663 // UTF-7 encoding ASCII decode
664 $doc = $this->repackUTF7($doc);
667 // Instantiate the parser
668 $parser = new XML_HTMLSax3();
671 $parser->set_object($this);
673 $parser->set_element_handler('openHandler', 'closeHandler');
674 $parser->set_data_handler('dataHandler');
675 $parser->set_escape_handler('escapeHandler');
677 $parser->parse($doc);
679 $result = $this->getXHTML();
687 * UTF-7 decoding function
689 * @param string $str HTML document whose UTF-7 encoded ASCII parts are recoded back to ASCII
690 * @return string Decoded document
692 protected function repackUTF7($str)
694 return preg_replace_callback('!\+([0-9a-zA-Z/]+)\-!', array($this, 'repackUTF7Callback'), $str);
698 * Additional UTF-7 decoding function (callback for preg_replace_callback)
700 * @param array $str Regex match; $str[1] is the base64 payload of one UTF-7 sequence
701 * @return string Recoded string
703 protected function repackUTF7Callback($str)
705 $str = base64_decode($str[1]);
706 $str = preg_replace_callback('/^((?:\x00.)*)((?:[^\x00].)+)/', array($this, 'repackUTF7Back'), $str);
707 return preg_replace('/\x00(.)/', '$1', $str);
711 * Additional UTF-7 encoding function (callback for preg_replace_callback)
713 * @param array $str Regex match; $str[1] is the leading \x00-prefixed part, $str[2] the part to re-encode
714 * @return string Recoded string
716 protected function repackUTF7Back($str)
718 return $str[1].'+'.rtrim(base64_encode($str[2]), '=').'-';