7 * Note: a few modifications were made for PHP 5.5 and PHP 7 compatibility
10 * @author Roman Ivanov <thingol@mail.ru>
11 * @copyright 2004-2005 Roman Ivanov
12 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
14 * @link http://pixel-apes.com/safehtml/
18 if (!defined('_ECRIRE_INC_VERSION')) return;
20 require_once(XML_HTMLSAX3
. 'HTMLSax3.php');
26 * This parser strips down all potentially dangerous content within HTML:
28 * <li>opening tag without its closing tag</li>
29 * <li>closing tag without its opening tag</li>
30 * <li>any of these tags: "base", "basefont", "head", "html", "body", "applet",
31 * "object", "iframe", "frame", "frameset", "script", "layer", "ilayer", "embed",
32 * "bgsound", "link", "meta", "style", "title", "blink", "xml" etc.</li>
33 * <li>any of these attributes: on*, data*, dynsrc</li>
34 * <li>javascript:/vbscript:/about: etc. protocols</li>
35 * <li>expression/behavior etc. in styles</li>
36 * <li>any other active content</li>
38 * It also tries to convert the code to valid XHTML, but htmltidy is a far better
39 * solution for this task.
43 * $parser = new SafeHTML();
44 * $result = $parser->parse($doc);
49 * @author Roman Ivanov <thingol@mail.ru>
50 * @copyright 1997-2005 Roman Ivanov
51 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
52 * @version Release: @package_version@
53 * @link http://pear.php.net/package/SafeHTML
58 * Storage for resulting HTML output
66 * Array of counters for each tag
71 var $_counter = array();
74 * Stack of unclosed tags
79 var $_stack = array();
82 * Array of counters for tags that must be deleted with all content
87 var $_dcCounter = array();
90 * Stack of unclosed tags that must be deleted with all content
95 var $_dcStack = array();
98 * Stores level of list (ol/ul) nesting
106 * Stack of unclosed list tags
111 var $_liStack = array();
114 * Array of prepared regular expressions for protocols (schemas) matching
119 var $_protoRegexps = array();
122 * Array of prepared regular expressions for CSS matching
127 var $_cssRegexps = array();
130 * Should we perform UTF7 repacking or not?
132 * This repacking might replace completely normal strings such as "+31-" by illegal sequences,
133 * which cause the document to be truncated on saving to MySQL
138 var $repackUTF7 = true;
141 * List of single tags ("<tag />")
146 var $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', );
149 * List of dangerous tags (such tags will be deleted)
154 var $deleteTags = array(
155 'applet', 'base', 'basefont', 'bgsound', 'blink', 'body',
156 'embed', 'frame', 'frameset', 'head', 'html', 'ilayer',
157 'iframe', 'layer', 'link', 'meta', 'object', 'style',
162 * List of dangerous tags (such tags will be deleted, and all content
163 * inside these tags will also be removed)
168 var $deleteTagsContent = array('script', 'style', 'title', 'xml', );
171 * Type of protocols filtering ('white' or 'black')
176 var $protocolFiltering = 'white';
179 * List of "dangerous" protocols (used for blacklist-filtering)
184 var $blackProtocols = array(
185 'about', 'chrome', 'data', 'disk', 'hcp',
186 'help', 'javascript', 'livescript', 'lynxcgi', 'lynxexec',
187 'ms-help', 'ms-its', 'mhtml', 'mocha', 'opera',
188 'res', 'resource', 'shell', 'vbscript', 'view-source',
189 'vnd.ms.radio', 'wysiwyg',
193 * List of "safe" protocols (used for whitelist-filtering)
198 var $whiteProtocols = array(
199 'ed2k', 'file', 'ftp', 'gopher', 'http', 'https',
200 'irc', 'mailto', 'news', 'nntp', 'telnet', 'webcal',
205 * List of attributes that can contain protocols
210 var $protocolAttributes = array(
211 'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src', 'formaction',
215 * List of dangerous CSS keywords
217 * Whole style="" attribute will be removed, if parser will find one of
223 var $cssKeywords = array(
224 'absolute', 'behavior', 'behaviour', 'content', 'expression',
225 'fixed', 'include-source', 'moz-binding',
229 * List of tags that can have no "closing tag"
233 * @deprecated XHTML does not allow such tags
235 var $noClose = array();
238 * List of block-level tags that terminates paragraph
240 * An open paragraph is closed when one of these tags is opened
245 var $closeParagraph = array(
246 'address', 'blockquote', 'center', 'dd', 'dir', 'div',
247 'dl', 'dt', 'h1', 'h2', 'h3', 'h4',
248 'h5', 'h6', 'hr', 'isindex', 'listing', 'marquee',
249 'menu', 'multicol', 'ol', 'p', 'plaintext', 'pre',
250 'table', 'ul', 'xmp',
254 * List of table tags, all table tags outside a table will be removed
259 var $tableTags = array(
260 'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
270 var $listTags = array('dir', 'menu', 'ol', 'ul', 'dl', );
273 * List of dangerous attributes
278 var $attributes = array('dynsrc', 'id', 'name', );
281 * List of allowed "namespaced" attributes
286 var $attributesNS = array('xml:lang', );
/**
 * Builds the regular expressions used by the attribute filters.
 *
 * Fills $this->_protoRegexps with one pattern per entry of
 * $this->blackProtocols and $this->_cssRegexps with one pattern per entry
 * of $this->cssKeywords. Both lists are reset first, so calling this more
 * than once does not duplicate patterns.
 *
 * This method used to be declared only under the misspelled name
 * "__contruct", which PHP never invokes on instantiation; both regexp lists
 * therefore stayed empty and the CSS / black-list protocol checks matched
 * nothing.
 */
function __construct()
{
    // Whitespace and control characters tolerated before, between and after
    // the letters of a protocol name (e.g. "java\tscript:").
    $gap = "[\s\x01-\x1F]*";

    $this->_protoRegexps = array();
    foreach ($this->blackProtocols as $proto) {
        $preg = '/' . $gap;
        $length = strlen($proto);
        for ($i = 0; $i < $length; $i++) {
            // Square-bracket offsets: the "$proto{$i}" form was removed in PHP 8.
            $preg .= $proto[$i] . $gap;
        }
        // NOTE(review): the line closing the pattern is missing from the
        // excerpt under review; ':/i' (scheme separator, case-insensitive,
        // same flags as the CSS patterns below) is assumed - confirm.
        $preg .= ':/i';
        $this->_protoRegexps[] = $preg;
    }

    $this->_cssRegexps = array();
    foreach ($this->cssKeywords as $css) {
        $this->_cssRegexps[] = '/' . $css . '/i';
    }
}

/**
 * Backward-compatible alias kept for code that calls the historical,
 * misspelled method name explicitly.
 *
 * @deprecated use the regular constructor
 */
function __contruct()
{
    $this->__construct();
}
/**
 * Handles the writing of attributes - called from $this->_openHandler()
 *
 * Every attribute that passes the filters below is appended to
 * $this->_xhtml as ` name="value"`; attributes that fail a filter are
 * dropped.
 *
 * @param array  $attrs array of attributes $name => $value
 * @param string $tag   (optional) name of the tag being written; attributes
 *                      listed in $this->attributes are only kept on "a"
 * @return boolean
 */
function _writeAttrs ($attrs, $tag = null)
{
    // NOTE(review): the excerpt under review omits the short lines of this
    // method. The "continue" statements, the "$value = $name" default, the
    // comment-stripping loop header, the "else" of the protocol filter and
    // the final "return true" are reconstructed - confirm against the file.
    if (!is_array($attrs)) {
        return true;
    }

    // Numeric character reference decoders. Closures replace
    // create_function(), which was removed in PHP 8 and allocated a new
    // function on every attribute. The (int) casts keep chr() from receiving
    // an overlong digit string or a float from hexdec().
    $decodeDecimal = function ($m) {
        return chr((int) $m[1]);
    };
    $decodeHex = function ($m) {
        return chr((int) hexdec($m[1]));
    };

    foreach ($attrs as $name => $value) {
        $name = strtolower($name);

        // event handlers (on*) are never written
        if (strpos($name, 'on') === 0) {
            continue;
        }
        // neither are data* attributes
        if (strpos($name, 'data') === 0) {
            continue;
        }
        // attributes from $this->attributes are only tolerated on <a>
        if ($tag != 'a' and in_array($name, $this->attributes)) {
            continue;
        }
        // non-alphanumeric names must be explicitly allowed
        if (!preg_match("/^[a-z0-9]+$/i", $name)) {
            if (!in_array($name, $this->attributesNS)) {
                continue;
            }
        }

        // attribute given without a value
        if (($value === true) || (is_null($value))) {
            $value = $name;
        }

        if ($name == 'style') {
            // removes insignificant backslashes
            $value = str_replace("\\", '', $value);

            // removes CSS comments, repeating until nothing changes so that
            // comments revealed by a previous removal are stripped too
            while (true) {
                $_value = preg_replace("!/\*.*?\*/!s", '', $value);
                if ($_value == $value) break;
                $value = $_value;
            }

            // encode every & as &amp; (existing &amp; are collapsed first
            // so they are not double-encoded); the listing showed both
            // calls as '&' => '&', i.e. with the entity text lost
            $value = str_replace('&amp;', '&', $value);
            $value = str_replace('&', '&amp;', $value);

            // a single dangerous keyword or protocol drops the whole attribute
            foreach ($this->_cssRegexps as $css) {
                if (preg_match($css, $value)) {
                    continue 2;
                }
            }
            foreach ($this->_protoRegexps as $proto) {
                if (preg_match($proto, $value)) {
                    continue 2;
                }
            }
        }

        // Protocol checks run on a copy with numeric entities decoded, so
        // that e.g. "&#106;avascript:" is recognised; the original value is
        // what gets written.
        $tempval = preg_replace_callback('/&#(\d+);?/m', $decodeDecimal, $value);
        $tempval = preg_replace_callback('/&#x([0-9a-f]+);?/mi', $decodeHex, $tempval);

        if ((in_array($name, $this->protocolAttributes)) &&
            (strpos($tempval, ':') !== false))
        {
            if ($this->protocolFiltering == 'black') {
                foreach ($this->_protoRegexps as $proto) {
                    if (preg_match($proto, $tempval)) continue 2;
                }
            } else {
                $_tempval = explode(':', $tempval);
                $proto = $_tempval[0];
                if (!in_array($proto, $this->whiteProtocols)) {
                    continue;
                }
            }
        }

        // the value is emitted between double quotes, so escape them
        // (the listing showed the replacement as a bare ", which does not parse)
        $value = str_replace("\"", '&quot;', $value);
        $this->_xhtml .= ' ' . $name . '="' . $value . '"';
    }

    return true;
}
/**
 * Opening tag handler - called from HTMLSax
 *
 * Decides whether the tag is dropped, escaped or written to $this->_xhtml,
 * and keeps the open-tag stacks and counters up to date.
 *
 * @param object $parser HTML Parser
 * @param string $name   tag name
 * @param array  $attrs  tag attributes
 * @return boolean
 */
function _openHandler(&$parser, $name, $attrs)
{
    // NOTE(review): the excerpt under review omits the short lines of this
    // method. The "return true" statements, the list-scope increment and the
    // "li" test before the _liStack push are reconstructed - confirm.
    $name = strtolower($name);

    // Tags deleted together with their content: remember that we are inside one
    if (in_array($name, $this->deleteTagsContent)) {
        array_push($this->_dcStack, $name);
        $this->_dcCounter[$name] = isset($this->_dcCounter[$name])
            ? $this->_dcCounter[$name] + 1
            : 1;
    }
    // Anything opened inside such a tag is discarded
    if (count($this->_dcStack) != 0) {
        return true;
    }

    // Dangerous tags are dropped (their content is kept)
    if (in_array($name, $this->deleteTags)) {
        return true;
    }

    // Not a plain alphanumeric tag name
    if (!preg_match("/^[a-z0-9]+$/i", $name)) {
        // Looks like an e-mail address or URL written between angle
        // brackets: keep it as text. The brackets must be entity-escaped,
        // otherwise an unfiltered tag reaches the output (the listing showed
        // them as raw '<' and '>').
        if (preg_match("!(?:\@|://)!i", $name)) {
            $this->_xhtml .= '&lt;' . $name . '&gt;';
        }
        return true;
    }

    // Single tags are written self-closed and never pushed on the stack
    if (in_array($name, $this->singleTags)) {
        $this->_xhtml .= '<' . $name;
        $this->_writeAttrs($attrs, $name);
        $this->_xhtml .= ' />';
        return true;
    }

    // TABLES: cannot open table elements when we are not inside table
    if ((isset($this->_counter['table'])) && ($this->_counter['table'] <= 0)
        && (in_array($name, $this->tableTags)))
    {
        return true;
    }

    // PARAGRAPHS: close paragraph when closeParagraph tags opening
    if ((in_array($name, $this->closeParagraph)) && (in_array('p', $this->_stack))) {
        $this->_closeHandler($parser, 'p');
    }

    // LISTS: we should close <li> if <li> of the same level opening
    if ($name == 'li' && count($this->_liStack) &&
        $this->_listScope == $this->_liStack[count($this->_liStack) - 1])
    {
        $this->_closeHandler($parser, 'li');
    }

    // LISTS: we want to know on what nesting level of lists we are
    if (in_array($name, $this->listTags)) {
        $this->_listScope++;
    }
    if ($name == 'li') {
        array_push($this->_liStack, $this->_listScope);
    }

    $this->_xhtml .= '<' . $name;
    $this->_writeAttrs($attrs, $name);
    $this->_xhtml .= '>';
    array_push($this->_stack, $name);
    $this->_counter[$name] = isset($this->_counter[$name])
        ? $this->_counter[$name] + 1
        : 1;

    return true;
}
/**
 * Closing tag handler - called from HTMLSax
 *
 * @param object $parser HTML parser
 * @param string $name   tag name
 * @return boolean
 */
function _closeHandler(&$parser, $name)
{
    // NOTE(review): the two "return true" statements are not visible in the
    // excerpt under review and are reconstructed - confirm.
    $name = strtolower($name);

    // A "delete with content" tag that is currently open is being closed:
    // unwind the deleted-content stack down to it, fixing the counters.
    $closesDeletedTag = isset($this->_dcCounter[$name])
        && ($this->_dcCounter[$name] > 0)
        && (in_array($name, $this->deleteTagsContent));
    if ($closesDeletedTag) {
        for (
            $tag = array_pop($this->_dcStack);
            $name != $tag;
            $tag = array_pop($this->_dcStack)
        ) {
            $this->_dcCounter[$tag]--;
        }
        $this->_dcCounter[$name]--;
    }

    // Still inside deleted content: nothing is written
    if (count($this->_dcStack) != 0) {
        return true;
    }

    // Only tags that are actually open get closed; every tag opened after
    // this one and left unclosed is closed first.
    $isOpen = (isset($this->_counter[$name])) && ($this->_counter[$name] > 0);
    if ($isOpen) {
        for (
            $tag = array_pop($this->_stack);
            $name != $tag;
            $tag = array_pop($this->_stack)
        ) {
            $this->_closeTag($tag);
        }
        $this->_closeTag($name);
    }

    return true;
}
/**
 * Writes the closing tag and updates the bookkeeping for that tag.
 *
 * @param string $tag tag name
 * @return boolean
 */
function _closeTag($tag)
{
    // NOTE(review): the excerpt under review omits the short lines of this
    // method. The list-scope decrement, the "li" test guarding the _liStack
    // pop and the final "return true" are reconstructed - confirm.

    // tags listed in $this->noClose get no closing tag
    $writesClosingTag = !in_array($tag, $this->noClose);
    if ($writesClosingTag) {
        $this->_xhtml .= '</' . $tag . '>';
    }

    $this->_counter[$tag]--;

    // leaving a list: one nesting level less
    if (in_array($tag, $this->listTags)) {
        $this->_listScope--;
    }

    // leaving a list item: forget its nesting level
    if ($tag == 'li') {
        array_pop($this->_liStack);
    }

    return true;
}
/**
 * Character data handler - called from HTMLSax
 *
 * Text is appended to the output unless a tag whose content must be
 * deleted is currently open.
 *
 * @param object $parser HTML parser
 * @param string $data   textual data
 * @return boolean
 */
function _dataHandler(&$parser, $data)
{
    // NOTE(review): the "return true" is not visible in the excerpt under
    // review and is reconstructed - confirm.
    $insideDeletedContent = (count($this->_dcStack) != 0);
    if (!$insideDeletedContent) {
        $this->_xhtml .= $data;
    }

    return true;
}
553 * Escape handler - called from HTMLSax
555 * @param object $parser HTML parser
556 * @param string $data comments or other type of data
560 function _escapeHandler(&$parser, $data)
566 * Returns the XHTML document
568 * @return string Processed (X)HTML document
573 while ($tag = array_pop($this->_stack
)) {
574 $this->_closeTag($tag);
577 return $this->_xhtml
;
581 * Clears current document data
593 * Main parsing function
595 * @param string $doc HTML document for processing
596 * @return string Processed (X)HTML document
602 // Save all '<' symbols
603 $doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '<', $doc);
605 // Web documents shouldn't contain the \x00 symbol
606 $doc = str_replace("\x00", '', $doc);
608 // Opera6 bug workaround
609 $doc = str_replace("\xC0\xBC", '<', $doc);
611 if ($this->repackUTF7
) {
612 // UTF-7 encoding ASCII decode
613 $doc = $this->repackUTF7($doc);
616 // Instantiate the parser
617 $parser = new XML_HTMLSax3();
620 $parser->set_object($this);
622 $parser->set_element_handler('_openHandler','_closeHandler');
623 $parser->set_data_handler('_dataHandler');
624 $parser->set_escape_handler('_escapeHandler');
626 $parser->parse($doc);
628 return $this->getXHTML();
/**
 * UTF-7 decoding function
 *
 * Passes every "+<base64 characters>-" sequence found in the document to
 * repackUTF7Callback() and substitutes its result.
 *
 * @param string $str HTML document for recode ASCII part of UTF-7 back to ASCII
 * @return string Decoded document
 */
function repackUTF7($str)
{
    $handler = array($this, 'repackUTF7Callback');

    return preg_replace_callback('!\+([0-9a-zA-Z/]+)\-!', $handler, $str);
}
/**
 * Additional UTF-7 decoding function
 *
 * @param array $str Match data from repackUTF7(); index 1 holds the base64
 *                   payload found between "+" and "-"
 * @return string Recoded string
 */
function repackUTF7Callback($str)
{
    $decoded = base64_decode($str[1]);

    // Leading "\x00 + byte" pairs are kept; the run of pairs that follows
    // and whose first byte is not \x00 is handed to repackUTF7Back() to be
    // re-encoded.
    $repacked = preg_replace_callback(
        '/^((?:\x00.)*)((?:[^\x00].)+)/',
        array($this, 'repackUTF7Back'),
        $decoded
    );

    // Drop each \x00 byte together with keeping the byte that follows it
    return preg_replace('/\x00(.)/', '$1', $repacked);
}
/**
 * Additional UTF-7 encoding function
 *
 * @param array $str Match data from repackUTF7Callback(); index 1 is
 *                   returned unchanged, index 2 is re-encoded as
 *                   "+<base64 without padding>-"
 * @return string Recoded string
 */
function repackUTF7Back($str)
{
    $encoded = rtrim(base64_encode($str[2]), '=');

    return $str[1] . '+' . $encoded . '-';
}