4 Takes a html text and returns something semantic and maybe valid
6 Copyleft (C) 2006-11 BohwaZ - http://bohwaz.net/
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU Affero General Public License as
10 published by the Free Software Foundation, version 3 of the
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU Affero General Public License for more details.
18 You should have received a copy of the GNU Affero General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
// Exception type thrown by the parser in this file: attribute syntax errors,
// un-authorized or invalid tags found in check mode, and encoding failures in escape().
22 class Garbage_Exception
extends Exception
29 * Secure attributes contents?
30 * Will check for url scheme and url content in href and src
31 * It's advised to disable <script> and <style> tags and style attribute
32 * because they could be used for XSS attacks
34 public $secure = true;
37 * Enclose text which is not in any element in <p> tags?
39 public $enclose_text = true;
42 * Auto-add <br /> in text blocks?
43 * Will also break <p> blocks when encountering a double line break.
57 public $auto_br = true;
60 * Text encoding (used for escaping)
62 public $encoding = 'UTF-8';
65 * Remove forbidden tags from output?
66 * If true, "<em>" will disappear if it's not in allowed tags.
67 * If false, "<em>" will become a text node containing the escaped text &lt;em&gt;
69 public $remove_forbidden_tags = false;
72 * Remove forbidden tags contents?
73 * If true "<b>Hello</b>" will become "" if <b> is not allowed
74 * If false "<b>Hello</b>" will become "Hello"
76 public $remove_forbidden_tags_content = false;
// Output indentation setting read by outputNodes() and indentTag():
// false disables indentation, true uses the built-in step, and any other
// value is cast to int and used as the step.
78 public $indent = true;
81 * Core attributes allowed on each element
83 public $core_attributes = array('lang', 'class', 'id', 'title', 'dir');
88 * 'tag' => true, // Allows core attributes
89 * 'tag' => false, // Disallow core attributes
90 * 'tag' => array('allowed attribute 1', 'href', 'src'),
91 * // Allow core attributes and those specific attributes
93 public $block_tags = array(
112 'blockquote'=> array('cite'),
114 'object'=> array('type', 'width', 'height', 'data'),
115 'iframe'=> array('src', 'width', 'height', 'frameborder', 'scrolling'),
117 'table' => array('summary'),
122 'colgroup' => array('span'),
125 'th' => array('colspan', 'rowspan', 'scope', 'headers'),
126 'td' => array('colspan', 'rowspan', 'headers'),
131 'audio' => array('src', 'controls', 'loop', 'preload'),
137 'video' => array('src', 'controls', 'width', 'height', 'poster'),
142 * Allowed inline elements
144 public $inline_tags = array(
145 // 'tag' => array of allowed attributes
146 'abbr' => array('title'),
148 'acronym' => array('title'),
151 'q' => array('cite'),
173 'a' => array('href', 'hreflang', 'rel'),
174 'img' => array('src', 'alt', 'width', 'height'),
176 'param' => array('name', 'value', 'type'),
181 'time' => array('pubdate', 'datetime'),
// Map keyed by URL scheme. protectAttribute() tests src/href schemes against
// its keys and appends the mapped value right after the scheme when it
// rebuilds the URL (presumably the ':' or '://' separator; the entries are
// not visible here).
185 public $allowed_url_schemes = array(
204 * Tags whose content needs to be enclosed
206 public $elements_need_enclose = array('blockquote', 'form', 'address', 'noscript');
209 * Elements that accept <br /> inside
211 public $elements_allow_break = array('p', 'dd', 'dt', 'li', 'td', 'th', 'div');
214 * Autoclosing tags (eg. <br />)
216 public $autoclosing_tags = array('br', 'hr', 'img', 'param');
218 ///////// PRIVATE PROPERTIES
// Stack of currently open tag names: buildTree() pushes on an opening tag
// and pops when it meets a closing tag.
220 private $opened = array();
// Token list produced by preg_split() in parse() and consumed by buildTree().
221 private $matches = array();
// Set to true by check() for the duration of a parse; in that mode the
// parser throws Garbage_Exception on bad input and skips building nodes.
223 private $check_only = false;
// inline_tags merged with block_tags; rebuilt on every parse() call.
225 private $allowed_tags = array();
// Splits markup on tags, capturing the optional leading '/' and the text
// found between '<' and '>'.
227 const SPLIT_REGEXP
= '!<(/?)([^><]*)>!';
// Matches either a run of quoted strings (backslash escapes allowed) and/or
// bare words, or a lone '='. The 'x' flag makes the spaces in the pattern
// insignificant.
228 const ATTRIBUTE_REGEXP
= '/(?:(?:"[^"\\\\]*(?:\\\\.[^"\\\\]*)*"|\'[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*\') | (?>[^"\'=\s]+))+|[=]/x';
// Normalises $string and tokenises it into a node tree.
// HTML comments and CDATA sections are removed, line endings are unified to
// "\n", every <br> variant is rewritten to '<br />', and the trimmed result
// is split on tags before being handed to buildTree().
230 public function parse($string)
// 'U' makes the quantifiers lazy and 's' lets '.' span newlines, so each
// comment / CDATA section is removed individually.
232 $string = preg_replace('#<!--.*-->#Us', '', $string);
233 $string = preg_replace('#<!\[CDATA\[.*\]\]>#Us', '', $string);
235 $string = str_replace(array("\r\n", "\r"), "\n", $string);
236 $string = preg_replace('!<br\s*/?>!i', '<br />', $string);
237 $string = trim($string);
// Drop any state left over from a previous run before tokenising.
239 $this->resetInternals();
// Inline and block tags share a single lookup table while parsing.
240 $this->allowed_tags
= array_merge($this->inline_tags
, $this->block_tags
);
// PREG_SPLIT_DELIM_CAPTURE keeps the captured '/' marker and tag text in the
// token list, between the surrounding text pieces.
242 $this->matches
= preg_split(self
::SPLIT_REGEXP
, $string, null, PREG_SPLIT_DELIM_CAPTURE
);
245 $nodes = $this->buildTree();
246 $this->resetInternals();
/**
 * Checks a string validity.
 *
 * Runs parse() in check-only mode, in which the parser throws on the first
 * problem it meets instead of producing output nodes.
 *
 * @param string $string Markup to validate
 * @return bool true when parsing finished without throwing
 * @throws Garbage_Exception when the parser rejects the markup
 */
public function check($string)
{
    $this->check_only = true;

    try
    {
        $this->parse($string);
    }
    catch (Exception $e)
    {
        // Leave check-only mode before propagating the error; otherwise a
        // failed check() would leave later process() calls stuck in check mode.
        $this->check_only = false;
        throw $e;
    }

    $this->check_only = false;

    return true;
}
// Full pipeline: parses $string, optionally wraps loose top-level content in
// <p> elements, runs the automatic line-break pass and returns the
// serialised markup.
265 public function process($string)
267 $nodes = $this->parse($string);
270 if ($this->enclose_text
)
// A false parent marks the top level of the document for encloseChildren().
272 $nodes = $this->encloseChildren($nodes, false);
// NOTE(review): the condition guarding this call is not visible here
// (presumably the auto_br option) - confirm.
277 $nodes = $this->autoLineBreak($nodes);
280 return $this->outputNodes($nodes);
284 * Outputs a string from a nodes array
// $nodes: list of nodes; element nodes are arrays with 'name', 'attrs' and
//         'children' keys.
// $level: nesting depth, used to scale indentation; the recursive call
//         below passes $level + 1.
286 public function outputNodes($nodes, $level = 0)
290 foreach ($nodes as $node)
296 $open = '<'.$node['name'];
// Attribute values are written as-is: getTagAttributes() stores them after
// they have been through protectAttribute().
298 foreach ($node['attrs'] as $key=>$value)
300 $open .= ' '.$key.'="'.$value.'"';
303 if ($this->isTagAutoclosing($node['name']))
310 $close = '</'.$node['name'].'>';
313 if (!empty($node['children']))
315 $content = $this->outputNodes($node['children'], $level +
1);
// Re-indent only when a closing tag exists, indentation is enabled and the
// element is a block tag other than <pre>.
318 if ($close && $this->indent
!== false && array_key_exists($node['name'], $this->block_tags
) && $node['name'] != 'pre')
// The per-level step is 1 when indent === true, otherwise (int) indent.
320 $tag = $this->indentTag($open, $content, $close, $level * ($this->indent
=== true ?
1 : (int) $this->indent
));
324 $tag = $open . $content . $close;
326 if ($node['name'] == 'br')
// Builds an indented, multi-line rendering of an element in $out.
// $indent spaces are emitted before and after the content loop (presumably
// ahead of $open and $close; those appends are not visible here), and
// $content is processed one "\n"-separated line at a time with a deeper indent.
341 private function indentTag($open, $content, $close, $indent)
344 $out.= str_repeat(' ', $indent);
348 $content = explode("\n", $content);
350 foreach ($content as $line)
// Content lines get 2 extra spaces when indent === true, otherwise
// (int) indent extra spaces.
355 $out.= str_repeat(' ', $indent +
($this->indent
=== true ?
2 : (int) $this->indent
));
361 $out.= str_repeat(' ', $indent);
// Clears parser state: the open-tag stack and the pending token list.
// parse() calls this both before tokenising and after the tree is built.
368 private function resetInternals()
370 $this->opened
= array();
371 $this->matches
= array();
376 * Break line and paragraphs following this rule :
377 * in a paragraph : one line break = <br />,
378 * two line breaks = closing paragraph and opening a new one
379 * in other elements : nl2br
// $nodes:  sibling nodes (text strings or element arrays) to process.
// $parent: name of the enclosing element; false for the top-level call.
381 private function autoLineBreak($nodes, $parent = false)
384 $nb_nodes = count($nodes);
387 foreach ($nodes as $node)
389 // Text node inside an element allowing for line breaks
390 if (is_string($node) && in_array($parent, $this->elements_allow_break
))
// Capturing the newline runs keeps them in $matches between the text pieces.
392 $matches = preg_split('!(\n+)!', $node, -1, PREG_SPLIT_DELIM_CAPTURE
);
394 $max = count($matches);
396 while (($line = array_shift($matches)) !== null)
// A <br /> is only added when something was already emitted and more
// content follows (another sibling node or another piece of this text).
401 if (!empty($n) && ($k < $nb_nodes - 1 ||
$i < $max))
403 $n[] = array('name' => 'br', 'attrs' => array(), 'children' => array());
412 // In paragraphs we'll try to split them each two-line breaks
413 elseif (is_array($node) && $node['name'] == 'p')
// Start a replacement paragraph that carries the original's attributes.
415 $n[] = array('name' => 'p', 'attrs' => $node['attrs'], 'children' => array());
416 $current_node = count($n) - 1;
418 // Because we need to work on parent-level we will loop on children here
419 // (so we don't do recursive calls)
420 while (($child = array_shift($node['children'])) !== null)
422 // Text node ? try to split it
423 if (is_string($child))
425 $matches = preg_split('!(\n+)!', $child, -1, PREG_SPLIT_DELIM_CAPTURE
);
427 $max = count($matches);
429 while (($line = array_shift($matches)) !== null)
433 // More than 2 line-breaks then we create a new paragraph and we continue
// The test below is true for two or more characters (assuming $line is a
// newline run at this point; the check that establishes this is not visible here).
434 if (strlen($line) >= 2)
436 $n[] = array('name' => 'p', 'attrs' => $node['attrs'], 'children' => array());
437 $current_node = count($n) - 1;
443 // but no line break just after or before end
// NOTE(review): this uses "$i - 1 < $max" while the non-paragraph branch
// above uses "$i < $max" - confirm the difference is intended.
444 elseif (!empty($n[$current_node]['children']) && ($i - 1 < $max ||
$k < $nb_nodes - 1))
446 $n[$current_node]['children'][] = array('name' => 'br', 'attrs' => array(), 'children' => array());
451 $n[$current_node]['children'][] = $line;
// Non-text children are moved into the current paragraph unchanged.
457 $n[$current_node]['children'][] = $child;
// Recurse into element nodes, passing the element's name as the new parent.
463 if (!is_string($node) && !empty($node['children']))
465 $node['children'] = $this->autoLineBreak($node['children'], $node['name']);
// Extracts the attributes found in a tag's inner text.
// $value: the text between '<' and '>' (tag name followed by attributes).
// $tag:   the tag name used to look up which attributes are allowed.
// Accepted attributes are collected into $attributes as name => value, the
// value having been passed through protectAttribute().
478 private function getTagAttributes($value, $tag)
480 $attributes = array();
482 if (array_key_exists($tag, $this->allowed_tags
))
// From here on $tag holds the tag's rule (true, false or a list of extra
// attribute names) instead of its name.
484 $tag =& $this->allowed_tags
[$tag];
// Namespaced tags are looked up under their "prefix:" key.
486 elseif (preg_match('!^[a-zA-Z0-9-]+:!', $tag, $match) && array_key_exists($match[0], $this->allowed_tags
))
488 $tag =& $this->allowed_tags
[$match[0]];
// Strip the leading tag name (everything up to the first whitespace).
491 $value = preg_replace('!^.*\s+!U', '', $value);
493 if (preg_match_all(self
::ATTRIBUTE_REGEXP
, $value, $match))
// $value is reused as the loop variable: each token is an attribute name,
// an '=' sign or an attribute value.
498 foreach($match[0] as $value)
// Attribute names are compared in lower case.
502 $name = strtolower((string) $value);
506 // Allowed attribute ?
507 if ($tag && in_array($name, $this->core_attributes
))
509 elseif (is_array($tag) && in_array($name, $tag))
511 elseif (preg_match('!^(data-|[a-z0-9-]+:)!', $name, $m))
513 // Allow namespaces and data- (html5) attributes
514 if ($tag && in_array($m[1], $this->core_attributes
))
516 elseif (is_array($tag) && in_array($m[1], $tag))
// In check mode, an attribute name that is not followed by '=' is an error.
528 if ($value != '=' && $name && $this->check_only
)
529 throw new Garbage_Exception("Expecting '=' after $name on line ".$this->line
);
// In check mode, an '=' where a value was expected is an error as well.
540 if ($value == '=' && $this->check_only
)
541 throw new Garbage_Exception("Unexpected '=' after $name on line ".$this->line
);
// Remove the surrounding quotes from a quoted value.
543 if ($value[0] == '"' ||
$value[0] == "'")
544 $value = substr($value, 1, -1);
546 $value = $this->protectAttribute($name, $value);
548 $attributes[$name] = $value;
/**
 * Undoes the encodings commonly used to hide a URL from filters, so that
 * the result can be inspected with parse_url().
 *
 * The value is URL-decoded, HTML entities are decoded using the configured
 * encoding, and any numeric character reference that is still present
 * (hexadecimal or decimal, with or without the closing semicolon, with any
 * number of leading zeros) is replaced by the corresponding byte.
 *
 * @param string $value Raw attribute value
 * @return string Decoded value
 */
private function decodeObfuscated($value)
{
    // Don't try to trick me
    $value = rawurldecode($value);
    $value = html_entity_decode($value, ENT_QUOTES, $this->encoding);

    // unicode entities don't always have a semicolon ending the entity
    $value = preg_replace_callback('~&#x0*([0-9a-f]+);?~i', function ($match) {
        return chr(hexdec($match[1]));
    }, $value);

    $value = preg_replace_callback('~&#0*([0-9]+);?~', function ($match) {
        // The capture is a digit string; chr() expects an integer.
        return chr((int) $match[1]);
    }, $value);

    return $value;
}
// Sanitises an attribute value before it is stored on a node.
// For 'src' and 'href' the value is decoded with decodeObfuscated(), parsed
// with parse_url() and rebuilt piece by piece (scheme, host, path, query,
// fragment). The scheme is lower-cased and tested against
// allowed_url_schemes; what happens to a scheme that is not listed is not
// visible here.
574 private function protectAttribute($name, $value)
579 if ($name == 'src' ||
$name == 'href')
// NOTE(review): decodeObfuscated() is declared as an instance method but is
// called through self:: here.
581 $value = self
::decodeObfuscated($value);
583 // parse_url already have some tricks against XSS
584 $url = parse_url($value);
587 if (!empty($url['scheme']))
589 $url['scheme'] = strtolower($url['scheme']);
591 if (!array_key_exists($url['scheme'], $this->allowed_url_schemes
))
// The map's value for the scheme is appended right after the scheme itself.
594 $value .= $url['scheme'] . $this->allowed_url_schemes
[$url['scheme']];
597 if (!empty($url['host']))
599 $value .= $url['host'];
602 if (!empty($url['path']))
604 $value .= $url['path'];
607 if (!empty($url['query']))
609 // We can't use parse_str and build_http_string to sanitize url here
610 // Or else we'll get things like ?param1&param2 transformed in ?param1=&param2=
611 $query = explode('&', $url['query']);
// Each item is rewritten in place (by reference): it is split on '=', then
// both sides are decoded and re-encoded. NOTE(review): no unset($item) is
// visible after this by-reference loop - confirm.
613 foreach ($query as &$item)
615 $item = explode('=', $item);
// Form used for "name=value" pairs.
618 $item = rawurlencode(rawurldecode($item[0])) . '=' . rawurlencode(rawurldecode($item[1]));
// Form used for a bare "name" with no value.
620 $item = rawurlencode(rawurldecode($item[0]));
623 $value .= '?' . $this->escape(implode('&', $query));
626 if (!empty($url['fragment']))
628 $value .= '#' . $url['fragment'];
// NOTE(review): as written this replaces '&' with '&', which changes nothing;
// the search string has probably lost an HTML entity (likely '&amp;') -
// confirm against the upstream file.
633 $value = str_replace('&', '&', $value);
634 $value = $this->cleanEntities($value);
635 $value = $this->escape($value);
// Resolves the tag name at the start of $value against allowed_tags.
// The pattern accepts "name" or "prefix:name". A namespaced tag is looked up
// first under the key "prefix:", then the whole matched name is tried.
// What each branch yields is not visible here.
641 private function getTagName($value)
643 $value = trim($value);
645 if (preg_match('!^([a-zA-Z0-9-]+)(?:[:]([a-zA-Z0-9-]+))?!', $value, $match))
647 if (!empty($match[2]) && array_key_exists($match[1] . ':', $this->allowed_tags
))
649 elseif (array_key_exists($match[0], $this->allowed_tags
))
// Tests whether $tag is listed in autoclosing_tags, either by its full name
// or by its "prefix:" namespace key.
656 private function isTagAutoclosing($tag)
658 if (in_array($tag, $this->autoclosing_tags
))
661 if (preg_match('!^[a-zA-Z0-9-]+:!', $tag, $match) && in_array($match[0], $this->autoclosing_tags
))
// Consumes tokens from $this->matches and builds the list of sibling nodes
// for the current nesting level, calling itself for the children of each
// element it opens. In check mode it throws Garbage_Exception on
// un-authorized, invalid or wrongly closed tags instead of building nodes.
670 private function buildTree()
675 $in_forbidden_tag = false;
677 while (($value = array_shift($this->matches
)) !== null)
// Keep a running line count for error messages.
680 $this->line +
= (int) substr_count($value, "\n");
// Text tokens are escaped and kept, except in check mode or inside a
// forbidden tag whose content is being removed.
687 if ($value != "" && !$this->check_only
688 && !($in_forbidden_tag && $this->remove_forbidden_tags_content
))
690 $nodes[] = $this->escape($value);
695 // Next iteration is closing tag (probably ?)
// SPLIT_REGEXP captures the optional '/' that marks a closing tag.
698 $closing = ($value == '/');
705 $tag = $this->getTagName($value);
// Self-closed syntax (trailing '/') or a tag listed as autoclosing.
708 if (substr($value, -1, 1) == '/' ||
$this->isTagAutoclosing($tag))
710 $value = preg_replace('!\s*/$!', '', $value);
712 // Dismiss un-authorized tag
715 if ($this->check_only
)
716 throw new Garbage_Exception("Un-authorized tag <$value>");
// When forbidden tags are kept, they are emitted as escaped text.
718 if (!$this->remove_forbidden_tags
)
719 $nodes[] = '<'.$this->escape($value).' />';
721 $in_forbidden_tag = false;
726 if (!$this->check_only
)
730 'attrs' => $this->getTagAttributes($value, $tag),
731 'children'=> array(),
738 // Dismiss un-authorized tag
741 if (!$this->remove_forbidden_tags
)
742 $nodes[] = '</'.$this->escape($value).'>';
// Take the most recently opened tag off the stack to compare it with the
// tag being closed.
747 $open = array_pop($this->opened
);
749 // Uh-oh parse error !
750 // We could try to just dismiss tag errors or repair dirty HTML but
751 // it's too complicated. Just write valid xHTML.
754 if ($this->check_only
)
755 throw new Garbage_Exception("Tag <$value> closed, which is not open, on line ".$this->line
);
765 if ($this->check_only
)
766 throw new Garbage_Exception("Invalid tag <$value>");
768 if (!$this->remove_forbidden_tags
)
769 $nodes[] = '<'.$this->escape($value).'>';
// Remember that the following text sits inside a forbidden element.
771 $in_forbidden_tag = true;
776 if (!$this->check_only
)
780 'attrs' => $this->getTagAttributes($value, $tag),
781 'children'=> array(),
// Record the tag as open so the matching closing tag can be validated.
785 $this->opened
[] = $tag;
787 if ($this->check_only
)
// Children are everything up to the matching closing tag.
794 $node['children'] = $this->buildTree();
796 // You need to enclose text in paragraphs in some tags
797 // (Yes, read the XHTML spec)
798 $node['children'] = $this->encloseChildren($node['children'], $node['name']);
811 * Enclose sub elements which need to be enclosed
// $children: nodes to examine.
// $parent:   name of the enclosing element, or false at the top level.
// Only applies when $children is not empty and $parent is either listed in
// elements_need_enclose or falsy.
813 private function encloseChildren($children, $parent)
815 if (!empty($children) && (in_array($parent, $this->elements_need_enclose
) ||
!$parent))
820 while (($child = array_shift($children)) !== NULL)
// Text nodes and elements that are not block tags are gathered into a
// generated <p> node stored at index $open.
822 if (is_string($child) ||
!array_key_exists($child['name'], $this->block_tags
))
827 $n[$open] = array('name' => 'p', 'attrs' => array(), 'children' => array());
830 $n[$open]['children'][] = $child;
841 unset($n, $open, $child);
// HTML-escapes $str with htmlspecialchars(): ENT_QUOTES converts both quote
// styles, the configured encoding is used, and the final false disables
// double encoding so entities already present are left alone.
847 public function escape($str)
849 $out = htmlspecialchars($str, ENT_QUOTES
, $this->encoding
, false);
// htmlspecialchars() returns an empty string when the input is not valid in
// the given encoding; an empty result for non-empty input is an error.
851 if (empty($out) && !empty($str))
853 throw new Garbage_Exception("Encoding error.");
/**
 * Restores entities whose ampersand was escaped a second time:
 * "&amp;eacute;" and "&amp;#233;" become "&eacute;" and "&#233;" again.
 *
 * The pattern as extracted replaced every match with itself, which does
 * nothing; the "&amp;" prefix is restored here so the method has an effect.
 *
 * @param string $str Text possibly containing double-escaped entities
 * @return string Text with those entities restored
 */
private function cleanEntities($str)
{
    return preg_replace('/&amp;(#[0-9a-fx]+|[a-z]+);/i', '&\\1;', $str);
}