init
[garradin.git] / include / libs / garbage2xhtml / lib.garbage2xhtml.php
1 <?php
2 /*
3 Garbage2xhtml lib
4 Takes a html text and returns something semantic and maybe valid
5
6 Copyleft (C) 2006-11 BohwaZ - http://bohwaz.net/
7
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU Affero General Public License as
10 published by the Free Software Foundation, version 3 of the
11 License.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU Affero General Public License for more details.
17
18 You should have received a copy of the GNU Affero General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 */
21
/**
 * Exception type raised by garbage2xhtml: thrown when a checked string
 * is not valid or when a string can not be escaped in the configured encoding.
 */
class Garbage_Exception extends Exception {}
25
/**
 * Turns arbitrary HTML into a restricted, well-formed XHTML subset.
 *
 * The input is split on tags, built into a tree of nodes (escaped text
 * strings and array('name', 'attrs', 'children') elements), optionally
 * wrapped in paragraphs / line-broken, then serialized again.
 */
class garbage2xhtml
{
    /**
     * Secure attributes contents?
     * Will check for url scheme and url content in href and src
     * It's advised to disable <script> and <style> tags and style attribute
     * because they could be used for XSS attacks
     */
    public $secure = true;

    /**
     * Enclose text which is not in any element in <p> tags?
     */
    public $enclose_text = true;

    /**
     * Auto-add <br /> in text blocks?
     * Will also break <p> blocks when encountering a double line break.
     *
     * Example:
     *  <p>One Two
     *
     *  Three
     *  Four
     *  </p>
     *
     * Will render as:
     *  <p>One Two</p>
     *  <p>Three<br />
     *  Four</p>
     */
    public $auto_br = true;

    /**
     * Text encoding (used for escaping)
     */
    public $encoding = 'UTF-8';

    /**
     * Remove forbidden tags from output?
     * If true, "<em>" will disappear if it's not in allowed tags.
     * If false, "<em>" will become a text node with &lt;em&gt;
     */
    public $remove_forbidden_tags = false;

    /**
     * Remove forbidden tags contents?
     * If true "<b>Hello</b>" will become "" if <b> is not allowed
     * If false "<b>Hello</b>" will become "Hello"
     */
    public $remove_forbidden_tags_content = false;

    /**
     * Indent block tags in the output?
     * false disables indentation, true uses the default widths,
     * an integer is used as the number of spaces per level.
     */
    public $indent = true;

    /**
     * Core attributes allowed on each element
     */
    public $core_attributes = array('lang', 'class', 'id', 'title', 'dir');

    /**
     * Allowed block tags
     *
     * 'tag' => true,  // Allows core attributes
     * 'tag' => false, // Disallow core attributes
     * 'tag' => array('allowed attribute 1', 'href', 'src'),
     *                 // Allow core attributes and those specific attributes
     */
    public $block_tags = array(
        'ul'        => true,
        'ol'        => true,
        'li'        => true,
        'dl'        => true,

        'p'         => true,
        'div'       => true,

        'h1'        => true,
        'h2'        => true,
        'h3'        => true,
        'h4'        => true,
        'h5'        => true,
        'h6'        => true,

        'pre'       => true,
        'hr'        => true,
        'address'   => true,
        'blockquote'=> array('cite'),

        'object'    => array('type', 'width', 'height', 'data'),
        'iframe'    => array('src', 'width', 'height', 'frameborder', 'scrolling'),

        'table'     => array('summary'),
        'tbody'     => true,
        'thead'     => true,
        'tfoot'     => true,
        'caption'   => true,
        'colgroup'  => array('span'),
        'col'       => true,
        'tr'        => true,
        'th'        => array('colspan', 'rowspan', 'scope', 'headers'),
        'td'        => array('colspan', 'rowspan', 'headers'),

        // XHTML 5
        'article'   => true,
        'aside'     => true,
        'audio'     => array('src', 'controls', 'loop', 'preload'),
        'figure'    => true,
        'footer'    => true,
        'header'    => true,
        'hgroup'    => true,
        'section'   => true,
        'video'     => array('src', 'controls', 'width', 'height', 'poster'),

    );

    /**
     * Allowed inline elements
     */
    public $inline_tags = array(
        // 'tag' => array of allowed attributes
        'abbr'      => array('title'),
        'dfn'       => true,
        'acronym'   => array('title'),

        'cite'      => true,
        'q'         => array('cite'),

        'code'      => true,
        'kbd'       => true,
        'samp'      => true,

        'strong'    => true,
        'em'        => true,

        'small'     => true,

        'del'       => true,
        'ins'       => true,
        'sup'       => true,
        'sub'       => true,

        'dt'        => true,
        'dd'        => true,

        'span'      => true,
        'br'        => false,

        'a'         => array('href', 'hreflang', 'rel'),
        'img'       => array('src', 'alt', 'width', 'height'),

        'param'     => array('name', 'value', 'type'),

        // XHTML 5
        'mark'      => true,
        'var'       => true,
        'time'      => array('pubdate', 'datetime'),
        'figcaption'=> true,
    );

    /**
     * Allowed URL schemes for href/src, mapped to the separator
     * written between the scheme and the rest of the URL.
     */
    public $allowed_url_schemes = array(
        'http'  => '://',
        'https' => '://',
        'ftp'   => '://',
        'mailto'=> ':',
        'xmpp'  => ':',
        'news'  => ':',
        'nntp'  => '://',
        'tel'   => ':',
        'callto'=> ':',
        'ed2k'  => '://',
        'irc'   => '://',
        'magnet'=> ':',
        'mms'   => '://',
        'rtsp'  => '://',
        'sip'   => ':',
    );

    /**
     * Tags who need content to be enclosed
     */
    public $elements_need_enclose = array('blockquote', 'form', 'address', 'noscript');

    /**
     * Tags elements who accept <br /> inside
     */
    public $elements_allow_break = array('p', 'dd', 'dt', 'li', 'td', 'th', 'div');

    /**
     * Autoclosing tags (eg. <br />)
     */
    public $autoclosing_tags = array('br', 'hr', 'img', 'param');

    ///////// PRIVATE PROPERTIES

    // Stack of currently opened (allowed) tag names
    private $opened = array();
    // Remaining pieces of the split input: text, "/" marker, tag content, ...
    private $matches = array();
    // Number of line breaks consumed so far (used in error messages)
    private $line = 0;
    // When true the tree is only validated, no node is built
    private $check_only = false;

    // inline_tags + block_tags, rebuilt on each parse()
    private $allowed_tags = array();

    const SPLIT_REGEXP = '!<(/?)([^><]*)>!';
    const ATTRIBUTE_REGEXP = '/(?:(?:"[^"\\\\]*(?:\\\\.[^"\\\\]*)*"|\'[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*\') | (?>[^"\'=\s]+))+|[=]/x';

    /**
     * Parses a string into a tree of nodes.
     *
     * Comments and CDATA sections are removed, line endings are normalized
     * to "\n" and every <br> variant is normalized to "<br />" beforehand.
     *
     * @param  string $string Source HTML
     * @return array  List of nodes (escaped text strings or element arrays)
     */
    public function parse($string)
    {
        $string = preg_replace('#<!--.*-->#Us', '', $string);
        $string = preg_replace('#<!\[CDATA\[.*\]\]>#Us', '', $string);

        $string = str_replace(array("\r\n", "\r"), "\n", $string);
        $string = preg_replace('!<br\s*/?>!i', '<br />', $string);
        $string = trim($string);

        $this->resetInternals();
        $this->allowed_tags = array_merge($this->inline_tags, $this->block_tags);

        // -1 means "no limit"; passing null is deprecated since PHP 8.1
        $this->matches = preg_split(self::SPLIT_REGEXP, $string, -1, PREG_SPLIT_DELIM_CAPTURE);
        unset($string);

        $nodes = $this->buildTree();
        $this->resetInternals();

        return $nodes;
    }

    /**
     * Checks a string validity
     *
     * @param  string $string Source HTML
     * @return bool   Always true when no error was found
     * @throws Garbage_Exception On forbidden tags, mismatched closing tags
     *                           or malformed attributes
     */
    public function check($string)
    {
        $this->check_only = true;

        try
        {
            $this->parse($string);
        }
        catch (Exception $e)
        {
            // Don't leave the object stuck in check-only mode after a failure
            $this->check_only = false;
            throw $e;
        }

        $this->check_only = false;
        return true;
    }

    /**
     * Processes a string: parse, enclose orphan text, add line breaks
     * (both depending on configuration) and serialize back to a string.
     *
     * @param  string $string Source HTML
     * @return string Cleaned up XHTML
     */
    public function process($string)
    {
        $nodes = $this->parse($string);
        unset($string);

        if ($this->enclose_text)
        {
            $nodes = $this->encloseChildren($nodes, false);
        }

        if ($this->auto_br)
        {
            $nodes = $this->autoLineBreak($nodes);
        }

        return $this->outputNodes($nodes);
    }

    /**
     * Outputs a string from a nodes array
     *
     * @param  array $nodes Nodes as returned by parse()
     * @param  int   $level Current nesting level (used for indentation)
     * @return string
     */
    public function outputNodes($nodes, $level = 0)
    {
        $out = '';

        foreach ($nodes as $node)
        {
            // Text nodes were escaped when the tree was built
            if (!is_array($node))
            {
                $out .= $node;
                continue;
            }

            $name = $node['name'];
            $open = '<' . $name;

            foreach ($node['attrs'] as $key => $value)
            {
                $open .= ' ' . $key . '="' . $value . '"';
            }

            $close = '';

            if ($this->isTagAutoclosing($name))
            {
                $open .= ' />';
            }
            else
            {
                $open .= '>';
                $close = '</' . $name . '>';
            }

            $content = '';

            if (!empty($node['children']))
            {
                $content = $this->outputNodes($node['children'], $level + 1);
            }

            // Only non-autoclosing block tags are indented, and never <pre>
            $indented = $close !== ''
                && $this->indent !== false
                && array_key_exists($name, $this->block_tags)
                && $name != 'pre';

            if ($indented)
            {
                $width = $level * ($this->indent === true ? 1 : (int) $this->indent);
                $out .= $this->indentTag($open, $content, $close, $width);
            }
            else
            {
                $out .= $open . $content . $close;

                if ($name == 'br')
                    $out .= "\n";
            }
        }

        return $out;
    }

    /**
     * Renders a block tag on its own lines, with its content lines indented.
     * Content lines that are empty or only whitespace are dropped.
     *
     * @param  string $open    Opening tag
     * @param  string $content Already rendered children
     * @param  string $close   Closing tag
     * @param  int    $indent  Number of spaces before the tag itself
     * @return string
     */
    private function indentTag($open, $content, $close, $indent)
    {
        $outer = str_repeat(' ', $indent);
        $inner = str_repeat(' ', $indent + ($this->indent === true ? 2 : (int) $this->indent));

        $out = "\n" . $outer . $open . "\n";

        foreach (explode("\n", $content) as $line)
        {
            // Compare against '' explicitly: a line containing only "0" is falsy
            // but is real content and must be kept
            if (trim($line) === '')
                continue;

            $out .= $inner . $line . "\n";
        }

        $out .= $outer . $close . "\n";

        return $out;
    }

    /**
     * Resets the parser state (opened tags stack, pending pieces, line count)
     */
    private function resetInternals()
    {
        $this->opened = array();
        $this->matches = array();
        $this->line = 0;
    }

    /**
     * Tells if any of the given pieces is a non-empty string
     *
     * @param  array $parts Remaining pieces of a text node split on line breaks
     * @return bool
     */
    private function hasRemainingText(array $parts)
    {
        foreach ($parts as $part)
        {
            if ($part !== '')
                return true;
        }

        return false;
    }

    /**
     * Break line and paragraphs following this rule :
     * in a paragraph : one line break = <br />,
     *  two line breaks = closing paragraph and opening a new one
     * in other elements allowing breaks : one or more line breaks = <br />
     *
     * No <br /> is added at the very beginning or at the very end of an element.
     *
     * @param  array       $nodes  Nodes to process
     * @param  string|bool $parent Name of the parent element, false at root
     * @return array
     */
    private function autoLineBreak($nodes, $parent = false)
    {
        $n = array();
        // Number of sibling nodes located after the current one
        $following = count($nodes);
        $br = array('name' => 'br', 'attrs' => array(), 'children' => array());

        foreach ($nodes as $node)
        {
            $following--;

            // Text node inside an element allowing for line breaks
            if (is_string($node) && in_array($parent, $this->elements_allow_break, true))
            {
                // Pieces alternate: text, line breaks, text, ...
                $parts = preg_split('!(\n+)!', $node, -1, PREG_SPLIT_DELIM_CAPTURE);
                $is_break = false;

                while (($part = array_shift($parts)) !== null)
                {
                    if ($is_break)
                    {
                        // Only break when something precedes AND something follows
                        if (!empty($n) && ($following > 0 || $this->hasRemainingText($parts)))
                        {
                            $n[] = $br;
                        }
                    }
                    elseif ($part !== '')
                    {
                        $n[] = $part;
                    }

                    $is_break = !$is_break;
                }
            }
            // In paragraphs we'll try to split them each two-line breaks
            elseif (is_array($node) && $node['name'] == 'p')
            {
                $n[] = array('name' => 'p', 'attrs' => $node['attrs'], 'children' => array());
                $current = count($n) - 1;

                // Because we need to work on parent-level we will loop on children here
                // (so we don't do recursive calls)
                while (($child = array_shift($node['children'])) !== null)
                {
                    if (!is_string($child))
                    {
                        $n[$current]['children'][] = $child;
                        continue;
                    }

                    $parts = preg_split('!(\n+)!', $child, -1, PREG_SPLIT_DELIM_CAPTURE);
                    $is_break = false;

                    while (($part = array_shift($parts)) !== null)
                    {
                        if ($is_break)
                        {
                            // Two line breaks or more: start a new paragraph with the same attributes
                            if (strlen($part) >= 2)
                            {
                                $n[] = array('name' => 'p', 'attrs' => $node['attrs'], 'children' => array());
                                $current = count($n) - 1;
                            }
                            // Simple line break, but not at the start or the end of the paragraph
                            elseif (!empty($n[$current]['children'])
                                && (!empty($node['children']) || $this->hasRemainingText($parts)))
                            {
                                $n[$current]['children'][] = $br;
                            }
                        }
                        elseif ($part !== '')
                        {
                            $n[$current]['children'][] = $part;
                        }

                        $is_break = !$is_break;
                    }
                }
            }
            else
            {
                if (!is_string($node) && !empty($node['children']))
                {
                    $node['children'] = $this->autoLineBreak($node['children'], $node['name']);
                }

                $n[] = $node;
            }
        }

        unset($nodes);
        return $n;
    }

    /**
     * Extracts the allowed attributes from the content of a tag
     *
     * @param  string $value Tag content (name followed by attributes)
     * @param  string $tag   Tag name, as returned by getTagName()
     * @return array  name => protected value
     * @throws Garbage_Exception In check mode, when '=' is missing or misplaced
     */
    private function getTagAttributes($value, $tag)
    {
        $attributes = array();

        // true, false, or list of attributes allowed for this tag.
        // getTagName() only returns names matching one of the two lookups below.
        $allowed = false;

        if (array_key_exists($tag, $this->allowed_tags))
        {
            $allowed = $this->allowed_tags[$tag];
        }
        elseif (preg_match('!^[a-zA-Z0-9-]+:!', $tag, $match) && array_key_exists($match[0], $this->allowed_tags))
        {
            // Namespaced tag: rules are stored under the "prefix:" key
            $allowed = $this->allowed_tags[$match[0]];
        }

        // Remove the tag name, keep only the attributes
        $value = preg_replace('!^.*\s+!U', '', $value);

        if (!preg_match_all(self::ATTRIBUTE_REGEXP, $value, $match))
        {
            return $attributes;
        }

        // Tokens are consumed three at a time: name (0), '=' (1), value (2)
        $state = 0;
        $name = false;

        foreach ($match[0] as $token)
        {
            if ($state == 0)
            {
                $name = strtolower((string) $token);
                $state = 1;
                $pass = false;

                // Allowed attribute ?
                if ($allowed && in_array($name, $this->core_attributes, true))
                    $pass = true;
                elseif (is_array($allowed) && in_array($name, $allowed, true))
                    $pass = true;
                elseif (preg_match('!^(data-|[a-z0-9-]+:)!', $name, $m))
                {
                    // Allow namespaces and data- (html5) attributes
                    if ($allowed && in_array($m[1], $this->core_attributes, true))
                        $pass = true;
                    elseif (is_array($allowed) && in_array($m[1], $allowed, true))
                        $pass = true;
                }

                if (!$pass)
                {
                    // Forbidden attribute: its '=' and value tokens are still consumed
                    $name = false;
                }
            }
            elseif ($state == 1)
            {
                if ($token != '=' && $name && $this->check_only)
                    throw new Garbage_Exception("Expecting '=' after $name on line ".$this->line);

                $state = 2;
            }
            else
            {
                $state = 0;

                if (!$name)
                    continue;

                if ($token == '=' && $this->check_only)
                    throw new Garbage_Exception("Unexpected '=' after $name on line ".$this->line);

                // Strip enclosing quotes
                if ($token[0] == '"' || $token[0] == "'")
                    $token = substr($token, 1, -1);

                $attributes[$name] = $this->protectAttribute($name, $token);
            }
        }

        return $attributes;
    }

    /**
     * Decodes URL-encoding and HTML entities (including numeric entities
     * missing their final semicolon) so that the real value can be inspected.
     *
     * @param  string $value
     * @return string
     */
    private function decodeObfuscated($value)
    {
        // Don't try to trick me
        $value = rawurldecode($value);
        $value = html_entity_decode($value, ENT_QUOTES, $this->encoding);

        // unicode entities don't always have a semicolon ending the entity
        $value = preg_replace_callback('~&#x0*([0-9a-f]+);?~i', function ($match) {
            return chr(hexdec($match[1]));
        }, $value);

        $value = preg_replace_callback('~&#0*([0-9]+);?~', function ($match) {
            return chr((int) $match[1]);
        }, $value);

        return $value;
    }

    /**
     * Makes an attribute value safe to be written between double quotes.
     *
     * href and src are decoded, parsed and rebuilt from their components:
     * URLs with a scheme not listed in $allowed_url_schemes give an empty
     * value, user/password are not kept. Other attributes are escaped.
     * When $secure is false the value is returned untouched.
     *
     * @param  string $name  Attribute name (lowercase)
     * @param  string $value Raw attribute value, without quotes
     * @return string
     */
    private function protectAttribute($name, $value)
    {
        if (!$this->secure)
            return $value;

        if ($name != 'src' && $name != 'href')
        {
            $value = str_replace('&amp;', '&', $value);
            $value = $this->cleanEntities($value);
            return $this->escape($value);
        }

        // Trim surrounding whitespace so that a padded value (" javascript:...")
        // is inspected in its bare form
        $value = trim($this->decodeObfuscated($value));

        // parse_url already have some tricks against XSS
        $url = parse_url($value);
        $value = '';

        if (!is_array($url))
            return '';

        if (!empty($url['scheme']))
        {
            $scheme = strtolower($url['scheme']);

            if (!array_key_exists($scheme, $this->allowed_url_schemes))
                return '';

            $value .= $scheme . $this->allowed_url_schemes[$scheme];
        }

        // Every component is decoded at this point and may contain quotes:
        // each one has to be escaped or it could break out of the attribute
        if (!empty($url['host']))
        {
            $value .= $this->escape($url['host']);

            if (isset($url['port']))
            {
                $value .= ':' . (int) $url['port'];
            }
        }

        if (!empty($url['path']))
        {
            $value .= $this->escape($url['path']);
        }

        if (!empty($url['query']))
        {
            // We can't use parse_str and build_http_string to sanitize url here
            // Or else we'll get things like ?param1&param2 transformed in ?param1=&param2=
            $query = array();

            foreach (explode('&', $url['query']) as $item)
            {
                // Limit to 2 parts so a value containing '=' is not truncated
                $pair = explode('=', $item, 2);
                $encoded = rawurlencode(rawurldecode($pair[0]));

                if (isset($pair[1]))
                    $encoded .= '=' . rawurlencode(rawurldecode($pair[1]));

                $query[] = $encoded;
            }

            $value .= '?' . $this->escape(implode('&', $query));
        }

        if (!empty($url['fragment']))
        {
            $value .= '#' . $this->escape($url['fragment']);
        }

        return $value;
    }

    /**
     * Returns the name of the tag if it is allowed
     *
     * @param  string $value Tag content (name followed by attributes)
     * @return string|bool Tag name ("name" or "prefix:name"), false if forbidden
     */
    private function getTagName($value)
    {
        $value = trim($value);

        if (!preg_match('!^([a-zA-Z0-9-]+)(?:[:]([a-zA-Z0-9-]+))?!', $value, $match))
            return false;

        // Whole namespace allowed with a "prefix:" key
        if (!empty($match[2]) && array_key_exists($match[1] . ':', $this->allowed_tags))
            return $match[0];

        if (array_key_exists($match[0], $this->allowed_tags))
            return $match[0];

        return false;
    }

    /**
     * Tells if a tag is autoclosing, either by its name or by its "prefix:"
     *
     * @param  string|bool $tag Tag name, or false for a forbidden tag
     * @return bool
     */
    private function isTagAutoclosing($tag)
    {
        if ($tag === false)
            return false;

        if (in_array($tag, $this->autoclosing_tags, true))
            return true;

        return preg_match('!^[a-zA-Z0-9-]+:!', $tag, $match)
            && in_array($match[0], $this->autoclosing_tags, true);
    }

    /**
     * Build HTML tree
     *
     * Consumes $this->matches, whose pieces cycle between text, closing
     * marker ("/" or "") and tag content. Each allowed opening tag triggers
     * a recursive call which returns when its closing tag is met.
     *
     * @return array Nodes of the current level
     * @throws Garbage_Exception In check mode, on forbidden or mismatched tags
     */
    private function buildTree()
    {
        $i = 0;
        $nodes = array();
        $closing = false;
        // Number of forbidden tags currently opened at this level
        $forbidden_depth = 0;

        while (($value = array_shift($this->matches)) !== null)
        {
            // Line count
            $this->line += substr_count($value, "\n");

            $kind = $i++ % 3;

            // Text node
            if ($kind === 0)
            {
                $dropped = $forbidden_depth > 0 && $this->remove_forbidden_tags_content;

                if ($value !== '' && !$this->check_only && !$dropped)
                {
                    $nodes[] = $this->escape($value);
                }

                continue;
            }

            // Closing marker: tells if the next piece is a closing tag
            if ($kind === 1)
            {
                $closing = ($value === '/');
                continue;
            }

            // Tag itself
            $tag = $this->getTagName($value);

            // Self-closing tag
            if (substr($value, -1) === '/' || $this->isTagAutoclosing($tag))
            {
                $value = preg_replace('!\s*/$!', '', $value);

                // Dismiss un-authorized tag
                if (!$tag)
                {
                    if ($this->check_only)
                        throw new Garbage_Exception("Un-authorized tag <$value>");

                    if (!$this->remove_forbidden_tags)
                        $nodes[] = '&lt;'.$this->escape($value).' /&gt;';

                    continue;
                }

                if (!$this->check_only)
                {
                    $nodes[] = array(
                        'name'      => $tag,
                        'attrs'     => $this->getTagAttributes($value, $tag),
                        'children'  => array(),
                    );
                }

                continue;
            }

            // Closing tag
            if ($closing)
            {
                // Dismiss un-authorized tag
                if (!$tag)
                {
                    if (!$this->remove_forbidden_tags)
                        $nodes[] = '&lt;/'.$this->escape($value).'&gt;';

                    // We are leaving a forbidden tag: following text is kept again
                    if ($forbidden_depth > 0)
                        $forbidden_depth--;

                    continue;
                }

                // Closing tag with nothing opened: ignore it, returning here
                // would throw away the rest of the document
                if (empty($this->opened))
                {
                    if ($this->check_only)
                        throw new Garbage_Exception("Tag <$value> closed, which is not open, on line ".$this->line);

                    continue;
                }

                $open = array_pop($this->opened);

                // Uh-oh parse error !
                // We could try to just dismiss tag errors or repair dirty HTML but
                // it's too complicated. Just write valid xHTML.
                if ($tag !== $open && $this->check_only)
                {
                    throw new Garbage_Exception("Tag <$value> closed, which is not open, on line ".$this->line);
                }

                return $nodes;
            }

            // Opening tag
            if (!$tag)
            {
                if ($this->check_only)
                    throw new Garbage_Exception("Invalid tag <$value>");

                if (!$this->remove_forbidden_tags)
                    $nodes[] = '&lt;'.$this->escape($value).'&gt;';

                $forbidden_depth++;

                continue;
            }

            $this->opened[] = $tag;

            if ($this->check_only)
            {
                $this->buildTree();
                continue;
            }

            $node = array(
                'name'      => $tag,
                'attrs'     => $this->getTagAttributes($value, $tag),
                'children'  => array(),
            );

            // Build child tree
            $node['children'] = $this->buildTree();

            // You need to enclose text in paragraphs in some tags
            // (Yes, read the XHTML spec)
            $node['children'] = $this->encloseChildren($node['children'], $node['name']);

            $nodes[] = $node;
        }

        return $nodes;
    }

    /**
     * Enclose sub elements which need to be enclosed
     *
     * At root level (no parent) or inside one of $elements_need_enclose,
     * consecutive text nodes and non-block elements are grouped in a new <p>.
     *
     * @param  array       $children Nodes to process
     * @param  string|bool $parent   Name of the parent element, false at root
     * @return array
     */
    private function encloseChildren($children, $parent)
    {
        if (empty($children))
            return $children;

        if ($parent && !in_array($parent, $this->elements_need_enclose, true))
            return $children;

        $n = array();
        // Index of the paragraph currently being filled, false if none
        $open = false;

        foreach ($children as $child)
        {
            if (!is_string($child) && array_key_exists($child['name'], $this->block_tags))
            {
                // Block element: it stays as is and ends the current paragraph
                $open = false;
                $n[] = $child;
                continue;
            }

            if ($open === false)
            {
                $open = count($n);
                $n[$open] = array('name' => 'p', 'attrs' => array(), 'children' => array());
            }

            $n[$open]['children'][] = $child;
        }

        return $n;
    }

    /**
     * Escapes HTML special characters, without double-encoding entities
     *
     * @param  string $str
     * @return string
     * @throws Garbage_Exception When a non-empty string can not be escaped
     *                           (invalid for the configured encoding)
     */
    public function escape($str)
    {
        $out = htmlspecialchars($str, ENT_QUOTES, $this->encoding, false);

        if ($out === '' && (string) $str !== '')
        {
            throw new Garbage_Exception("Encoding error.");
        }

        return $out;
    }

    /**
     * Clean entities: restores entities whose ampersand was escaped
     * (&amp;eacute; becomes &eacute;)
     *
     * @param  string $str
     * @return string
     */
    private function cleanEntities($str)
    {
        return preg_replace('/&amp;(#[0-9a-fx]+|[a-z]+);/i', '&\\1;', $str);
    }
}
867
868 ?>