[SPIP][PLUGINS] v3.0-->v3.2
[lhc/web/www.git] / www / plugins-dist / safehtml / lib / safehtml / classes / safehtml.php
1 <?php
2
3 /**
4 * SafeHTML Parser
5 *
6 * @note
7 * Attention : Quelques modifications pour PHP 5.5 et 7
8 *
9 * @package SafeHTML
10 * @author Roman Ivanov <thingol@mail.ru>
11 * @copyright 2004-2005 Roman Ivanov
12 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
13 * @version 1.3.7
14 * @link http://pixel-apes.com/safehtml/
15 */
16
17
18 if (!defined('_ECRIRE_INC_VERSION')) return;
19
20 require_once(XML_HTMLSAX3 . 'HTMLSax3.php');
21
/**
 * SafeHTML Parser
 *
 * This parser strips down all potentially dangerous content within HTML:
 * <ul>
 * <li>opening tag without its closing tag</li>
 * <li>closing tag without its opening tag</li>
 * <li>any of these tags: "base", "basefont", "head", "html", "body", "applet",
 * "object", "iframe", "frame", "frameset", "script", "layer", "ilayer", "embed",
 * "bgsound", "link", "meta", "style", "title", "blink", "xml" etc.</li>
 * <li>any of these attributes: on*, data*, dynsrc</li>
 * <li>javascript:/vbscript:/about: etc. protocols</li>
 * <li>expression/behavior etc. in styles</li>
 * <li>any other active content</li>
 * </ul>
 * It also tries to convert code to XHTML valid, but htmltidy is far better
 * solution for this task.
 *
 * <b>Example:</b>
 * <pre>
 * $parser = new SafeHTML();
 * $result = $parser->parse($doc);
 * </pre>
 *
 * @category HTML
 * @package SafeHTML
 * @author Roman Ivanov <thingol@mail.ru>
 * @copyright 1997-2005 Roman Ivanov
 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
 * @version Release: @package_version@
 * @link http://pear.php.net/package/SafeHTML
 */
class SafeHTML
{
    /**
     * Storage for resulting HTML output
     *
     * @var string
     * @access private
     */
    public $_xhtml = '';

    /**
     * Array of counters for each tag (number of currently open instances)
     *
     * @var array
     * @access private
     */
    public $_counter = array();

    /**
     * Stack of unclosed tags
     *
     * @var array
     * @access private
     */
    public $_stack = array();

    /**
     * Array of counters for tags that must be deleted with all content
     *
     * @var array
     * @access private
     */
    public $_dcCounter = array();

    /**
     * Stack of unclosed tags that must be deleted with all content
     *
     * @var array
     * @access private
     */
    public $_dcStack = array();

    /**
     * Stores level of list (ol/ul) nesting
     *
     * @var int
     * @access private
     */
    public $_listScope = 0;

    /**
     * Stack of unclosed list tags (each entry is the list nesting level of an open <li>)
     *
     * @var array
     * @access private
     */
    public $_liStack = array();

    /**
     * Array of prepared regular expressions for protocols (schemas) matching
     *
     * @var array
     * @access private
     */
    public $_protoRegexps = array();

    /**
     * Array of prepared regular expressions for CSS matching
     *
     * @var array
     * @access private
     */
    public $_cssRegexps = array();

    /**
     * Should we perform UTF7 repacking or not?
     *
     * This repacking might replace completely normal strings such as "+31-" by illegal sequences,
     * which cause the document to be truncated on saving to MySQL
     *
     * @var boolean
     * @access public
     */
    public $repackUTF7 = true;

    /**
     * List of single tags ("<tag />")
     *
     * @var array
     * @access public
     */
    public $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', );

    /**
     * List of dangerous tags (such tags will be deleted)
     *
     * @var array
     * @access public
     */
    public $deleteTags = array(
        'applet', 'base', 'basefont', 'bgsound', 'blink', 'body',
        'embed', 'frame', 'frameset', 'head', 'html', 'ilayer',
        'iframe', 'layer', 'link', 'meta', 'object', 'style',
        'title', 'script',
    );

    /**
     * List of dangerous tags (such tags will be deleted, and all content
     * inside this tags will be also removed)
     *
     * @var array
     * @access public
     */
    public $deleteTagsContent = array('script', 'style', 'title', 'xml', );

    /**
     * Type of protocols filtering ('white' or 'black')
     *
     * @var string
     * @access public
     */
    public $protocolFiltering = 'white';

    /**
     * List of "dangerous" protocols (used for blacklist-filtering)
     *
     * @var array
     * @access public
     */
    public $blackProtocols = array(
        'about', 'chrome', 'data', 'disk', 'hcp',
        'help', 'javascript', 'livescript', 'lynxcgi', 'lynxexec',
        'ms-help', 'ms-its', 'mhtml', 'mocha', 'opera',
        'res', 'resource', 'shell', 'vbscript', 'view-source',
        'vnd.ms.radio', 'wysiwyg',
    );

    /**
     * List of "safe" protocols (used for whitelist-filtering)
     *
     * @var array
     * @access public
     */
    public $whiteProtocols = array(
        'ed2k', 'file', 'ftp', 'gopher', 'http', 'https',
        'irc', 'mailto', 'news', 'nntp', 'telnet', 'webcal',
        'xmpp', 'callto',
    );

    /**
     * List of attributes that can contain protocols
     *
     * @var array
     * @access public
     */
    public $protocolAttributes = array(
        'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src', 'formaction',
    );

    /**
     * List of dangerous CSS keywords
     *
     * Whole style="" attribute will be removed, if parser will find one of
     * these keywords
     *
     * @var array
     * @access public
     */
    public $cssKeywords = array(
        'absolute', 'behavior', 'behaviour', 'content', 'expression',
        'fixed', 'include-source', 'moz-binding',
    );

    /**
     * List of tags that can have no "closing tag"
     *
     * @var array
     * @access public
     * @deprecated XHTML does not allow such tags
     */
    public $noClose = array();

    /**
     * List of block-level tags that terminates paragraph
     *
     * Paragraph will be closed when this tags opened
     *
     * @var array
     * @access public
     */
    public $closeParagraph = array(
        'address', 'blockquote', 'center', 'dd', 'dir', 'div',
        'dl', 'dt', 'h1', 'h2', 'h3', 'h4',
        'h5', 'h6', 'hr', 'isindex', 'listing', 'marquee',
        'menu', 'multicol', 'ol', 'p', 'plaintext', 'pre',
        'table', 'ul', 'xmp',
    );

    /**
     * List of table tags, all table tags outside a table will be removed
     *
     * @var array
     * @access public
     */
    public $tableTags = array(
        'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
        'thead', 'tr',
    );

    /**
     * List of list tags
     *
     * @var array
     * @access public
     */
    public $listTags = array('dir', 'menu', 'ol', 'ul', 'dl', );

    /**
     * List of dangerous attributes (removed on every tag except <a>)
     *
     * @var array
     * @access public
     */
    public $attributes = array('dynsrc', 'id', 'name', );

    /**
     * List of allowed "namespaced" attributes
     *
     * @var array
     * @access public
     */
    public $attributesNS = array('xml:lang', );

    /**
     * Constructs class
     *
     * Prepares the protocol and CSS regular expressions from the
     * $blackProtocols and $cssKeywords lists.
     *
     * @access public
     */
    public function __construct()
    {
        $this->_buildRegexps();
    }

    /**
     * Legacy, misspelled initializer kept for backward compatibility.
     *
     * Because of the typo PHP never invoked this method as a constructor, so
     * the filtering regular expressions were never prepared. It is now a
     * plain alias that (re)builds them; calling it repeatedly is harmless.
     *
     * @return boolean always true
     * @access public
     * @deprecated use the real constructor
     */
    public function __contruct()
    {
        $this->_buildRegexps();
        return true;
    }

    /**
     * Builds $_protoRegexps and $_cssRegexps from the configured lists.
     *
     * Each protocol pattern tolerates whitespace / control characters
     * between every letter of the protocol name and requires a trailing ':'.
     * Both arrays are reset first so the method is idempotent.
     *
     * @return void
     * @access private
     */
    protected function _buildRegexps()
    {
        // Optional run of whitespace or control bytes allowed between letters.
        $gap = "[\s\x01-\x1F]*";

        $this->_protoRegexps = array();
        foreach ($this->blackProtocols as $proto) {
            $pattern = '/' . $gap;
            foreach (str_split($proto) as $char) {
                // preg_quote keeps characters such as '.' literal.
                $pattern .= preg_quote($char, '/') . $gap;
            }
            $this->_protoRegexps[] = $pattern . ':/i';
        }

        $this->_cssRegexps = array();
        foreach ($this->cssKeywords as $css) {
            $this->_cssRegexps[] = '/' . preg_quote($css, '/') . '/i';
        }
    }

    /**
     * Tells whether at least one of the given patterns matches the subject.
     *
     * @param array  $patterns list of PCRE patterns
     * @param string $subject  string to test
     * @return boolean
     * @access private
     */
    protected function _matchesAny($patterns, $subject)
    {
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $subject)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Handles the writing of attributes - called from $this->_openHandler()
     *
     * Attributes that are considered unsafe are silently skipped; the others
     * are appended to the output as name="value" with double quotes escaped.
     *
     * @param array       $attrs array of attributes $name => $value
     * @param string|null $tag   lower-cased name of the tag being written
     * @return boolean always true
     * @access private
     */
    public function _writeAttrs($attrs, $tag = null)
    {
        if (!is_array($attrs)) {
            return true;
        }

        foreach ($attrs as $name => $value) {
            $name = strtolower($name);

            // Event handlers (on*) and data* attributes are always dropped.
            if (strpos($name, 'on') === 0 || strpos($name, 'data') === 0) {
                continue;
            }
            // id / name / dynsrc are only tolerated on anchors.
            if ($tag !== 'a' && in_array($name, $this->attributes, true)) {
                continue;
            }
            // Non-alphanumeric names must be explicitly allowed (e.g. xml:lang).
            if (!preg_match("/^[a-z0-9]+$/i", $name)
                && !in_array($name, $this->attributesNS, true)
            ) {
                continue;
            }

            // Valueless attribute: use the XHTML form name="name".
            if ($value === true || is_null($value)) {
                $value = $name;
            }

            if ($name === 'style') {
                // removes insignificant backslashes
                $value = str_replace("\\", '', $value);

                // removes CSS comments; repeated because stripping one
                // comment can make a new one appear
                do {
                    $previous = $value;
                    $value = preg_replace("!/\*.*?\*/!s", '', $previous);
                } while ($value !== null && $value !== $previous);
                if ($value === null) {
                    // PCRE failure: the value could not be inspected, drop it.
                    continue;
                }

                // replace all & to &amp; (without double-encoding existing ones)
                $value = str_replace('&amp;', '&', $value);
                $value = str_replace('&', '&amp;', $value);

                if ($this->_matchesAny($this->_cssRegexps, $value)
                    || $this->_matchesAny($this->_protoRegexps, $value)
                ) {
                    continue;
                }
            }

            // Decode numeric character references so that an obfuscated
            // protocol (e.g. "&#106;avascript:") is seen by the checks below.
            $tempval = preg_replace_callback(
                '/&#(\d+);?/m',
                function ($m) {
                    return chr((int) $m[1]);
                },
                $value
            );
            $tempval = preg_replace_callback(
                '/&#x([0-9a-f]+);?/mi',
                function ($m) {
                    return chr((int) hexdec($m[1]));
                },
                $tempval
            );

            if (in_array($name, $this->protocolAttributes, true)
                && strpos($tempval, ':') !== false
            ) {
                if ($this->protocolFiltering == 'black') {
                    if ($this->_matchesAny($this->_protoRegexps, $tempval)) {
                        continue;
                    }
                } else {
                    // Whitelist mode: the text before the first ':' must be
                    // exactly one of the allowed protocols.
                    $parts = explode(':', $tempval);
                    if (!in_array($parts[0], $this->whiteProtocols, true)) {
                        continue;
                    }
                }
            }

            $value = str_replace("\"", "&quot;", $value);
            $this->_xhtml .= ' ' . $name . '="' . $value . '"';
        }

        return true;
    }

    /**
     * Opening tag handler - called from HTMLSax
     *
     * @param object $parser HTML Parser
     * @param string $name   tag name
     * @param array  $attrs  tag attributes
     * @return boolean always true
     * @access private
     */
    public function _openHandler(&$parser, $name, $attrs)
    {
        $name = strtolower($name);

        // Tags removed together with their content: remember we are inside one.
        if (in_array($name, $this->deleteTagsContent, true)) {
            array_push($this->_dcStack, $name);
            $this->_dcCounter[$name] = isset($this->_dcCounter[$name]) ? $this->_dcCounter[$name] + 1 : 1;
        }
        if (count($this->_dcStack) != 0) {
            return true;
        }

        if (in_array($name, $this->deleteTags, true)) {
            return true;
        }

        if (!preg_match("/^[a-z0-9]+$/i", $name)) {
            // Looks like an e-mail address or URL written between <>:
            // keep it as escaped text, drop anything else.
            if (preg_match("!(?:\@|://)!i", $name)) {
                $this->_xhtml .= '&lt;' . $name . '&gt;';
            }
            return true;
        }

        if (in_array($name, $this->singleTags, true)) {
            $this->_xhtml .= '<' . $name;
            $this->_writeAttrs($attrs, $name);
            $this->_xhtml .= ' />';
            return true;
        }

        // TABLES: cannot open table elements when we are not inside table
        // NOTE(review): this only triggers once a <table> has been seen and
        // closed; before any <table> the counter is unset and table tags
        // pass through. Behaviour kept as is - confirm whether intended.
        if ((isset($this->_counter['table'])) && ($this->_counter['table'] <= 0)
            && (in_array($name, $this->tableTags, true))
        ) {
            return true;
        }

        // PARAGRAPHS: close paragraph when closeParagraph tags opening
        if (in_array($name, $this->closeParagraph, true) && in_array('p', $this->_stack, true)) {
            $this->_closeHandler($parser, 'p');
        }

        // LISTS: we should close <li> if <li> of the same level opening
        if ($name === 'li' && count($this->_liStack)
            && $this->_listScope == $this->_liStack[count($this->_liStack) - 1]
        ) {
            $this->_closeHandler($parser, 'li');
        }

        // LISTS: we want to know on what nesting level of lists we are
        if (in_array($name, $this->listTags, true)) {
            $this->_listScope++;
        }
        if ($name === 'li') {
            array_push($this->_liStack, $this->_listScope);
        }

        $this->_xhtml .= '<' . $name;
        $this->_writeAttrs($attrs, $name);
        $this->_xhtml .= '>';
        array_push($this->_stack, $name);
        $this->_counter[$name] = isset($this->_counter[$name]) ? $this->_counter[$name] + 1 : 1;
        return true;
    }

    /**
     * Closing tag handler - called from HTMLSax
     *
     * Closes every tag opened after the matching opening tag, then the tag
     * itself. Closing tags without a matching opening tag are ignored.
     *
     * @param object $parser HTML parser
     * @param string $name   tag name
     * @return boolean always true
     * @access private
     */
    public function _closeHandler(&$parser, $name)
    {
        $name = strtolower($name);

        if (isset($this->_dcCounter[$name]) && ($this->_dcCounter[$name] > 0)
            && in_array($name, $this->deleteTagsContent, true)
        ) {
            // Unwind the "delete with content" stack down to this tag; the
            // null guard prevents an endless loop should the stack run dry.
            while (($tag = array_pop($this->_dcStack)) !== null && $tag !== $name) {
                $this->_dcCounter[$tag]--;
            }

            $this->_dcCounter[$name]--;
        }

        if (count($this->_dcStack) != 0) {
            return true;
        }

        if ((isset($this->_counter[$name])) && ($this->_counter[$name] > 0)) {
            while (($tag = array_pop($this->_stack)) !== null && $tag !== $name) {
                $this->_closeTag($tag);
            }

            $this->_closeTag($name);
        }
        return true;
    }

    /**
     * Closes tag
     *
     * Writes the closing tag (unless listed in $noClose) and updates the
     * tag counter and list bookkeeping.
     *
     * @param string $tag tag name
     * @return boolean always true
     * @access private
     */
    public function _closeTag($tag)
    {
        if (!in_array($tag, $this->noClose, true)) {
            $this->_xhtml .= '</' . $tag . '>';
        }

        if (isset($this->_counter[$tag])) {
            $this->_counter[$tag]--;
        }

        if (in_array($tag, $this->listTags, true)) {
            $this->_listScope--;
        }

        if ($tag === 'li') {
            array_pop($this->_liStack);
        }
        return true;
    }

    /**
     * Character data handler - called from HTMLSax
     *
     * Text is copied to the output unless we are inside a tag whose content
     * must be deleted.
     *
     * @param object $parser HTML parser
     * @param string $data   textual data
     * @return boolean always true
     * @access private
     */
    public function _dataHandler(&$parser, $data)
    {
        if (count($this->_dcStack) == 0) {
            $this->_xhtml .= $data;
        }
        return true;
    }

    /**
     * Escape handler - called from HTMLSax
     *
     * Comments and other escaped data are discarded.
     *
     * @param object $parser HTML parser
     * @param string $data   comments or other type of data
     * @return boolean always true
     * @access private
     */
    public function _escapeHandler(&$parser, $data)
    {
        return true;
    }

    /**
     * Returns the XHTML document
     *
     * Any tag still open is closed first.
     *
     * @return string Processed (X)HTML document
     * @access public
     */
    public function getXHTML()
    {
        // Strict null test so that a falsy tag name cannot stop the loop early.
        while (($tag = array_pop($this->_stack)) !== null) {
            $this->_closeTag($tag);
        }

        return $this->_xhtml;
    }

    /**
     * Clears current document data
     *
     * Resets the output and all per-document parsing state, so that an
     * unclosed tag (for instance a dangling <script>) in one document
     * cannot affect the next document parsed with the same instance.
     *
     * @return boolean always true
     * @access public
     */
    public function clear()
    {
        $this->_xhtml = '';
        $this->_counter = array();
        $this->_stack = array();
        $this->_dcCounter = array();
        $this->_dcStack = array();
        $this->_listScope = 0;
        $this->_liStack = array();
        return true;
    }

    /**
     * Main parsing function
     *
     * @param string $doc HTML document for processing
     * @return string Processed (X)HTML document
     * @access public
     */
    public function parse($doc)
    {
        // Save all '<' symbols
        $doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '&lt;', $doc);

        // Web documents shouldn't contains \x00 symbol
        $doc = str_replace("\x00", '', $doc);

        // Opera6 bug workaround
        $doc = str_replace("\xC0\xBC", '&lt;', $doc);

        if ($this->repackUTF7) {
            // UTF-7 encoding ASCII decode
            $doc = $this->repackUTF7($doc);
        }

        // Instantiate the parser
        $parser = new XML_HTMLSax3();

        // Set up the parser
        $parser->set_object($this);

        $parser->set_element_handler('_openHandler', '_closeHandler');
        $parser->set_data_handler('_dataHandler');
        $parser->set_escape_handler('_escapeHandler');

        $parser->parse($doc);

        return $this->getXHTML();
    }

    /**
     * UTF-7 decoding function
     *
     * @param string $str HTML document for recode ASCII part of UTF-7 back to ASCII
     * @return string Decoded document
     * @access private
     */
    public function repackUTF7($str)
    {
        return preg_replace_callback('!\+([0-9a-zA-Z/]+)\-!', array($this, 'repackUTF7Callback'), $str);
    }

    /**
     * Additional UTF-7 decoding function
     *
     * @param array $str preg match; $str[1] is the base64 payload of a "+...-" sequence
     * @return string Recoded string
     * @access private
     */
    public function repackUTF7Callback($str)
    {
        $str = base64_decode($str[1]);
        $str = preg_replace_callback('/^((?:\x00.)*)((?:[^\x00].)+)/', array($this, 'repackUTF7Back'), $str);
        // Pairs with a zero high byte are plain ASCII: keep only the low byte.
        return preg_replace('/\x00(.)/', '$1', $str);
    }

    /**
     * Additional UTF-7 encoding function
     *
     * @param array $str preg match; $str[1] is the leading ASCII part, $str[2] the part to re-encode
     * @return string Recoded string
     * @access private
     */
    public function repackUTF7Back($str)
    {
        return $str[1] . '+' . rtrim(base64_encode($str[2]), '=') . '-';
    }
}