[PLUGINS] ~maj globale
[lhc/web/www.git] / www / plugins / facteur / phpmailer-php5 / extras / htmlfilter.php
1 <?php
2 /**
3 * htmlfilter.inc
4 * ---------------
5 * This set of functions allows you to filter html in order to remove
6 * any malicious tags from it. Useful in cases when you need to filter
7 * user input for any cross-site-scripting attempts.
8 *
9 * Copyright (C) 2002-2004 by Duke University
10 *
11 * This library is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License as published by the Free Software Foundation; either
14 * version 2.1 of the License, or (at your option) any later version.
15 *
16 * This library is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with this library; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
24 * 02110-1301 USA
25 *
26 * @Author Konstantin Riabitsev <icon@linux.duke.edu>
27 * @Author Jim Jagielski <jim@jaguNET.com / jimjag@gmail.com>
28 * @Version 1.1 ($Date$)
29 */
30
/**
 * Builds the textual form of a tag from its name, its attributes and its
 * type. Called internally by tln_sanitize.
 *
 * Attribute values are emitted verbatim as name=value, so they are expected
 * to already carry their own quotes.
 *
 * @param string      $tagname the name of the tag.
 * @param array|false $attary  attribute name => value pairs; anything that is
 *                             not a non-empty array prints no attributes.
 * @param integer     $tagtype 1 = opening tag, 2 = closing tag,
 *                             3 = XHTML-style self-closing tag.
 * @return string A string with the final tag representation.
 */
function tln_tagprint($tagname, $attary, $tagtype)
{
    // Closing tags never carry attributes.
    if ($tagtype == 2) {
        return '</' . $tagname . '>';
    }

    $fulltag = '<' . $tagname;
    if (is_array($attary) && count($attary) > 0) {
        $atts = array();
        // foreach rather than each(): each() was removed in PHP 8.0.
        foreach ($attary as $attname => $attvalue) {
            $atts[] = "$attname=$attvalue";
        }
        $fulltag .= ' ' . implode(' ', $atts);
    }
    if ($tagtype == 3) {
        $fulltag .= ' /';
    }
    return $fulltag . '>';
}
61
/**
 * Callback intended for array_walk: lowercases the element it is given,
 * in place.
 *
 * @param string $val the value to lowercase, passed by reference.
 * @return void the result is written back through the reference.
 */
function tln_casenormalize(&$val)
{
    $lowered = strtolower($val);
    $val = $lowered;
}
73
/**
 * Skips any whitespace starting at the given position and returns the
 * position of the next non-whitespace character.
 *
 * @param string  $body   the string
 * @param integer $offset the offset within the string where we should start
 *                        looking for the next non-whitespace character.
 * @return integer the location within $body of the next non-whitespace
 *                 character; $offset itself when no whitespace starts there.
 */
function tln_skipspace($body, $offset)
{
    // Measure the matched whitespace with strlen(). The previous code called
    // sizeof() on the captured *string*, which throws a TypeError on PHP 8.
    if (preg_match('/^\s*/', substr($body, $offset), $matches)) {
        $offset += strlen($matches[0]);
    }
    return $offset;
}
93
/**
 * Finds the next occurrence of a string. A thin wrapper around strpos()
 * that reports "not found" as the end of the body instead of false.
 *
 * @param string  $body   The string to look for needle in.
 * @param integer $offset Start looking from this position.
 * @param string  $needle The character/string to look for.
 * @return integer location of the next occurrence of the needle, or
 *                 strlen($body) if the needle wasn't found.
 */
function tln_findnxstr($body, $offset, $needle)
{
    $found = strpos($body, $needle, $offset);
    return ($found === false) ? strlen($body) : $found;
}
113
/**
 * Searches for the first match of a PCRE fragment at or after $offset.
 *
 * The fragment is embedded in a pattern delimited by '%', so it must not
 * contain an unescaped '%'.
 *
 * @param string  $body   The string to look for needle in.
 * @param integer $offset Start looking from here.
 * @param string  $reg    A PCRE-style regex fragment to match.
 * @return array|boolean false if nothing matched, or an array with:
 *                       - integer location of the match within $body
 *                       - string with the content between $offset and the match
 *                       - string with the text that matched
 */
function tln_findnxreg($body, $offset, $reg)
{
    $hits = array();
    $pattern = '%^(.*?)(' . $reg . ')%s';
    preg_match($pattern, substr($body, $offset), $hits);

    if (isset($hits[0]) && $hits[0]) {
        return array(
            $offset + strlen($hits[1]),
            $hits[1],
            $hits[2],
        );
    }
    return false;
}
142
/**
 * Locates and parses the next tag in $body, starting at $offset.
 *
 * Tag and attribute names are lowercased. Attribute values are returned
 * with quotes: single- or double-quoted values keep their quotes, unquoted
 * values are wrapped in double quotes (with '"' turned into '&quot;'), and
 * value-less attributes get the value '"yes"'.
 *
 * @param string  $body   String where to look for the next tag.
 * @param integer $offset Start looking from here.
 * @return array|boolean false if no more tags exist in the body, or
 *                       an array with the following members:
 *                       - string with the name of the tag
 *                       - array with attributes and their values (false when
 *                         the tag ended right after its name)
 *                       - integer with tag type (1, 2, or 3)
 *                       - integer where the tag starts (starting "<")
 *                       - integer where the tag ends (ending ">")
 *                       The first three members are false if the tag is
 *                       invalid (this includes comments and declarations).
 */
function tln_getnxtag($body, $offset)
{
    if ($offset > strlen($body)) {
        return false;
    }
    $lt = tln_findnxstr($body, $offset, '<');
    if ($lt == strlen($body)) {
        return false;
    }
    /**
     * We are here:
     * blah blah <tag attribute="value">
     * \---------^
     */
    $pos = tln_skipspace($body, $lt + 1);
    if ($pos >= strlen($body)) {
        return array(false, false, false, $lt, strlen($body));
    }
    /**
     * There are 3 kinds of tags:
     * 1. Opening tag, e.g.:
     *    <a href="blah">
     * 2. Closing tag, e.g.:
     *    </a>
     * 3. XHTML-style content-less tag, e.g.:
     *    <img src="blah"/>
     */
    switch (substr($body, $pos, 1)) {
        case '/':
            $tagtype = 2;
            $pos++;
            break;
        case '!':
            /**
             * A comment or an SGML declaration; both are reported as
             * invalid tags so the caller drops them.
             */
            if (substr($body, $pos + 1, 2) == '--') {
                $gt = strpos($body, '-->', $pos);
                if ($gt === false) {
                    $gt = strlen($body);
                } else {
                    // Point at the '>' that closes '-->'.
                    $gt += 2;
                }
                return array(false, false, false, $lt, $gt);
            }
            $gt = tln_findnxstr($body, $pos, '>');
            return array(false, false, false, $lt, $gt);
        default:
            /**
             * Assume tagtype 1 for now. If it's type 3, we'll switch values
             * later.
             */
            $tagtype = 1;
            break;
    }

    /**
     * Look for the next character that cannot be part of a name
     * ([^\w\-_]); it marks the end of the tag name.
     */
    $regary = tln_findnxreg($body, $pos, '[^\w\-_]');
    if ($regary == false) {
        return array(false, false, false, $lt, strlen($body));
    }
    list($pos, $tagname, $match) = $regary;
    $tagname = strtolower($tagname);

    /**
     * $match can be either of these:
     * '>'  indicating the end of the tag entirely.
     * '\s' indicating the end of the tag name.
     * '/'  indicating that this is type-3 xhtml tag.
     *
     * Whatever else we find there indicates an invalid tag.
     */
    switch ($match) {
        case '/':
            /**
             * This is an xhtml-style tag with a closing / at the
             * end, like so: <img src="blah"/>. Check if it's followed
             * by the closing bracket. If not, then this tag is invalid
             */
            if (substr($body, $pos, 2) == '/>') {
                $pos++;
                $tagtype = 3;
            } else {
                $gt = tln_findnxstr($body, $pos, '>');
                return array(false, false, false, $lt, $gt);
            }
            // intentional fall-through
        case '>':
            return array($tagname, false, $tagtype, $lt, $pos);
        default:
            /**
             * Check if it's whitespace
             */
            if (!preg_match('/\s/', $match)) {
                /**
                 * This is an invalid tag! Look for the next closing ">".
                 */
                $gt = tln_findnxstr($body, $lt, '>');
                return array(false, false, false, $lt, $gt);
            }
            break;
    }

    /**
     * At this point we're here:
     * <tagname attribute='blah'>
     * \-------^
     *
     * At this point we loop in order to find all attributes.
     */
    $attary = array();

    while ($pos <= strlen($body)) {
        $pos = tln_skipspace($body, $pos);
        if ($pos == strlen($body)) {
            /**
             * Non-closed tag.
             */
            return array(false, false, false, $lt, $pos);
        }
        /**
         * See if we arrived at a ">" or "/>", which means that we reached
         * the end of the tag.
         */
        $matches = array();
        if (preg_match('%^(\s*)(>|/>)%s', substr($body, $pos), $matches)) {
            /**
             * Yep. So we did.
             */
            $pos += strlen($matches[1]);
            if ($matches[2] == '/>') {
                $tagtype = 3;
                $pos++;
            }
            return array($tagname, $attary, $tagtype, $lt, $pos);
        }

        /**
         * There are several types of attributes, with optional
         * [:space:] between members.
         * Type 1:
         *   attrname[:space:]=[:space:]'CDATA'
         * Type 2:
         *   attrname[:space:]=[:space:]"CDATA"
         * Type 3:
         *   attr[:space:]=[:space:]CDATA
         * Type 4:
         *   attrname
         *
         * We leave types 1 and 2 the same, type 3 we check for
         * '"' and convert to "&quot;" if needed, then wrap in
         * double quotes. Type 4 we convert into:
         *   attrname="yes".
         */
        $regary = tln_findnxreg($body, $pos, '[^\w\-_]');
        if ($regary == false) {
            /**
             * Looks like body ended before the end of tag.
             */
            return array(false, false, false, $lt, strlen($body));
        }
        list($pos, $attname, $match) = $regary;
        $attname = strtolower($attname);
        /**
         * We arrived at the end of attribute name. Several things possible
         * here:
         * '>'  means the end of the tag and this is attribute type 4
         * '/'  if followed by '>' means the same thing as above
         * '\s' means a lot of things -- look what it's followed by.
         *      anything else means the attribute is invalid.
         */
        switch ($match) {
            case '/':
                /**
                 * This is an xhtml-style tag with a closing / at the
                 * end, like so: <img src="blah"/>. Check if it's followed
                 * by the closing bracket. If not, then this tag is invalid
                 */
                if (substr($body, $pos, 2) == '/>') {
                    $pos++;
                    $tagtype = 3;
                } else {
                    $gt = tln_findnxstr($body, $pos, '>');
                    return array(false, false, false, $lt, $gt);
                }
                // intentional fall-through
            case '>':
                // Square-bracket offsets: the {} form is a parse error on PHP 8.
                $attary[$attname] = '"yes"';
                return array($tagname, $attary, $tagtype, $lt, $pos);
            default:
                /**
                 * Skip whitespace and see what we arrive at.
                 */
                $pos = tln_skipspace($body, $pos);
                $char = substr($body, $pos, 1);
                /**
                 * Two things are valid here:
                 * '=' means this is attribute type 1 2 or 3.
                 * \w means this was attribute type 4.
                 * anything else we ignore and re-loop. End of tag and
                 * invalid stuff will be caught by our checks at the beginning
                 * of the loop.
                 */
                if ($char == '=') {
                    $pos++;
                    $pos = tln_skipspace($body, $pos);
                    /**
                     * Here are 3 possibilities:
                     * "'" attribute type 1
                     * '"' attribute type 2
                     * everything else is the content of tag type 3
                     */
                    $quot = substr($body, $pos, 1);
                    if ($quot == '\'') {
                        $regary = tln_findnxreg($body, $pos + 1, '\'');
                        if ($regary == false) {
                            return array(false, false, false, $lt, strlen($body));
                        }
                        list($pos, $attval, $match) = $regary;
                        // Step over the closing quote.
                        $pos++;
                        $attary[$attname] = '\'' . $attval . '\'';
                    } elseif ($quot == '"') {
                        $regary = tln_findnxreg($body, $pos + 1, '\"');
                        if ($regary == false) {
                            return array(false, false, false, $lt, strlen($body));
                        }
                        list($pos, $attval, $match) = $regary;
                        // Step over the closing quote.
                        $pos++;
                        $attary[$attname] = '"' . $attval . '"';
                    } else {
                        /**
                         * These are hateful. Look for \s, or >.
                         */
                        $regary = tln_findnxreg($body, $pos, '[\s>]');
                        if ($regary == false) {
                            return array(false, false, false, $lt, strlen($body));
                        }
                        list($pos, $attval, $match) = $regary;
                        /**
                         * If it's ">" it will be caught at the top.
                         */
                        $attval = preg_replace('/\"/s', '&quot;', $attval);
                        $attary[$attname] = '"' . $attval . '"';
                    }
                } elseif (preg_match('|[\w/>]|', $char)) {
                    /**
                     * That was attribute type 4.
                     */
                    $attary[$attname] = '"yes"';
                } else {
                    /**
                     * An illegal character. Find next '>' and return.
                     */
                    $gt = tln_findnxstr($body, $pos, '>');
                    return array(false, false, false, $lt, $gt);
                }
                break;
        }
    }
    /**
     * The fact that we got here indicates that the tag end was never
     * found. Return invalid tag indication so it gets stripped.
     */
    return array(false, false, false, $lt, strlen($body));
}
430
/**
 * Translates numeric entities into literal characters so the value can be
 * checked.
 *
 * Every match of $regex is replaced by the single byte whose code is the
 * number captured in the regex's first group.
 *
 * @param string  $attvalue the by-ref value to translate.
 * @param string  $regex    the regular expression to check against; its
 *                          first capture group must hold the number.
 * @param boolean $hex      whether the captured numbers are hexadecimal.
 * @return boolean True if at least one match was replaced, false otherwise.
 */
function tln_deent(&$attvalue, $regex, $hex = false)
{
    preg_match_all($regex, $attvalue, $matches);
    if (empty($matches[0])) {
        return false;
    }

    $repl = array();
    foreach ($matches[0] as $i => $entity) {
        $numval = $matches[1][$i];
        if ($hex) {
            $numval = hexdec($numval);
        }
        // Square-bracket offset: the {} form is a parse error on PHP 8.
        $repl[$entity] = chr((int) $numval);
    }
    $attvalue = strtr($attvalue, $repl);
    return true;
}
457
/**
 * Decodes entity-encoded and backslash-escaped characters in an attribute
 * value so that later checks see the literal 8-bit string.
 *
 * Decoding is repeated until no pass finds anything more to translate;
 * remaining backslashes are then removed with stripslashes().
 *
 * @param string $attvalue A string to run entity check against.
 * @return Void, modifies a reference value.
 */
function tln_defang(&$attvalue)
{
    /**
     * Nothing to decode unless there is an ampersand or a backslash.
     */
    $has_amp = strpos($attvalue, '&') !== false;
    $has_backslash = strpos($attvalue, '\\') !== false;
    if (!$has_amp && !$has_backslash) {
        return;
    }

    do {
        // Decimal entities first, then hex entities, then \NN escapes;
        // a pass stops at the first decoder that changed something.
        $changed = tln_deent($attvalue, '/\&#0*(\d+);*/s')
            || tln_deent($attvalue, '/\&#x0*((\d|[a-f])+);*/si', true)
            || tln_deent($attvalue, '/\\\\(\d+)/s', true);
    } while ($changed);

    $attvalue = stripslashes($attvalue);
}
484
/**
 * Removes tabs, carriage returns, newlines, NUL bytes and spaces from an
 * attribute value, because some browsers treat "java[tab]script" the same
 * as "javascript".
 *
 * @param string $attvalue The attribute value before extraneous spaces removed.
 * @return Void, modifies a reference value.
 */
function tln_unspace(&$attvalue)
{
    $unwanted = array("\t", "\r", "\n", "\0", " ");

    // Leave the value untouched when it contains none of the characters.
    if (strcspn($attvalue, implode('', $unwanted)) == strlen($attvalue)) {
        return;
    }
    $attvalue = str_replace($unwanted, '', $attvalue);
}
503
/**
 * Runs the attribute-level checks for one tag: drops forbidden attributes,
 * rewrites dangerous values and appends any forced attributes.
 *
 * @param string  $tagname         String with the name of the tag.
 * @param array   $attary          Array with all tag attributes (name => quoted value).
 * @param array   $rm_attnames     tag-name regex => list of attribute-name
 *                                 regexes; matching attributes are removed.
 * @param array   $bad_attvals     tag-name regex => (attribute-name regex =>
 *                                 array(patterns, replacements)) applied to
 *                                 the attribute value with preg_replace.
 * @param array   $add_attr_to_tag tag-name regex => attributes merged into
 *                                 the result for matching tags.
 * @param string  $trans_image_path       replacement image used for rejected URLs.
 * @param boolean $block_external_images  passed through to tln_fixurl.
 * @return Array with modified attributes.
 */
function tln_fixatts(
    $tagname,
    $attary,
    $rm_attnames,
    $bad_attvals,
    $add_attr_to_tag,
    $trans_image_path,
    $block_external_images
) {
    // foreach walks a snapshot, so $attary can be edited inside the loop.
    // (each() was removed in PHP 8.0.)
    foreach ($attary as $attname => $attvalue) {
        // Numeric attribute names come back from the array as integers;
        // compare as strings so 0 is never mistaken for 'style'.
        $is_style = ((string) $attname === 'style');

        /**
         * See if this attribute should be removed.
         */
        foreach ($rm_attnames as $matchtag => $matchattrs) {
            if (preg_match($matchtag, $tagname)) {
                foreach ($matchattrs as $matchattr) {
                    if (preg_match($matchattr, $attname)) {
                        unset($attary[$attname]);
                        // Move on to the next attribute. A plain "continue"
                        // only advanced the innermost loop, so the removed
                        // attribute was still processed below and could be
                        // written back into $attary.
                        continue 3;
                    }
                }
            }
        }
        /**
         * Remove any backslashes, entities, or extraneous whitespace.
         */
        $oldattvalue = $attvalue;
        tln_defang($attvalue);
        if ($is_style && $attvalue !== $oldattvalue) {
            // A style value that needed decoding is rejected outright.
            $attvalue = "idiocy";
            $attary[$attname] = $attvalue;
        }
        tln_unspace($attvalue);

        /**
         * Now run the value checks: for every tag pattern and attribute
         * pattern that matches, apply the configured replacements and keep
         * the result if it changed the value.
         */
        foreach ($bad_attvals as $matchtag => $matchattrs) {
            if (preg_match($matchtag, $tagname)) {
                foreach ($matchattrs as $matchattr => $valary) {
                    if (preg_match($matchattr, $attname)) {
                        /**
                         * There are two arrays in valary.
                         * First is matches.
                         * Second one is replacements
                         */
                        list($valmatch, $valrepl) = $valary;
                        $newvalue = preg_replace($valmatch, $valrepl, $attvalue);
                        if ($newvalue !== $attvalue) {
                            $attary[$attname] = $newvalue;
                            $attvalue = $newvalue;
                        }
                    }
                }
            }
        }
        if ($is_style) {
            if (preg_match('/[\0-\37\200-\377]+/', $attvalue)) {
                $attary[$attname] = '"disallowed character"';
                // Stop here: the url() rewrite below would otherwise
                // overwrite this marker with the offending value.
                continue;
            }
            preg_match_all("/url\s*\((.+)\)/si", $attvalue, $aMatch);
            if (!empty($aMatch[1])) {
                foreach ($aMatch[1] as $sMatch) {
                    $urlvalue = $sMatch;
                    tln_fixurl($attname, $urlvalue, $trans_image_path, $block_external_images);
                    $attary[$attname] = str_replace($sMatch, $urlvalue, $attvalue);
                }
            }
        }
    }
    /**
     * See if we need to append any attributes to this tag.
     */
    foreach ($add_attr_to_tag as $matchtag => $addattary) {
        if (preg_match($matchtag, $tagname)) {
            $attary = array_merge($attary, $addattary);
        }
    }
    return $attary;
}
599
/**
 * Validates a URL taken from an attribute or a CSS url() and rewrites it
 * in place when it is not acceptable.
 *
 * Outcomes:
 * - empty value, unknown scheme, or a scheme-less URL other than
 *   $trans_image_path: replaced by the quoted $trans_image_path;
 * - control or 8-bit characters: replaced by a quoted placeholder URL for
 *   'href', by the quoted $trans_image_path otherwise;
 * - mailto/http/https/ftp on 'href', and outbind/cid on any attribute:
 *   kept and re-quoted;
 * - mailto/http/https/ftp on other attributes: replaced by the quoted
 *   $trans_image_path when external images are blocked or the URL has no
 *   path; otherwise the trimmed value is kept without quotes.
 *
 * @param string  $attname               attribute the URL belongs to.
 * @param string  $attvalue              the URL, optionally quoted; modified by reference.
 * @param string  $trans_image_path      replacement image for rejected URLs.
 * @param boolean $block_external_images whether remote URLs are replaced on
 *                                       attributes other than href.
 * @return void modifies $attvalue.
 */
function tln_fixurl($attname, &$attvalue, $trans_image_path, $block_external_images)
{
    $sQuote = '"';
    $attvalue = trim($attvalue);
    if ($attvalue && ($attvalue[0] == '"' || $attvalue[0] == "'")) {
        // Remember the quote style and strip the surrounding quotes.
        $sQuote = $attvalue[0];
        $attvalue = trim(substr($attvalue, 1, -1));
    }
    $blank = $sQuote . $trans_image_path . $sQuote;

    /**
     * Replace empty src tags with the blank image. src is only used
     * for frames, images, and image inputs. Doing a replace should
     * not affect them working as should be, however it will stop
     * IE from being kicked off when src for img tags are not set
     */
    if ($attvalue == '') {
        $attvalue = $blank;
        return;
    }

    // first, disallow 8 bit characters and control characters
    if (preg_match('/[\0-\37\200-\377]+/', $attvalue)) {
        if ($attname == 'href') {
            $attvalue = $sQuote . 'http://invalid-stuff-detected.example.com' . $sQuote;
        } else {
            $attvalue = $blank;
        }
        return;
    }

    $aUrl = parse_url($attvalue);
    if (!isset($aUrl['scheme'])) {
        // Scheme-less URL: only the blank image itself is acceptable.
        // (This previously assigned to $$attvalue, a variable-variable,
        // so the URL was never actually replaced.)
        if (!isset($aUrl['path']) || $aUrl['path'] != $trans_image_path) {
            $attvalue = $blank;
        }
        return;
    }

    switch (strtolower($aUrl['scheme'])) {
        case 'mailto':
        case 'http':
        case 'https':
        case 'ftp':
            if ($attname != 'href') {
                if ($block_external_images == true || !isset($aUrl['path'])) {
                    $attvalue = $blank;
                }
                // otherwise the trimmed, unquoted value is kept as is
            } else {
                $attvalue = $sQuote . $attvalue . $sQuote;
            }
            break;
        case 'outbind':
        case 'cid':
            $attvalue = $sQuote . $attvalue . $sQuote;
            break;
        default:
            $attvalue = $blank;
            break;
    }
}
667
/**
 * Extracts and sanitizes the content of a <style> block.
 *
 * Scans $body from $pos up to the matching </style> end tag, collecting
 * the CSS text, then neutralizes url() values and known dangerous CSS
 * constructs.
 *
 * @param string  $body                  the full HTML being filtered.
 * @param integer $pos                   offset just after the opening <style ...> tag.
 * @param string  $trans_image_path      replacement image for rejected URLs.
 * @param boolean $block_external_images passed through to tln_fixurl.
 * @return array two members: the sanitized style content (false when no
 *               closing </style> was found) and the offset in $body at which
 *               parsing should resume (strlen($body) on failure).
 */
function tln_fixstyle($body, $pos, $trans_image_path, $block_external_images)
{
    // workaround for </style> in between comments
    $content = '';
    $sToken = '';
    $bSucces = false;
    $bEndTag = false;
    for ($i = $pos, $iCount = strlen($body); $i < $iCount; ++$i) {
        // Square-bracket offset: the {} form is a parse error on PHP 8.
        $char = $body[$i];
        switch ($char) {
            case '<':
                // Possible start of an end tag or a comment.
                $sToken = $char;
                break;
            case '/':
                if ($sToken == '<') {
                    $sToken .= $char;
                    $bEndTag = true;
                } else {
                    $content .= $char;
                }
                break;
            case '>':
                if ($bEndTag) {
                    $sToken .= $char;
                    if (preg_match('/\<\/\s*style\s*\>/i', $sToken, $aMatch)) {
                        $newpos = $i + 1;
                        $bSucces = true;
                        break 2;
                    } else {
                        // Some other end tag: keep it as style content.
                        $content .= $sToken;
                    }
                    $bEndTag = false;
                } else {
                    $content .= $char;
                }
                break;
            case '!':
                if ($sToken == '<') {
                    // possible comment
                    if (isset($body[$i + 2]) && substr($body, $i, 3) == '!--') {
                        // Jump to the end of the comment; its text is skipped.
                        // NOTE(review): $i lands on the first '-' of '-->',
                        // so the loop resumes at the second character and the
                        // trailing '->' is copied into $content.
                        $i = strpos($body, '-->', $i + 3);
                        if ($i === false) { // no end comment
                            $i = strlen($body);
                        }
                        $sToken = '';
                    }
                } else {
                    $content .= $char;
                }
                break;
            default:
                if ($bEndTag) {
                    $sToken .= $char;
                } else {
                    $content .= $char;
                }
                break;
        }
    }
    if ($bSucces == false) {
        return array(false, strlen($body));
    }

    /**
     * First look for general BODY style declaration, which would be
     * like so:
     * body {background: blah-blah}
     * and change it to .bodyclass so we can just assign it to a <div>
     */
    $content = preg_replace("|body(\s*\{.*?\})|si", ".bodyclass\\1", $content);

    // first check for 8bit sequences and disallowed control characters
    if (preg_match('/[\16-\37\200-\377]+/', $content)) {
        $content = '<!-- style block removed by html filter due to presence of 8bit characters -->';
        return array($content, $newpos);
    }

    // remove @import line
    $content = preg_replace("/^\s*(@import.*)$/mi", "\n<!-- @import rules forbidden -->\n", $content);

    // Collapse backslash-obfuscated spellings of "url", then fix each URL.
    $content = preg_replace("/(\\\\)?u(\\\\)?r(\\\\)?l(\\\\)?/i", 'url', $content);
    preg_match_all("/url\s*\((.+)\)/si", $content, $aMatch);
    if (count($aMatch)) {
        $aValue = $aReplace = array();
        foreach ($aMatch[1] as $sMatch) {
            // url value
            $urlvalue = $sMatch;
            tln_fixurl('style', $urlvalue, $trans_image_path, $block_external_images);
            $aValue[] = $sMatch;
            $aReplace[] = $urlvalue;
        }
        $content = str_replace($aValue, $aReplace, $content);
    }

    /**
     * Remove any backslashes, entities, and extraneous whitespace.
     * This is done on a copy: the decoded, whitespace-free version only
     * replaces $content when one of the patterns below changed it.
     */
    $contentTemp = $content;
    tln_defang($contentTemp);
    tln_unspace($contentTemp);

    $match = array('/\/\*.*\*\//',
                   '/expression/i',
                   '/behaviou*r/i',
                   '/binding/i',
                   '/include-source/i',
                   '/javascript/i',
                   '/script/i',
                   '/position/i');
    $replace = array('', 'idiocy', 'idiocy', 'idiocy', 'idiocy', 'idiocy', 'idiocy', '');
    $contentNew = preg_replace($match, $replace, $contentTemp);
    if ($contentNew !== $contentTemp) {
        $content = $contentNew;
    }
    return array($content, $newpos);
}
796
/**
 * Converts the attributes of a <body> tag into attributes for the <div>
 * that replaces it.
 *
 * The result always carries class='bodyclass'. The background, bgcolor and
 * text attributes are turned into an inline style; a background image is
 * always replaced by $trans_image_path. When a background colour is given
 * without a text colour, a black text colour is added.
 *
 * @param array|false $attary           attributes of the <body> tag (name => quoted value).
 * @param string      $trans_image_path replacement image for the background.
 * @return array attributes for the replacement <div>.
 */
function tln_body2div($attary, $trans_image_path)
{
    $divattary = array('class' => "'bodyclass'");
    $text = '#000000';
    $has_bgc_stl = $has_txt_stl = false;
    $styledef = '';
    if (is_array($attary) && count($attary) > 0) {
        foreach ($attary as $attname => $attvalue) {
            // Strip both quote characters, not only the one that opens the
            // value: the value ends up inside a double-quoted style
            // attribute, and a leftover quote of the other kind would let
            // it terminate that attribute early.
            $attvalue = str_replace(array('"', "'"), '', $attvalue);
            switch ($attname) {
                case 'background':
                    $styledef .= "background-image: url('$trans_image_path'); ";
                    break;
                case 'bgcolor':
                    $has_bgc_stl = true;
                    $styledef .= "background-color: $attvalue; ";
                    break;
                case 'text':
                    $has_txt_stl = true;
                    $styledef .= "color: $attvalue; ";
                    break;
            }
        }
        // Outlook defines a white bgcolor and no text color. This can lead to
        // white text on a white bg with certain themes.
        if ($has_bgc_stl && !$has_txt_stl) {
            $styledef .= "color: $text; ";
        }
        if (strlen($styledef) > 0) {
            // Square-bracket offset: the {} form is a parse error on PHP 8.
            $divattary['style'] = "\"$styledef\"";
        }
    }
    return $divattary;
}
833
/**
 * Filters an HTML string: removes or keeps tags according to $tag_list,
 * drops some tags together with their content, sanitizes attributes and
 * <style> blocks, converts <body> into <div>, and optionally closes tags
 * that were left open.
 *
 * @param string  $body                 The HTML you wish to filter
 * @param array   $tag_list             first element is a flag, the rest are
 *                                      tag names: flag false removes the
 *                                      listed tags, flag true keeps only the
 *                                      listed tags.
 * @param array   $rm_tags_with_content tags removed together with everything
 *                                      up to their closing tag.
 * @param array   $self_closing_tags    tags always printed as self-closing.
 * @param boolean $force_tag_closing    whether to append closing tags for
 *                                      tags still open at the end.
 * @param array   $rm_attnames          passed to tln_fixatts.
 * @param array   $bad_attvals          passed to tln_fixatts.
 * @param array   $add_attr_to_tag      passed to tln_fixatts.
 * @param string  $trans_image_path     replacement image for rejected URLs.
 * @param boolean $block_external_images passed to tln_fixatts / tln_fixstyle.
 * @return string Sanitized html safe to show on your pages.
 */
function tln_sanitize(
    $body,
    $tag_list,
    $rm_tags_with_content,
    $self_closing_tags,
    $force_tag_closing,
    $rm_attnames,
    $bad_attvals,
    $add_attr_to_tag,
    $trans_image_path,
    $block_external_images
) {
    /**
     * See if tag_list is of tags to remove or tags to allow.
     * false means remove these tags
     * true means allow these tags
     */
    $rm_tags = array_shift($tag_list);
    /**
     * Normalize the tag lists to lowercase.
     */
    @array_walk($tag_list, 'tln_casenormalize');
    @array_walk($rm_tags_with_content, 'tln_casenormalize');
    @array_walk($self_closing_tags, 'tln_casenormalize');

    $curpos = 0;
    // Count of currently open tags, keyed by tag name.
    $open_tags = array();
    $trusted = "<!-- begin tln_sanitized html -->\n";
    // false, or the name of the tag whose content is being skipped.
    $skip_content = false;
    /**
     * Take care of netscape's stupid javascript entities like
     * &{alert('boo')};
     */
    $body = preg_replace('/&(\{.*?\};)/si', '&amp;\\1', $body);
    while (($curtag = tln_getnxtag($body, $curpos)) != false) {
        list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
        // Text between the previous tag and this one.
        $free_content = substr($body, $curpos, $lt - $curpos);
        /**
         * Take care of <style>
         */
        if ($tagname == "style" && $tagtype == 1) {
            // Emit the text that precedes the <style> tag. It used to be
            // lost because $free_content was overwritten by the style body.
            if ($skip_content == false) {
                $trusted .= $free_content;
            }
            list($style_content, $curpos) =
                tln_fixstyle($body, $gt + 1, $trans_image_path, $block_external_images);
            if ($style_content != false) {
                if (!empty($attary)) {
                    $attary = tln_fixatts(
                        $tagname,
                        $attary,
                        $rm_attnames,
                        $bad_attvals,
                        $add_attr_to_tag,
                        $trans_image_path,
                        $block_external_images
                    );
                }
                $trusted .= tln_tagprint($tagname, $attary, $tagtype);
                $trusted .= $style_content;
                $trusted .= tln_tagprint($tagname, false, 2);
            }
            continue;
        }
        if ($skip_content == false) {
            $trusted .= $free_content;
        }
        if ($tagname != false) {
            if ($tagtype == 2) {
                if ($skip_content == $tagname) {
                    /**
                     * Got to the end of tag we needed to remove.
                     */
                    $tagname = false;
                    $skip_content = false;
                } else {
                    if ($skip_content == false) {
                        if ($tagname == "body") {
                            $tagname = "div";
                        }
                        // Only print a closing tag that has a matching
                        // opening tag still outstanding.
                        if (isset($open_tags[$tagname]) &&
                            $open_tags[$tagname] > 0
                        ) {
                            $open_tags[$tagname]--;
                        } else {
                            $tagname = false;
                        }
                    }
                }
            } else {
                /**
                 * $rm_tags_with_content
                 */
                if ($skip_content == false) {
                    /**
                     * See if this is a self-closing type and change
                     * tagtype appropriately.
                     */
                    if ($tagtype == 1
                        && in_array($tagname, $self_closing_tags)
                    ) {
                        $tagtype = 3;
                    }
                    /**
                     * See if we should skip this tag and any content
                     * inside it.
                     */
                    if ($tagtype == 1
                        && in_array($tagname, $rm_tags_with_content)
                    ) {
                        $skip_content = $tagname;
                    } else {
                        if (($rm_tags == false
                            && in_array($tagname, $tag_list)) ||
                            ($rm_tags == true
                            && !in_array($tagname, $tag_list))
                        ) {
                            $tagname = false;
                        } else {
                            /**
                             * Convert body into div.
                             */
                            if ($tagname == "body") {
                                $tagname = "div";
                                $attary = tln_body2div($attary, $trans_image_path);
                            }
                            if ($tagtype == 1) {
                                if (isset($open_tags[$tagname])) {
                                    $open_tags[$tagname]++;
                                } else {
                                    $open_tags[$tagname] = 1;
                                }
                            }
                            /**
                             * This is where we run other checks.
                             */
                            if (is_array($attary) && count($attary) > 0) {
                                $attary = tln_fixatts(
                                    $tagname,
                                    $attary,
                                    $rm_attnames,
                                    $bad_attvals,
                                    $add_attr_to_tag,
                                    $trans_image_path,
                                    $block_external_images
                                );
                            }
                        }
                    }
                }
            }
            if ($tagname != false && $skip_content == false) {
                $trusted .= tln_tagprint($tagname, $attary, $tagtype);
            }
        }
        $curpos = $gt + 1;
    }
    // Whatever follows the last tag.
    $trusted .= substr($body, $curpos, strlen($body) - $curpos);
    if ($force_tag_closing == true) {
        foreach ($open_tags as $tagname => $opentimes) {
            while ($opentimes > 0) {
                $trusted .= '</' . $tagname . '>';
                $opentimes--;
            }
        }
        $trusted .= "\n";
    }
    $trusted .= "<!-- end tln_sanitized html -->\n";
    return $trusted;
}
1014
//
// Use the nifty htmlfilter library
//

/**
 * Filters HTML with a fixed, built-in policy by calling tln_sanitize().
 *
 * The policy removes the tags in $tag_list, removes the tags in
 * $rm_tags_with_content together with their content, strips on*, dynsrc,
 * data* and lowsrc* attributes, neutralizes script-like URLs and dangerous
 * CSS in attribute values, forces target="_blank" on links and closes any
 * tags left open.
 *
 * @param string  $body                  The HTML to filter.
 * @param string  $trans_image_path      image URL substituted for rejected
 *                                       image/background URLs.
 * @param boolean $block_external_images when true, http(s) URLs in src,
 *                                       background and style url() values are
 *                                       replaced by $trans_image_path as well.
 * @return string the sanitized HTML.
 */
function HTMLFilter($body, $trans_image_path, $block_external_images = false)
{
    // Leading false: the tags listed after it are removed.
    $tag_list = array(
        false,
        "object",
        "meta",
        "html",
        "head",
        "base",
        "link",
        "frame",
        "iframe",
        "plaintext",
        "marquee"
    );

    $rm_tags_with_content = array(
        "script",
        "applet",
        "embed",
        "title",
        "frameset",
        "xmp",
        "xml"
    );

    $self_closing_tags = array(
        "img",
        "br",
        "hr",
        "input",
        "outbind"
    );

    $force_tag_closing = true;

    $rm_attnames = array(
        "/.*/" =>
            array(
                // "/target/i",
                "/^on.*/i",
                "/^dynsrc/i",
                "/^data.*/i",
                "/^lowsrc.*/i"
            )
    );

    // NOTE: in the attribute-name keys below, '|' binds looser than '^', so
    // e.g. "/^href|action/i" also matches names that merely contain
    // "action". Anchoring them would check fewer attributes, so they are
    // deliberately left as they are.
    $bad_attvals = array(
        "/.*/" =>
            array(
                "/^src|background/i" =>
                    array(
                        array(
                            '/^([\'"])\s*\S+script\s*:.*([\'"])/si',
                            '/^([\'"])\s*mocha\s*:*.*([\'"])/si',
                            '/^([\'"])\s*about\s*:.*([\'"])/si'
                        ),
                        array(
                            "\\1$trans_image_path\\2",
                            "\\1$trans_image_path\\2",
                            "\\1$trans_image_path\\2"
                        )
                    ),
                "/^href|action/i" =>
                    array(
                        array(
                            '/^([\'"])\s*\S+script\s*:.*([\'"])/si',
                            '/^([\'"])\s*mocha\s*:*.*([\'"])/si',
                            '/^([\'"])\s*about\s*:.*([\'"])/si'
                        ),
                        array(
                            "\\1#\\1",
                            "\\1#\\1",
                            "\\1#\\1"
                        )
                    ),
                "/^style/i" =>
                    array(
                        array(
                            "/\/\*.*\*\//",
                            "/expression/i",
                            "/binding/i",
                            "/behaviou*r/i",
                            "/include-source/i",
                            '/position\s*:/i',
                            '/(\\\\)?u(\\\\)?r(\\\\)?l(\\\\)?/i',
                            '/url\s*\(\s*([\'"])\s*\S+script\s*:.*([\'"])\s*\)/si',
                            '/url\s*\(\s*([\'"])\s*mocha\s*:.*([\'"])\s*\)/si',
                            '/url\s*\(\s*([\'"])\s*about\s*:.*([\'"])\s*\)/si',
                            '/(.*)\s*:\s*url\s*\(\s*([\'"]*)\s*\S+script\s*:.*([\'"]*)\s*\)/si'
                        ),
                        array(
                            "",
                            "idiocy",
                            "idiocy",
                            "idiocy",
                            "idiocy",
                            "idiocy",
                            "url",
                            "url(\\1#\\1)",
                            "url(\\1#\\1)",
                            "url(\\1#\\1)",
                            "\\1:url(\\2#\\3)"
                        )
                    )
            )
    );

    if ($block_external_images) {
        // Extra pattern/replacement pairs that swap remote http(s) images
        // for the placeholder. Square-bracket offsets are used because the
        // {} form is a parse error on PHP 8.
        array_push(
            $bad_attvals['/.*/']['/^src|background/i'][0],
            '/^([\'\"])\s*https*:.*([\'\"])/si'
        );
        array_push(
            $bad_attvals['/.*/']['/^src|background/i'][1],
            "\\1$trans_image_path\\1"
        );
        array_push(
            $bad_attvals['/.*/']['/^style/i'][0],
            '/url\(([\'\"])\s*https*:.*([\'\"])\)/si'
        );
        array_push(
            $bad_attvals['/.*/']['/^style/i'][1],
            "url(\\1$trans_image_path\\1)"
        );
    }

    $add_attr_to_tag = array(
        "/^a$/i" =>
            array('target' => '"_blank"')
    );

    $trusted = tln_sanitize(
        $body,
        $tag_list,
        $rm_tags_with_content,
        $self_closing_tags,
        $force_tag_closing,
        $rm_attnames,
        $bad_attvals,
        $add_attr_to_tag,
        $trans_image_path,
        $block_external_images
    );
    return $trusted;
}