[SPIP] ~maj v3.0.14-->v3.0.17
[ptitvelo/web/www.git] / www / plugins-dist / safehtml / lib / safehtml / classes / safehtml.php
1 <?php
2 /**
3 * SafeHTML Parser
4 *
5 * @package SafeHTML
6 * @author Roman Ivanov <thingol@mail.ru>
7 * @copyright 2004-2005 Roman Ivanov
8 * @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
9 * @version 1.3.7
10 * @link http://pixel-apes.com/safehtml/
11 */
12
13 if (!defined('_ECRIRE_INC_VERSION')) return;
14
15 require_once(XML_HTMLSAX3 . 'HTMLSax3.php');
16
/**
 * Event-driven HTML sanitizer.
 *
 * The class registers itself as the handler object of an XML_HTMLSax3 parser
 * (see parse()) and rebuilds the document into $_xhtml while:
 *  - dropping the tags listed in $deleteTags (and, for $deleteTagsContent,
 *    everything nested inside them);
 *  - dropping `on*` and `data*` attributes, attributes listed in $attributes
 *    (except on <a>), and attributes whose name is not plain [a-z0-9-];
 *  - filtering URL-carrying attributes by protocol (white or black list);
 *  - filtering `style` attributes against $cssKeywords and $blackProtocols;
 *  - closing tags that were left open.
 *
 * Typical use: $safe = new SafeHTML(); $clean = $safe->parse($html);
 */
class SafeHTML
{
    /** @var string Sanitized markup accumulated so far. */
    public $_xhtml = '';

    /** @var array Tag name => number of currently open elements of that tag. */
    public $_counter = array();

    /** @var array Stack of the names of the currently open (emitted) tags. */
    public $_stack = array();

    /** @var array Same as $_counter, for open $deleteTagsContent tags. */
    public $_dcCounter = array();

    /** @var array Same as $_stack, for open $deleteTagsContent tags. */
    public $_dcStack = array();

    /** @var int Current nesting depth of list containers ($listTags). */
    public $_listScope = 0;

    /** @var array For every open <li>, the list depth it was opened at. */
    public $_liStack = array();

    /** @var array Regular expressions built from $blackProtocols. */
    public $_protoRegexps = array();

    /** @var array Regular expressions built from $cssKeywords. */
    public $_cssRegexps = array();

    /** @var array Tags emitted in self-closed form (`<tag ... />`). */
    public $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', );

    /** @var array Tags that are never emitted (their content is kept). */
    public $deleteTags = array(
        'applet', 'base', 'basefont', 'bgsound', 'blink', 'body',
        'embed', 'frame', 'frameset', 'head', 'html', 'ilayer',
        'iframe', 'layer', 'link', 'meta', 'object', 'style',
        'title', 'script',
    );

    /** @var array Tags whose whole content is discarded as well. */
    public $deleteTagsContent = array('script', 'style', 'title', 'xml', );

    /** @var string 'black' selects the blacklist; any other value the whitelist. */
    public $protocolFiltering = 'white';

    /** @var array Protocols rejected in blacklist mode and in `style` values. */
    public $blackProtocols = array(
        'about', 'chrome', 'data', 'disk', 'hcp',
        'help', 'javascript', 'livescript', 'lynxcgi', 'lynxexec',
        'ms-help', 'ms-its', 'mhtml', 'mocha', 'opera',
        'res', 'resource', 'shell', 'vbscript', 'view-source',
        'vnd.ms.radio', 'wysiwyg',
    );

    /** @var array Protocols accepted in whitelist mode. */
    public $whiteProtocols = array(
        'ed2k', 'file', 'ftp', 'gopher', 'http', 'https',
        'irc', 'mailto', 'news', 'nntp', 'telnet', 'webcal',
        'xmpp', 'callto',
    );

    /** @var array Attributes whose value is checked for a protocol. */
    public $protocolAttributes = array(
        'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src',
    );

    /** @var array Keywords that cause a `style` attribute to be dropped. */
    public $cssKeywords = array(
        'absolute', 'behavior', 'behaviour', 'content', 'expression',
        'fixed', 'include-source', 'moz-binding',
    );

    /** @var array Tags for which _closeTag() emits no closing markup. */
    public $noClose = array();

    /** @var array Tags whose opening implicitly closes an open <p>. */
    public $closeParagraph = array(
        'address', 'blockquote', 'center', 'dd', 'dir', 'div',
        'dl', 'dt', 'h1', 'h2', 'h3', 'h4',
        'h5', 'h6', 'hr', 'isindex', 'listing', 'marquee',
        'menu', 'multicol', 'ol', 'p', 'plaintext', 'pre',
        'table', 'ul', 'xmp',
    );

    /** @var array Tags ignored once a <table> has been opened and closed again. */
    public $tableTags = array(
        'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
        'thead', 'tr',
    );

    /** @var array Tags that open a new list nesting level. */
    public $listTags = array('dir', 'menu', 'ol', 'ul', 'dl', );

    /** @var array Attributes removed from every tag except <a>. */
    public $attributes = array('dynsrc', 'id', 'name', );

    /** @var array Attribute names accepted although they contain a colon. */
    public $attributesNS = array('xml:lang', );

    /**
     * Builds the protocol and CSS regular expressions from the configured
     * $blackProtocols and $cssKeywords lists.
     *
     * Declared as __construct() so that it also runs on PHP versions that no
     * longer treat a method named after the class as the constructor; without
     * it the regexp lists stay empty and the style/blacklist filters match
     * nothing.
     */
    public function __construct()
    {
        // Start from scratch so that an explicit second call (through the
        // legacy SafeHTML() method) does not duplicate the patterns.
        $this->_protoRegexps = array();
        $this->_cssRegexps = array();

        // Each protocol name is matched with optional whitespace / control
        // characters between any two letters, followed by a colon.
        foreach ($this->blackProtocols as $proto) {
            $preg = "/[\s\x01-\x1F]*";
            $length = strlen($proto);
            for ($i = 0; $i < $length; $i++) {
                $preg .= $proto[$i] . "[\s\x01-\x1F]*";
            }
            $preg .= ":/i";
            $this->_protoRegexps[] = $preg;
        }

        foreach ($this->cssKeywords as $css) {
            $this->_cssRegexps[] = '/' . $css . '/i';
        }
    }

    /**
     * Legacy (PHP 4 style) constructor name, kept for code that calls it
     * explicitly, e.g. parent::SafeHTML() in a subclass.
     *
     * @return bool Always true.
     */
    public function SafeHTML()
    {
        $this->__construct();
        return true;
    }

    /**
     * Appends the attributes that pass all filters to $_xhtml, each as
     * ` name="value"`.
     *
     * @param mixed       $attrs Attribute name => value map; anything that is
     *                           not an array is ignored.
     * @param string|null $tag   Lower-case name of the owning tag; only 'a'
     *                           may keep the attributes listed in $attributes.
     * @return bool Always true.
     */
    public function _writeAttrs($attrs, $tag = null)
    {
        if (!is_array($attrs)) {
            return true;
        }

        foreach ($attrs as $name => $value) {
            $name = strtolower((string) $name);

            // Event handlers (on*) and data* attributes are never kept.
            if (strpos($name, 'on') === 0 || strpos($name, 'data') === 0) {
                continue;
            }
            if ($tag !== 'a' && in_array($name, $this->attributes, true)) {
                continue;
            }
            // Only plain names, or the explicitly allowed namespaced ones.
            if (!preg_match('/^[a-z0-9-]+$/i', $name)
                && !in_array($name, $this->attributesNS, true)
            ) {
                continue;
            }

            // Value-less attribute: written in the name="name" form.
            if ($value === true || is_null($value)) {
                $value = $name;
            }

            if ($name == 'style') {
                $value = $this->_filterStyle($value);
                if ($value === null) {
                    continue;
                }
            }

            // The protocol is checked on a copy with numeric character
            // references decoded; the value that gets written keeps them.
            $decoded = $this->_decodeNumericEntities($value);
            if ($decoded === null) {
                // PCRE failure: the value could not be inspected, drop it.
                continue;
            }

            if (in_array($name, $this->protocolAttributes, true)
                && strpos($decoded, ':') !== false
            ) {
                if ($this->protocolFiltering == 'black') {
                    if ($this->_matchesAny($this->_protoRegexps, $decoded)) {
                        continue;
                    }
                } else {
                    $parts = explode(':', $decoded);
                    if (!in_array($parts[0], $this->whiteProtocols, true)) {
                        continue;
                    }
                }
            }

            $value = str_replace("\"", "&quot;", $value);
            $this->_xhtml .= ' ' . $name . '="' . $value . '"';
        }

        return true;
    }

    /**
     * Normalizes a `style` attribute value and decides whether it may be kept.
     *
     * @param string $value Raw attribute value.
     * @return string|null The cleaned value, or null when the attribute must
     *                     be dropped (forbidden keyword or protocol found, or
     *                     the value could not be processed).
     */
    public function _filterStyle($value)
    {
        // Remove backslashes (CSS escapes could otherwise hide keywords).
        $value = str_replace("\\", '', $value);

        // Remove CSS comments, repeating until nothing changes because a
        // removal can bring the two halves of a new comment together.
        do {
            $stripped = preg_replace('!/\*.*?\*/!s', '', $value);
            if ($stripped === null) {
                return null;
            }
            $changed = ($stripped !== $value);
            $value = $stripped;
        } while ($changed);

        // Encode every "&" as "&amp;" without double-encoding existing ones.
        $value = str_replace('&amp;', '&', $value);
        $value = str_replace('&', '&amp;', $value);

        if ($this->_matchesAny($this->_cssRegexps, $value)
            || $this->_matchesAny($this->_protoRegexps, $value)
        ) {
            return null;
        }

        return $value;
    }

    /**
     * Tells whether at least one of the given regular expressions matches.
     *
     * @param array  $patterns Complete PCRE patterns.
     * @param string $subject  Text to test.
     * @return bool
     */
    public function _matchesAny($patterns, $subject)
    {
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $subject)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Replaces decimal (&#NN;) and hexadecimal (&#xHH;) character references,
     * with or without the trailing semicolon, by the corresponding byte.
     *
     * @param string $value Text to decode.
     * @return string|null Decoded text, or null on a PCRE error.
     */
    public function _decodeNumericEntities($value)
    {
        $decoded = preg_replace_callback(
            '/&#(\d+);?/m',
            array($this, '_decodeDecimalEntity'),
            $value
        );
        if ($decoded === null) {
            return null;
        }
        return preg_replace_callback(
            '/&#x([0-9a-f]+);?/mi',
            array($this, '_decodeHexEntity'),
            $decoded
        );
    }

    /**
     * preg_replace_callback() handler for decimal character references.
     *
     * @param array $m Match; $m[1] holds the decimal digits.
     * @return string Single byte; only the low 8 bits of the code are used.
     */
    public function _decodeDecimalEntity($m)
    {
        // Explicit cast + mask: arbitrarily long digit runs must not reach
        // chr() as an out-of-range value.
        return chr(((int) $m[1]) & 0xFF);
    }

    /**
     * preg_replace_callback() handler for hexadecimal character references.
     *
     * @param array $m Match; $m[1] holds the hexadecimal digits.
     * @return string Single byte; only the low 8 bits of the code are used.
     */
    public function _decodeHexEntity($m)
    {
        $code = hexdec($m[1]);
        if (is_float($code)) {
            // hexdec() switches to float for values beyond the integer range.
            $code = fmod($code, 256);
        }
        return chr(((int) $code) & 0xFF);
    }

    /**
     * Parser callback for an opening tag.
     *
     * @param object $parser Parser instance (only forwarded to _closeHandler).
     * @param string $name   Tag name.
     * @param array  $attrs  Attribute name => value map.
     * @return bool Always true.
     */
    public function _openHandler(&$parser, $name, $attrs)
    {
        $name = strtolower($name);

        // Entering a tag whose content is discarded; while any such tag is
        // open nothing at all is emitted.
        if (in_array($name, $this->deleteTagsContent, true)) {
            array_push($this->_dcStack, $name);
            $this->_dcCounter[$name] = isset($this->_dcCounter[$name]) ? $this->_dcCounter[$name] + 1 : 1;
        }
        if (count($this->_dcStack) != 0) {
            return true;
        }

        if (in_array($name, $this->deleteTags, true)) {
            return true;
        }

        // Not a plain tag name. If it contains "@" or "://" (it looks like an
        // address written between angle brackets) keep it as escaped text,
        // otherwise drop it.
        if (!preg_match('/^[a-z0-9]+$/i', $name)) {
            if (preg_match('!(?:\@|://)!i', $name)) {
                $this->_xhtml .= '&lt;' . $name . '&gt;';
            }
            return true;
        }

        // Void elements are written self-closed and never pushed on the stack.
        if (in_array($name, $this->singleTags, true)) {
            $this->_xhtml .= '<' . $name;
            $this->_writeAttrs($attrs);
            $this->_xhtml .= ' />';
            return true;
        }

        // TABLES: cannot open table elements when we are not inside a table
        // (the check only applies once a <table> has been seen).
        if (isset($this->_counter['table']) && $this->_counter['table'] <= 0
            && in_array($name, $this->tableTags, true)
        ) {
            return true;
        }

        // PARAGRAPHS: close the open paragraph when a block-level tag opens.
        if (in_array($name, $this->closeParagraph, true) && in_array('p', $this->_stack, true)) {
            $this->_closeHandler($parser, 'p');
        }

        // LISTS: close the previous <li> when an <li> of the same level opens.
        if ($name == 'li' && count($this->_liStack)
            && $this->_listScope == $this->_liStack[count($this->_liStack) - 1]
        ) {
            $this->_closeHandler($parser, 'li');
        }

        // LISTS: keep track of the list nesting level we are at.
        if (in_array($name, $this->listTags, true)) {
            $this->_listScope++;
        }
        if ($name == 'li') {
            array_push($this->_liStack, $this->_listScope);
        }

        $this->_xhtml .= '<' . $name;
        $this->_writeAttrs($attrs, $name);
        $this->_xhtml .= '>';
        array_push($this->_stack, $name);
        $this->_counter[$name] = isset($this->_counter[$name]) ? $this->_counter[$name] + 1 : 1;
        return true;
    }

    /**
     * Parser callback for a closing tag.
     *
     * Closes the named tag together with every tag opened after it that is
     * still open. Closing tags without a matching open tag are ignored.
     *
     * @param object $parser Parser instance (unused).
     * @param string $name   Tag name.
     * @return bool Always true.
     */
    public function _closeHandler(&$parser, $name)
    {
        $name = strtolower($name);

        // Leaving a discarded-content tag: unwind its stack down to it.
        if (isset($this->_dcCounter[$name]) && $this->_dcCounter[$name] > 0
            && in_array($name, $this->deleteTagsContent, true)
        ) {
            // The null check stops the loop if the stack runs empty.
            while (null !== ($tag = array_pop($this->_dcStack)) && $name != $tag) {
                $this->_dcCounter[$tag]--;
            }

            $this->_dcCounter[$name]--;
        }

        if (count($this->_dcStack) != 0) {
            return true;
        }

        if (isset($this->_counter[$name]) && $this->_counter[$name] > 0) {
            while (null !== ($tag = array_pop($this->_stack)) && $name != $tag) {
                $this->_closeTag($tag);
            }

            $this->_closeTag($name);
        }
        return true;
    }

    /**
     * Emits the closing markup for one tag (unless listed in $noClose) and
     * updates the open-tag counter and the list bookkeeping.
     *
     * @param string $tag Lower-case tag name.
     * @return bool Always true.
     */
    public function _closeTag($tag)
    {
        if (!in_array($tag, $this->noClose, true)) {
            $this->_xhtml .= '</' . $tag . '>';
        }

        $this->_counter[$tag]--;

        if (in_array($tag, $this->listTags, true)) {
            $this->_listScope--;
        }

        if ($tag == 'li') {
            array_pop($this->_liStack);
        }
        return true;
    }

    /**
     * Parser callback for character data: copied to the output unless a
     * discarded-content tag is currently open.
     *
     * @param object $parser Parser instance (unused).
     * @param string $data   Text between tags.
     * @return bool Always true.
     */
    public function _dataHandler(&$parser, $data)
    {
        if (count($this->_dcStack) == 0) {
            $this->_xhtml .= $data;
        }
        return true;
    }

    /**
     * Parser callback for escape sections: their content is dropped.
     *
     * @param object $parser Parser instance (unused).
     * @param string $data   Content of the escape section (unused).
     * @return bool Always true.
     */
    public function _escapeHandler(&$parser, $data)
    {
        return true;
    }

    /**
     * Closes every tag that is still open and returns the sanitized markup.
     *
     * @return string
     */
    public function getXHTML()
    {
        // Compare against null so that a falsy tag name cannot end the loop
        // early and leave tags open.
        while (null !== ($tag = array_pop($this->_stack))) {
            $this->_closeTag($tag);
        }

        return $this->_xhtml;
    }

    /**
     * Resets the instance so that it can sanitize another document.
     *
     * Besides the output buffer this also resets the tag bookkeeping;
     * otherwise an unclosed discarded-content tag (e.g. <script>) left in
     * $_dcStack by one document would silence all output of the next one.
     *
     * @return bool Always true.
     */
    public function clear()
    {
        $this->_xhtml = '';
        $this->_counter = array();
        $this->_stack = array();
        $this->_dcCounter = array();
        $this->_dcStack = array();
        $this->_listScope = 0;
        $this->_liStack = array();
        return true;
    }

    /**
     * Sanitizes a document.
     *
     * The output is appended to whatever $_xhtml already holds; call clear()
     * before reusing an instance.
     *
     * @param string $doc Untrusted HTML.
     * @return string Sanitized markup (see getXHTML()).
     */
    public function parse($doc)
    {
        // Escape every '<' that cannot start a tag, comment or directive.
        $doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '&lt;', $doc);

        // Web documents shouldn't contain the \x00 symbol.
        $doc = str_replace("\x00", '', $doc);

        // Opera6 bug workaround.
        $doc = str_replace("\xC0\xBC", '&lt;', $doc);

        // Decode the ASCII part of UTF-7 encoded sequences.
        $doc = $this->repackUTF7($doc);

        // Instantiate the parser and register this object as its handler.
        $parser = new XML_HTMLSax3();
        $parser->set_object($this);

        $parser->set_element_handler('_openHandler', '_closeHandler');
        $parser->set_data_handler('_dataHandler');
        $parser->set_escape_handler('_escapeHandler');

        $parser->parse($doc);

        return $this->getXHTML();
    }

    /**
     * Rewrites every "+<base64>-" sequence of the text through
     * repackUTF7Callback().
     *
     * @param string $str Text possibly containing UTF-7 shifted sequences.
     * @return string
     */
    public function repackUTF7($str)
    {
        return preg_replace_callback('!\+([0-9a-zA-Z/]+)\-!', array($this, 'repackUTF7Callback'), $str);
    }

    /**
     * preg_replace_callback() handler for repackUTF7().
     *
     * Base64-decodes the sequence; two-byte units whose first byte is \x00
     * are reduced to their second byte, while a run of other units following
     * the leading \x00 units is re-encoded by repackUTF7Back().
     *
     * @param array $str Match; $str[1] holds the base64 payload.
     * @return string
     */
    public function repackUTF7Callback($str)
    {
        $str = base64_decode($str[1]);
        $str = preg_replace_callback('/^((?:\x00.)*)((?:[^\x00].)+)/', array($this, 'repackUTF7Back'), $str);
        return preg_replace('/\x00(.)/', '$1', $str);
    }

    /**
     * preg_replace_callback() handler for repackUTF7Callback(): keeps the
     * \x00-prefixed units ($str[1]) and wraps the remaining units ($str[2])
     * back into a "+<base64>-" sequence without padding.
     *
     * @param array $str Match groups.
     * @return string
     */
    public function repackUTF7Back($str)
    {
        return $str[1] . '+' . rtrim(base64_encode($str[2]), '=') . '-';
    }
}
382
383 ?>