[PLUGINS] +abomailman
[ptitvelo/web/www.git] / www / plugins / facteur / lib / markdownify / parsehtml / parsehtml.php
1 <?php
2 /**
3 * parseHTML is a HTML parser which works with PHP 4 and above.
4 * It tries to handle invalid HTML to some degree.
5 *
6 * @version 1.0 beta
7 * @author Milian Wolff (mail@milianw.de, http://milianw.de)
8 * @license LGPL, see LICENSE_LGPL.txt and the summary below
9 * @copyright (C) 2007 Milian Wolff
10 *
11 * This library is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License as published by the Free Software Foundation; either
14 * version 2.1 of the License, or (at your option) any later version.
15 *
16 * This library is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with this library; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 */
class parseHTML {
    /**
     * tags which are always empty (<br /> etc.)
     *
     * @var array<string>
     */
    var $emptyTags = array(
        'br',
        'hr',
        'input',
        'img',
        'area',
        'link',
        'meta',
        'param',
    );
    /**
     * tags with preformatted text
     * whitespace is left untouched inside them
     *
     * @var array<string>
     */
    var $preformattedTags = array(
        'script',
        'style',
        'pre',
        'code',
    );
    /**
     * suppress HTML tags inside <code> (they are turned into text instead)
     *
     * @var bool
     */
    var $noTagsInCode = false;
    /**
     * html to be parsed; nextNode() consumes it from the front
     *
     * @var string
     */
    var $html = '';
    /**
     * node type:
     *
     * - tag (see isStartTag)
     * - text (includes cdata)
     * - comment
     * - doctype
     * - pi (processing instruction)
     *
     * @var string
     */
    var $nodeType = '';
    /**
     * current node content, i.e. either a
     * simple string (text node), or something like
     * <tag attrib="value"...>
     *
     * @var string
     */
    var $node = '';
    /**
     * whether current node is an opening tag (<a>) or not (</a>)
     * set to NULL if current node is not a tag
     * NOTE: empty tags (<br />) set this to true as well!
     *
     * @var bool | null
     */
    var $isStartTag = null;
    /**
     * whether current node is an empty tag (<br />) or not (<a></a>)
     *
     * @var bool | null
     */
    var $isEmptyTag = null;
    /**
     * tag name
     *
     * @var string | null
     */
    var $tagName = '';
    /**
     * attributes of current tag
     *
     * @var array (attribName=>value) | null
     */
    var $tagAttributes = null;
    /**
     * whether the current tag is a block element
     *
     * @var bool | null
     */
    var $isBlockElement = null;

    /**
     * keep whitespace
     * counts how many preformatted tags are currently open
     *
     * @var int
     */
    var $keepWhitespace = 0;
    /**
     * whether a text node consisting of a single space should be skipped
     *
     * Kept per instance so that one parser cannot influence the
     * whitespace handling of another one.
     *
     * @var bool
     */
    var $skipWhitespace = true;
    /**
     * list of open tags
     * count this to get current depth
     *
     * @var array
     */
    var $openTags = array();
    /**
     * list of known elements
     *
     * Only tags listed here are accepted by parseTag(); everything else
     * is treated as text.
     *
     * @var array
     * TODO: what shall we do with <del> and <ins> ?!
     *       (they are currently treated as inline elements)
     */
    var $blockElements = array (
        # tag name => <bool> is block
        # block elements
        'address' => true,
        'blockquote' => true,
        'center' => true,
        'dir' => true,
        'div' => true,
        'dl' => true,
        'fieldset' => true,
        'form' => true,
        'h1' => true,
        'h2' => true,
        'h3' => true,
        'h4' => true,
        'h5' => true,
        'h6' => true,
        'hr' => true,
        'isindex' => true,
        'menu' => true,
        'noframes' => true,
        'noscript' => true,
        'ol' => true,
        'p' => true,
        'pre' => true,
        'table' => true,
        'ul' => true,
        # set table elements and list items to block as well
        'thead' => true,
        'tbody' => true,
        'tfoot' => true,
        'td' => true,
        'tr' => true,
        'th' => true,
        'li' => true,
        'dd' => true,
        'dt' => true,
        # header items and html / body as well
        'html' => true,
        'body' => true,
        'head' => true,
        'meta' => true,
        'link' => true,
        'style' => true,
        'title' => true,
        # unfancy media tags, when indented should be rendered as block
        'map' => true,
        'object' => true,
        'param' => true,
        'embed' => true,
        'area' => true,
        # inline elements
        'a' => false,
        'abbr' => false,
        'acronym' => false,
        'applet' => false,
        'b' => false,
        'basefont' => false,
        'bdo' => false,
        'big' => false,
        'br' => false,
        'button' => false,
        'cite' => false,
        'code' => false,
        'del' => false,
        'dfn' => false,
        'em' => false,
        'font' => false,
        'i' => false,
        'img' => false,
        'ins' => false,
        'input' => false,
        'iframe' => false,
        'kbd' => false,
        'label' => false,
        'q' => false,
        'samp' => false,
        'script' => false,
        'select' => false,
        'small' => false,
        'span' => false,
        'strong' => false,
        'sub' => false,
        'sup' => false,
        'textarea' => false,
        'tt' => false,
        'var' => false,
    );
    /**
     * get next node, set $this->html prior!
     *
     * Cuts the next node off the front of $this->html and stores it in
     * $this->node / $this->nodeType (plus the tag specific properties).
     *
     * @param void
     * @return bool false once $this->html is exhausted, true otherwise
     */
    function nextNode() {
        # do not use empty() here: the string "0" is valid remaining input
        if ($this->html === null || $this->html === false || $this->html === '') {
            # we are done with parsing the html string
            return false;
        }
        # the previous node was an opening tag: register it only now, so that
        # count($this->openTags) did not include it while it was current
        if ($this->isStartTag && !$this->isEmptyTag) {
            array_push($this->openTags, $this->tagName);
            if (in_array($this->tagName, $this->preformattedTags, true)) {
                # dont truncate whitespaces for <code> or <pre> contents
                $this->keepWhitespace++;
            }
        }

        if ($this->html[0] == '<') {
            $token = substr($this->html, 0, 9);
            if (substr($token, 0, 2) == '<?') {
                # xml prolog or other pi's
                /** TODO **/
                $this->setNode('pi', $this->positionAfter('>'));
                return true;
            }
            if (substr($token, 0, 4) == '<!--') {
                # comment
                $pos = strpos($this->html, '-->');
                if ($pos === false) {
                    # could not find a closing -->, use next gt instead
                    # this is firefox' behaviour
                    $pos = $this->positionAfter('>');
                } else {
                    $pos += 3;
                }
                $this->setNode('comment', $pos);

                $this->skipWhitespace = true;
                return true;
            }
            # the doctype keyword is case-insensitive (<!doctype html>)
            if (strtoupper($token) == '<!DOCTYPE') {
                # doctype
                $this->setNode('doctype', $this->positionAfter('>'));

                $this->skipWhitespace = true;
                return true;
            }
            if ($token == '<![CDATA[') {
                # cdata, use text node

                # remove leading <![CDATA[
                $this->html = substr($this->html, 9);

                $end = strpos($this->html, ']]>');
                if ($end === false) {
                    # unterminated section: the remaining input is its content
                    $this->setNode('text', strlen($this->html));
                } else {
                    $this->setNode('text', $end + 3);
                    # remove trailing ]]>
                    $this->node = substr($this->node, 0, -3);
                }
                $this->handleWhitespaces();

                $this->skipWhitespace = true;
                return true;
            }
            if ($this->parseTag()) {
                # seems to be a tag
                # whitespace directly after a block element is insignificant
                $this->skipWhitespace = $this->isBlockElement ? true : false;
                return true;
            }
            # parseTag() failed and replaced the leading '<' by '&lt;',
            # so the input is handled as text below
        }
        if ($this->keepWhitespace) {
            $this->skipWhitespace = false;
        }
        # when we get here it seems to be a text node
        $pos = strpos($this->html, '<');
        if ($pos === false) {
            $pos = strlen($this->html);
        }
        $this->setNode('text', $pos);
        $this->handleWhitespaces();
        if ($this->skipWhitespace && $this->node == ' ') {
            return $this->nextNode();
        }
        $this->skipWhitespace = false;
        return true;
    }
    /**
     * parse tag, set tag name and attributes, see if it's a closing tag and so forth...
     *
     * On failure the leading '<' of $this->html is escaped (see invalidTag())
     * and nothing else is changed.
     *
     * @param void
     * @return bool true if a valid, known tag was consumed
     */
    function parseTag() {
        $a_ord = ord('a');
        $z_ord = ord('z');
        $special_ords = array(
            ord(':'), // for xml:lang
            ord('-'), // for http-equiv
        );

        $tagName = '';

        $pos = 1;
        # a lone '<' at the end of the input has no character at offset 1
        $isStartTag = !(isset($this->html[$pos]) && $this->html[$pos] == '/');
        if (!$isStartTag) {
            $pos++;
        }
        # get tagName: letters, and digits after the first character (h1..h6)
        while (isset($this->html[$pos])) {
            $pos_ord = ord(strtolower($this->html[$pos]));
            if (($pos_ord >= $a_ord && $pos_ord <= $z_ord) || (!empty($tagName) && is_numeric($this->html[$pos]))) {
                $tagName .= $this->html[$pos];
                $pos++;
            } else {
                # step back onto the last character of the name
                $pos--;
                break;
            }
        }

        $tagName = strtolower($tagName);
        if (empty($tagName) || !isset($this->blockElements[$tagName])) {
            # something went wrong => invalid tag
            $this->invalidTag();
            return false;
        }
        if ($this->noTagsInCode && end($this->openTags) == 'code' && !($tagName == 'code' && !$isStartTag)) {
            # we suppress all HTML tags inside code tags
            $this->invalidTag();
            return false;
        }

        # get tag attributes
        /** TODO: in html 4 attributes do not need to be quoted **/
        $isEmptyTag = false;
        $attributes = array();
        $currAttrib = '';
        while (isset($this->html[$pos+1])) {
            $pos++;
            $char = $this->html[$pos];
            # substr() never reads past the end of the string
            $pair = substr($this->html, $pos, 2);
            # close tag
            if ($char == '>' || $pair === '/>') {
                if ($char == '/') {
                    $isEmptyTag = true;
                    $pos++;
                }
                break;
            }

            $pos_ord = ord(strtolower($char));
            if (($pos_ord >= $a_ord && $pos_ord <= $z_ord) || in_array($pos_ord, $special_ords, true)) {
                # attribute name
                $currAttrib .= $char;
            } elseif (in_array($char, array(' ', "\t", "\n", "\r"), true)) {
                # drop whitespace (including the CR of CRLF line endings)
            } elseif ($pair === '="' || $pair === "='") {
                # get attribute value
                $pos++;
                $await = $this->html[$pos]; # single or double quote
                $pos++;
                $value = '';
                while (isset($this->html[$pos]) && $this->html[$pos] != $await) {
                    $value .= $this->html[$pos];
                    $pos++;
                }
                $attributes[$currAttrib] = $value;
                $currAttrib = '';
            } else {
                $this->invalidTag();
                return false;
            }
        }
        # $pos may point behind the input for truncated tags
        if (!isset($this->html[$pos]) || $this->html[$pos] != '>') {
            $this->invalidTag();
            return false;
        }

        if (!empty($currAttrib)) {
            # html 4 allows something like <option selected> instead of <option selected="selected">
            $attributes[$currAttrib] = $currAttrib;
        }
        if (!$isStartTag) {
            if (!empty($attributes) || $tagName != end($this->openTags)) {
                # end tags must not contain any attributes
                # or maybe we did not expect a different tag to be closed
                $this->invalidTag();
                return false;
            }
            array_pop($this->openTags);
            if (in_array($tagName, $this->preformattedTags, true)) {
                $this->keepWhitespace--;
            }
        }
        $pos++;
        $this->node = substr($this->html, 0, $pos);
        $this->html = substr($this->html, $pos);
        $this->tagName = $tagName;
        $this->tagAttributes = $attributes;
        $this->isStartTag = $isStartTag;
        $this->isEmptyTag = $isEmptyTag || in_array($tagName, $this->emptyTags, true);
        if ($this->isEmptyTag) {
            # might be not well formed
            $this->node = preg_replace('# */? *>$#', ' />', $this->node);
        }
        $this->nodeType = 'tag';
        $this->isBlockElement = $this->blockElements[$tagName];
        return true;
    }
    /**
     * handle invalid tags
     *
     * Escapes the leading '<' of $this->html so that the would-be tag
     * is picked up as text.
     *
     * @param void
     * @return void
     */
    function invalidTag() {
        $this->html = substr_replace($this->html, '&lt;', 0, 1);
    }
    /**
     * update all vars and make $this->html shorter
     *
     * @param string $type see description for $this->nodeType
     * @param int $pos to which position shall we cut?
     * @return void
     */
    function setNode($type, $pos) {
        if ($this->nodeType == 'tag') {
            # set tag specific vars to null
            # $type == tag should not be called here
            # see this::parseTag() for more
            $this->tagName = null;
            $this->tagAttributes = null;
            $this->isStartTag = null;
            $this->isEmptyTag = null;
            $this->isBlockElement = null;
        }
        $this->nodeType = $type;
        $this->node = substr($this->html, 0, $pos);
        $this->html = substr($this->html, $pos);
    }
    /**
     * position right behind the first occurrence of $needle in $this->html
     *
     * @param string $needle
     * @return int length of $this->html if $needle does not occur at all
     */
    function positionAfter($needle) {
        $pos = strpos($this->html, $needle);
        if ($pos === false) {
            return strlen($this->html);
        }
        return $pos + strlen($needle);
    }
    /**
     * check if $this->html begins with $str
     *
     * @param string $str
     * @return bool
     */
    function match($str) {
        return substr($this->html, 0, strlen($str)) == $str;
    }
    /**
     * truncate whitespaces
     *
     * @param void
     * @return void
     */
    function handleWhitespaces() {
        if ($this->keepWhitespace) {
            # <pre> or <code> before...
            return;
        }
        # truncate multiple whitespaces to a single one
        $this->node = preg_replace('#\s+#s', ' ', $this->node);
    }
    /**
     * normalize self::node
     *
     * Rebuilds the tag source from tagName / tagAttributes: attribute
     * values are always double quoted and empty tags end in " />".
     *
     * @param void
     * @return void
     */
    function normalizeNode() {
        $this->node = '<';
        if (!$this->isStartTag) {
            $this->node .= '/'.$this->tagName.'>';
            return;
        }
        $this->node .= $this->tagName;
        foreach ($this->tagAttributes as $name => $value) {
            $this->node .= ' '.$name.'="'.str_replace('"', '&quot;', $value).'"';
        }
        if ($this->isEmptyTag) {
            $this->node .= ' /';
        }
        $this->node .= '>';
    }
}
521
/**
 * re-indent a HTML string
 *
 * The string is tokenized with parseHTML; every tag is normalized and
 * block level tags are put on lines of their own, prefixed with one copy
 * of $indent per currently open block element.
 *
 * @param string $html
 * @param string $indent optional, string used for one level of indentation
 * @param bool $noTagsInCode optional, handed on to parseHTML::$noTagsInCode
 * @return string
 */
function indentHTML($html, $indent = " ", $noTagsInCode = false) {
    $parser = new parseHTML;
    $parser->html = $html;
    $parser->noTagsInCode = $noTagsInCode;

    $out = '';
    # true while the output ends right behind a line break written by us
    $atLineStart = true;
    # one copy of $indent per open block element
    $levels = array();

    while ($parser->nextNode()) {
        $isTag = ($parser->nodeType == 'tag');
        if ($isTag) {
            $parser->normalizeNode();
        }

        if (!($isTag && $parser->isBlockElement)) {
            # inline tag, text, comment, pi or doctype
            if ($isTag && $parser->tagName == 'br') {
                # a line break in the document ends the output line as well
                $out .= $parser->node."\n";
                $atLineStart = true;
                continue;
            }
            if ($atLineStart && !$parser->keepWhitespace) {
                $out .= implode('', $levels);
                $parser->node = ltrim($parser->node);
            }
            $out .= $parser->node;

            switch ($parser->nodeType) {
                case 'comment':
                case 'pi':
                case 'doctype':
                    $out .= "\n";
                    break;
                default:
                    $atLineStart = false;
            }
            continue;
        }

        # block level tag
        $preOrCode = ($parser->tagName == 'code' || $parser->tagName == 'pre');
        $collapsing = !$parser->keepWhitespace;

        if ($collapsing && !$atLineStart && !$preOrCode) {
            # finish the pending inline content
            $out = rtrim($out)."\n";
        }

        if ($parser->isStartTag) {
            $out .= implode('', $levels);
            if (!$parser->isEmptyTag) {
                $levels[] = $indent;
            }
        } else {
            array_pop($levels);
            if (!$preOrCode) {
                $out .= implode('', $levels);
            }
        }
        $out .= $parser->node;

        # no line break right behind an opening <pre> / <code>
        if ($collapsing && (!$preOrCode || !$parser->isStartTag)) {
            $out .= "\n";
        }
        $atLineStart = true;
    }

    return $out;
}
581 /*
582 # testcase / example
583 error_reporting(E_ALL);
584
585 $html = '<p>Simple block on one line:</p>
586
587 <div>foo</div>
588
589 <p>And nested without indentation:</p>
590
591 <div>
592 <div>
593 <div>
594 foo
595 </div>
596 <div style=">"/>
597 </div>
598 <div>bar</div>
599 </div>
600
601 <p>And with attributes:</p>
602
603 <div>
604 <div id="foo">
605 </div>
606 </div>
607
608 <p>This was broken in 1.0.2b7:</p>
609
610 <div class="inlinepage">
611 <div class="toggleableend">
612 foo
613 </div>
614 </div>';
615 #$html = '<a href="asdfasdf" title=\'asdf\' foo="bar">asdf</a>';
616 echo indentHTML($html);
617 die();
618 */