3 * parseHTML is an HTML parser which works with PHP 4 and above.
4 * It tries to handle invalid HTML to some degree.
7 * @author Milian Wolff (mail@milianw.de, http://milianw.de)
8 * @license LGPL, see LICENSE_LGPL.txt and the summary below
9 * @copyright (C) 2007 Milian Wolff
11 * This library is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License as published by the Free Software Foundation; either
14 * version 2.1 of the License, or (at your option) any later version.
16 * This library is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with this library; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27 * tags which are always empty (<br /> etc.)
31 var $emptyTags = array(
42 * tags with preformatted text
43 * whitespace won't be touched inside them
47 var $preformattedTags = array(
54 * suppress HTML tags inside preformatted tags (see above)
58 var $noTagsInCode = false;
68 * - tag (see isStartTag)
69 * - text (includes cdata)
72 * - pi (processing instruction)
78 * current node content, i.e. either a
79 * simple string (text node), or something like
80 * <tag attrib="value"...>
86 * whether the current node is an opening tag (<a>) or not (</a>)
87 * set to NULL if current node is not a tag
88 * NOTE: empty tags (<br />) set this to true as well!
92 var $isStartTag = null;
94 * whether the current node is an empty tag (<br />) or not (<a></a>)
98 var $isEmptyTag = null;
106 * attributes of current tag
108 * @var array (attribName=>value) | null
110 var $tagAttributes = null;
112 * whether the current tag is a block element
116 var $isBlockElement = null;
123 var $keepWhitespace = 0;
126 * count this to get current depth
130 var $openTags = array();
132 * list of block elements
135 * TODO: what shall we do with <del> and <ins> ?!
137 var $blockElements = array (
138 # tag name => <bool> is block
141 'blockquote' => true,
166 # set table elements and list items to block as well
176 # header items and html / body as well
184 # unfancy media tags, when indented should be rendered as block
228 * get next node, set $this->html prior!
233 function nextNode() {
234 if (empty($this->html
)) {
235 # we are done with parsing the html string
238 static $skipWhitespace = true;
239 if ($this->isStartTag
&& !$this->isEmptyTag
) {
240 array_push($this->openTags
, $this->tagName
);
241 if (in_array($this->tagName
, $this->preformattedTags
)) {
242 # don't truncate whitespace for <code> or <pre> contents
243 $this->keepWhitespace++
;
247 if ($this->html
[0] == '<') {
248 $token = substr($this->html
, 0, 9);
249 if (substr($token, 0, 2) == '<?') {
250 # xml prolog or other pi's
252 #trigger_error('this might need some work', E_USER_NOTICE);
253 $pos = strpos($this->html
, '>');
254 $this->setNode('pi', $pos +
1);
257 if (substr($token, 0, 4) == '<!--') {
259 $pos = strpos($this->html
, '-->');
260 if ($pos === false) {
261 # could not find a closing -->, use next gt instead
262 # this is firefox' behaviour
263 $pos = strpos($this->html
, '>') +
1;
267 $this->setNode('comment', $pos);
269 $skipWhitespace = true;
272 if ($token == '<!DOCTYPE') {
274 $this->setNode('doctype', strpos($this->html
, '>')+
1);
276 $skipWhitespace = true;
279 if ($token == '<![CDATA[') {
280 # cdata, use text node
282 # remove leading <![CDATA[
283 $this->html
= substr($this->html
, 9);
285 $this->setNode('text', strpos($this->html
, ']]>')+
3);
287 # remove trailing ]]> and trim
288 $this->node
= substr($this->node
, 0, -3);
289 $this->handleWhitespaces();
291 $skipWhitespace = true;
294 if ($this->parseTag()) {
297 if ($this->isBlockElement
) {
298 $skipWhitespace = true;
300 $skipWhitespace = false;
305 if ($this->keepWhitespace
) {
306 $skipWhitespace = false;
308 # when we get here it seems to be a text node
309 $pos = strpos($this->html
, '<');
310 if ($pos === false) {
311 $pos = strlen($this->html
);
313 $this->setNode('text', $pos);
314 $this->handleWhitespaces();
315 if ($skipWhitespace && $this->node
== ' ') {
316 return $this->nextNode();
318 $skipWhitespace = false;
322 * parse tag, set tag name and attributes, see if it's a closing tag and so forth...
327 function parseTag() {
328 static $a_ord, $z_ord, $special_ords;
329 if (!isset($a_ord)) {
332 $special_ords = array(
333 ord(':'), // for xml:lang
334 ord('-'), // for http-equiv
341 $isStartTag = $this->html
[$pos] != '/';
346 while (isset($this->html
[$pos])) {
347 $pos_ord = ord(strtolower($this->html
[$pos]));
348 if (($pos_ord >= $a_ord && $pos_ord <= $z_ord) ||
(!empty($tagName) && is_numeric($this->html
[$pos]))) {
349 $tagName .= $this->html
[$pos];
357 $tagName = strtolower($tagName);
358 if (empty($tagName) ||
!isset($this->blockElements
[$tagName])) {
359 # something went wrong => invalid tag
363 if ($this->noTagsInCode
&& end($this->openTags
) == 'code' && !($tagName == 'code' && !$isStartTag)) {
364 # we suppress all HTML tags inside code tags
370 /** TODO: in html 4 attributes do not need to be quoted **/
372 $attributes = array();
374 while (isset($this->html
[$pos+
1])) {
377 if ($this->html
[$pos] == '>' ||
$this->html
[$pos].$this->html
[$pos+
1] == '/>') {
378 if ($this->html
[$pos] == '/') {
385 $pos_ord = ord(strtolower($this->html
[$pos]));
386 if ( ($pos_ord >= $a_ord && $pos_ord <= $z_ord) ||
in_array($pos_ord, $special_ords)) {
388 $currAttrib .= $this->html
[$pos];
389 } elseif (in_array($this->html
[$pos], array(' ', "\t", "\n"))) {
391 } elseif (in_array($this->html
[$pos].$this->html
[$pos+
1], array('="', "='"))) {
392 # get attribute value
394 $await = $this->html
[$pos]; # single or double quote
397 while (isset($this->html
[$pos]) && $this->html
[$pos] != $await) {
398 $value .= $this->html
[$pos];
401 $attributes[$currAttrib] = $value;
408 if ($this->html
[$pos] != '>') {
413 if (!empty($currAttrib)) {
414 # html 4 allows something like <option selected> instead of <option selected="selected">
415 $attributes[$currAttrib] = $currAttrib;
418 if (!empty($attributes) ||
$tagName != end($this->openTags
)) {
419 # end tags must not contain any attributes
420 # or maybe we did not expect a different tag to be closed
424 array_pop($this->openTags
);
425 if (in_array($tagName, $this->preformattedTags
)) {
426 $this->keepWhitespace
--;
430 $this->node
= substr($this->html
, 0, $pos);
431 $this->html
= substr($this->html
, $pos);
432 $this->tagName
= $tagName;
433 $this->tagAttributes
= $attributes;
434 $this->isStartTag
= $isStartTag;
435 $this->isEmptyTag
= $isEmptyTag ||
in_array($tagName, $this->emptyTags
);
436 if ($this->isEmptyTag
) {
437 # might be not well formed
438 $this->node
= preg_replace('# */? *>$#', ' />', $this->node
);
440 $this->nodeType
= 'tag';
441 $this->isBlockElement
= $this->blockElements
[$tagName];
445 * handle invalid tags
function invalidTag() {
  # Neutralise a '<' that does not start a valid tag: the first character
  # of $this->html is replaced by the entity &lt;, so the remaining input
  # no longer begins with a tag opener and can be consumed as plain text.
  # Writing a raw '<' here would leave $this->html unchanged.
  $this->html = substr_replace($this->html, '&lt;', 0, 1);
}
454 * update all vars and make $this->html shorter
456 * @param string $type see description for $this->nodeType
457 * @param int $pos to which position shall we cut?
function setNode($type, $pos) {
  # Make the first $pos bytes of $this->html the current node of kind
  # $type and drop them from the unparsed input.
  #
  # $type: node type to record in $this->nodeType ('tag' is not expected
  #        here, tags are set up by parseTag())
  # $pos:  number of leading bytes of $this->html that form the node
  $previousWasTag = ($this->nodeType == 'tag');
  $buffer = $this->html;

  if ($previousWasTag) {
    # the node being replaced was a tag, so its tag-only state is stale
    $tagOnlyFields = array(
      'tagName',
      'tagAttributes',
      'isStartTag',
      'isEmptyTag',
      'isBlockElement',
    );
    foreach ($tagOnlyFields as $field) {
      $this->$field = null;
    }
  }

  $this->nodeType = $type;
  $this->node = substr($buffer, 0, $pos);
  $this->html = substr($buffer, $pos);
}
477 * check if $this->html begins with $str
function match($str) {
  # Report whether the unparsed input $this->html starts with $str.
  #
  # $str: literal prefix to look for
  # returns: bool
  #
  # strncmp() compares the bytes themselves. A loose == between two strings
  # is evaluated numerically when both look like numbers, so e.g. "010"
  # would wrongly be accepted as a match for "1e1".
  return strncmp($this->html, $str, strlen($str)) === 0;
}
486 * truncate whitespaces
function handleWhitespaces() {
  # Collapse whitespace in the current node, unless whitespace has to be
  # kept ($this->keepWhitespace is non-zero, i.e. a <pre> or <code> came
  # before), in which case the node is left exactly as it is.
  if (!$this->keepWhitespace) {
    # every run of whitespace becomes one single space
    $this->node = preg_replace('#\s+#s', ' ', $this->node);
  }
}
500 * normalize self::node
function normalizeNode() {
  # Rebuild $this->node from the parsed tag data ($this->tagName,
  # $this->tagAttributes, $this->isStartTag, $this->isEmptyTag) so the tag
  # is written in one uniform form:
  #   closing tag: </name>
  #   opening tag: <name attr="value" ...>
  #   empty tag:   <name attr="value" ... />
  $this->node = '<';
  if (!$this->isStartTag) {
    # closing tags are written without attributes
    $this->node .= '/'.$this->tagName.'>';
    return;
  }
  $this->node .= $this->tagName;
  foreach ($this->tagAttributes as $name => $value) {
    # Values are always emitted inside double quotes. A value that came
    # from a single-quoted attribute may contain a literal double quote,
    # which has to be written as &quot; or it would end the attribute early.
    $this->node .= ' '.$name.'="'.str_replace('"', '&quot;', $value).'"';
  }
  if ($this->isEmptyTag) {
    $this->node .= ' /';
  }
  $this->node .= '>';
}
523 * indent an HTML string properly
525 * @param string $html
526 * @param string $indent optional
529 function indentHTML($html, $indent = " ", $noTagsInCode = false) {
530 $parser = new parseHTML
;
531 $parser->noTagsInCode
= $noTagsInCode;
532 $parser->html
= $html;
534 $last = true; # last tag was block elem
536 while($parser->nextNode()) {
537 if ($parser->nodeType
== 'tag') {
538 $parser->normalizeNode();
540 if ($parser->nodeType
== 'tag' && $parser->isBlockElement
) {
541 $isPreOrCode = in_array($parser->tagName
, array('code', 'pre'));
542 if (!$parser->keepWhitespace
&& !$last && !$isPreOrCode) {
543 $html = rtrim($html)."\n";
545 if ($parser->isStartTag
) {
546 $html .= implode($indent_a);
547 if (!$parser->isEmptyTag
) {
548 array_push($indent_a, $indent);
551 array_pop($indent_a);
553 $html .= implode($indent_a);
556 $html .= $parser->node
;
557 if (!$parser->keepWhitespace
&& !($isPreOrCode && $parser->isStartTag
)) {
562 if ($parser->nodeType
== 'tag' && $parser->tagName
== 'br') {
563 $html .= $parser->node
."\n";
566 } elseif ($last && !$parser->keepWhitespace
) {
567 $html .= implode($indent_a);
568 $parser->node
= ltrim($parser->node
);
570 $html .= $parser->node
;
572 if (in_array($parser->nodeType
, array('comment', 'pi', 'doctype'))) {
583 error_reporting(E_ALL);
585 $html = '<p>Simple block on one line:</p>
589 <p>And nested without indentation:</p>
601 <p>And with attributes:</p>
608 <p>This was broken in 1.0.2b7:</p>
610 <div class="inlinepage">
611 <div class="toggleableend">
615 #$html = '<a href="asdfasdf" title=\'asdf\' foo="bar">asdf</a>';
616 echo indentHTML($html);