2 /* Copyright (C) 2008 Guy Van den Broeck <guy@guyvdb.eu>
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * or see http://www.gnu.org/
21 * Any element in the DOM tree of an HTML document.
27 protected $parentTree;
29 public $whiteBefore = false;
31 public $whiteAfter = false;
33 function __construct($parent) {
34 $this->parent
= $parent;
37 public function getParentTree() {
38 if (!isset($this->parentTree
)) {
39 if (!is_null($this->parent
)) {
40 $this->parentTree
= $this->parent
->getParentTree();
41 $this->parentTree
[] = $this->parent
;
43 $this->parentTree
= array();
46 return $this->parentTree
;
49 public function getLastCommonParent(Node
$other) {
50 $result = new LastCommonParentResult();
52 $myParents = $this->getParentTree();
53 $otherParents = $other->getParentTree();
57 $nbMyParents = count($myParents);
58 $nbOtherParents = count($otherParents);
59 while ($isSame && $i < $nbMyParents && $i < $nbOtherParents) {
60 if (!$myParents[$i]->openingTag
=== $otherParents[$i]->openingTag
) {
63 // After a while, the index i-1 must be the last common parent
68 $result->lastCommonParentDepth
= $i - 1;
69 $result->parent
= $myParents[$i - 1];
71 if (!$isSame ||
$nbMyParents > $nbOtherParents) {
72 // Not all tags matched, or all tags matched but
73 // there are tags left in this tree
74 $result->indexInLastCommonParent
= $myParents[$i - 1]->getIndexOf($myParents[$i]);
75 $result->splittingNeeded
= true;
76 } else if ($nbMyParents <= $nbOtherParents) {
77 $result->indexInLastCommonParent
= $myParents[$i - 1]->getIndexOf($this);
82 public function setParent($parent) {
83 $this->parent
= $parent;
84 unset($this->parentTree
);
87 public function inPre() {
88 $tree = $this->getParentTree();
89 foreach ($tree as &$ancestor) {
90 if ($ancestor->isPre()) {
99 * Node that can contain other nodes. Represents an HTML tag.
101 class TagNode
extends Node
{
103 public $children = array();
107 public $attributes = array();
111 function __construct($parent, $qName, /*array*/ $attributes) {
112 parent
::__construct($parent);
113 $this->qName
= strtolower($qName);
114 foreach($attributes as $key => &$value){
115 $this->attributes
[strtolower($key)] = $value;
117 return $this->openingTag
= Xml
::openElement($this->qName
, $this->attributes
);
120 public function addChildAbsolute(Node
$node, $index) {
121 array_splice($this->children
, $index, 0, array($node));
124 public function getIndexOf(Node
$child) {
125 // don't trust array_search with objects
126 foreach ($this->children
as $key => &$value){
127 if ($value === $child) {
134 public function getNbChildren() {
135 return count($this->children
);
138 public function getMinimalDeletedSet($id, &$allDeleted, &$somethingDeleted) {
142 $somethingDeleted = false;
143 $hasNonDeletedDescendant = false;
145 if (empty($this->children
)) {
149 foreach ($this->children
as &$child) {
150 $allDeleted_local = false;
151 $somethingDeleted_local = false;
152 $childrenChildren = $child->getMinimalDeletedSet($id, $allDeleted_local, $somethingDeleted_local);
153 if ($somethingDeleted_local) {
154 $nodes = array_merge($nodes, $childrenChildren);
155 $somethingDeleted = true;
157 if (!$allDeleted_local) {
158 $hasNonDeletedDescendant = true;
161 if (!$hasNonDeletedDescendant) {
162 $nodes = array($this);
168 public function splitUntil(TagNode
$parent, Node
$split, $includeLeft) {
169 $splitOccured = false;
170 if ($parent !== $this) {
171 $part1 = new TagNode(null, $this->qName
, $this->attributes
);
172 $part2 = new TagNode(null, $this->qName
, $this->attributes
);
173 $part1->setParent($this->parent
);
174 $part2->setParent($this->parent
);
178 foreach ($this->children
as &$child)
180 if ($child === $split) {
183 if(!$pastSplit ||
($onSplit && $includeLeft)) {
184 $child->setParent($part1);
185 $part1->children
[] = $child;
187 $child->setParent($part2);
188 $part2->children
[] = $child;
195 $myindexinparent = $this->parent
->getIndexOf($this);
196 if (!empty($part1->children
)) {
197 $this->parent
->addChildAbsolute($part1, $myindexinparent);
199 if (!empty($part2->children
)) {
200 $this->parent
->addChildAbsolute($part2, $myindexinparent);
202 if (!empty($part1->children
) && !empty($part2->children
)) {
203 $splitOccured = true;
206 $this->parent
->removeChild($myindexinparent);
209 $this->parent
->splitUntil($parent, $part1, $includeLeft);
211 $this->parent
->splitUntil($parent, $part2, $includeLeft);
214 return $splitOccured;
218 private function removeChild($index) {
219 unset($this->children
[$index]);
220 $this->children
= array_values($this->children
);
223 public static $blocks = array('html', 'body','p','blockquote', 'h1',
224 'h2', 'h3', 'h4', 'h5', 'pre', 'div', 'ul', 'ol', 'li', 'table',
225 'tbody', 'tr', 'td', 'th', 'br');
227 public function copyTree() {
228 $newThis = new TagNode(null, $this->qName
, $this->attributes
);
229 $newThis->whiteBefore
= $this->whiteBefore
;
230 $newThis->whiteAfter
= $this->whiteAfter
;
231 foreach ($this->children
as &$child) {
232 $newChild = $child->copyTree();
233 $newChild->setParent($newThis);
234 $newThis->children
[] = $newChild;
239 public function getMatchRatio(TagNode
$other) {
240 $txtComp = new TextOnlyComparator($other);
241 return $txtComp->getMatchRatio(new TextOnlyComparator($this));
244 public function expandWhiteSpace() {
248 $nbOriginalChildren = $this->getNbChildren();
249 for ($i = 0; $i < $nbOriginalChildren; ++
$i) {
250 $child = $this->children
[$i +
$shift];
252 if ($child instanceof TagNode
) {
253 if (!$child->isPre()) {
254 $child->expandWhiteSpace();
257 if (!$spaceAdded && $child->whiteBefore
) {
258 $ws = new WhiteSpaceNode(null, ' ', $child->getLeftMostChild());
259 $ws->setParent($this);
260 $this->addChildAbsolute($ws,$i +
($shift++
));
262 if ($child->whiteAfter
) {
263 $ws = new WhiteSpaceNode(null, ' ', $child->getRightMostChild());
264 $ws->setParent($this);
265 $this->addChildAbsolute($ws,$i +
1 +
($shift++
));
274 public function getLeftMostChild() {
275 if (empty($this->children
)) {
278 return $this->children
[0]->getLeftMostChild();
281 public function getRightMostChild() {
282 if (empty($this->children
)) {
285 return $this->children
[$this->getNbChildren() - 1]->getRightMostChild();
288 public function isPre() {
289 return 0 == strcasecmp($this->qName
,'pre');
292 public static function toDiffLine(TagNode
$node) {
293 return $node->openingTag
;
298 * Represents a piece of text in the HTML file.
300 class TextNode
extends Node
{
304 public $modification;
306 function __construct($parent, $text) {
307 parent
::__construct($parent);
308 $this->modification
= new Modification(Modification
::NONE
);
312 public function copyTree() {
313 $clone = clone $this;
314 $clone->setParent(null);
318 public function getLeftMostChild() {
322 public function getRightMostChild() {
326 public function getMinimalDeletedSet($id, &$allDeleted, &$somethingDeleted) {
327 if ($this->modification
->type
== Modification
::REMOVED
328 && $this->modification
->id
== $id){
329 $somethingDeleted = true;
336 public function isSameText($other) {
337 if (is_null($other) ||
! $other instanceof TextNode
) {
340 return str_replace('\n', ' ',$this->text
) === str_replace('\n', ' ',$other->text
);
343 public static function toDiffLine(TextNode
$node) {
344 return str_replace('\n', ' ',$node->text
);
348 class WhiteSpaceNode
extends TextNode
{
350 function __construct($parent, $s, Node
$like = null) {
351 parent
::__construct($parent, $s);
352 if(!is_null($like) && $like instanceof TextNode
) {
353 $newModification = clone $like->modification
;
354 $newModification->firstOfID
= false;
355 $this->modification
= $newModification;
361 * Represents the root of a HTML document.
363 class BodyNode
extends TagNode
{
365 function __construct() {
366 parent
::__construct(null, 'body', array());
369 public function copyTree() {
370 $newThis = new BodyNode();
371 foreach ($this->children
as &$child) {
372 $newChild = $child->copyTree();
373 $newChild->setParent($newThis);
374 $newThis->children
[] = $newChild;
379 public function getMinimalDeletedSet($id, &$allDeleted, &$somethingDeleted) {
381 foreach ($this->children
as &$child) {
382 $childrenChildren = $child->getMinimalDeletedSet($id,
383 $allDeleted, $somethingDeleted);
384 $nodes = array_merge($nodes, $childrenChildren);
392 * Represents an image in HTML. Even though images do not contain any text they
393 * are independent visible objects on the page. They are logically a TextNode.
395 class ImageNode
extends TextNode
{
399 function __construct(TagNode
$parent, /*array*/ $attrs) {
400 if(!array_key_exists('src', $attrs)) {
401 //wfDebug('Image without a source:');
402 //foreach ($attrs as $key => &$value) {
403 //wfDebug("$key = $value");
405 parent
::__construct($parent, '<img></img>');
407 parent
::__construct($parent, '<img>' . strtolower($attrs['src']) . '</img>');
409 $this->attributes
= $attrs;
412 public function isSameText($other) {
413 if (is_null($other) ||
! $other instanceof ImageNode
) {
416 return $this->text
=== $other->text
;
421 class DummyNode
extends Node
{
423 function __construct() {
430 * When detecting the last common parent of two nodes, all results are stored as
431 * a LastCommonParentResult.
433 class LastCommonParentResult
{
439 public $splittingNeeded = false;
442 public $lastCommonParentDepth = -1;
445 public $indexInLastCommonParent = -1;
463 public $firstOfID = false;
467 function __construct($type) {
471 public static function typeToString($type) {
473 case self
::NONE
: return 'none';
474 case self
::REMOVED
: return 'removed';
475 case self
::ADDED
: return 'added';
476 case self
::CHANGED
: return 'changed';
481 class DomTreeBuilder
{
483 public $textNodes = array();
487 private $currentParent;
489 private $newWord = "";
491 protected $bodyStarted = false;
493 protected $bodyEnded = false;
495 private $whiteSpaceBeforeThis = false;
497 private $lastSibling;
499 private $notInPre = true;
501 function __construct() {
502 $this->bodyNode
= $this->currentParent
= new BodyNode();
503 $this->lastSibling
= new DummyNode();
507 * Must be called manually
509 public function endDocument() {
511 //wfDebug(count($this->textNodes) . ' text nodes in document.');
514 public function startElement($parser, $name, /*array*/ $attributes) {
515 if (strcasecmp($name, 'body') != 0) {
516 //wfDebug("Starting $name node.");
519 $newNode = new TagNode($this->currentParent
, $name, $attributes);
520 $this->currentParent
->children
[] = $newNode;
521 $this->currentParent
= $newNode;
522 $this->lastSibling
= new DummyNode();
523 if ($this->whiteSpaceBeforeThis
&& !in_array(strtolower($this->currentParent
->qName
),TagNode
::$blocks)) {
524 $this->currentParent
->whiteBefore
= true;
526 $this->whiteSpaceBeforeThis
= false;
527 if(strcasecmp($name, 'pre') == 0) {
528 $this->notInPre
= false;
533 public function endElement($parser, $name) {
534 if(strcasecmp($name, 'body') != 0) {
535 //wfDebug("Ending $name node.");
536 if (0 == strcasecmp($name,'img')) {
537 // Insert a dummy leaf for the image
538 $img = new ImageNode($this->currentParent
, $this->currentParent
->attributes
);
539 $this->currentParent
->children
[] = $img;
540 $img->whiteBefore
= $this->whiteSpaceBeforeThis
;
541 $this->lastSibling
= $img;
542 $this->textNodes
[] = $img;
545 if (!in_array(strtolower($this->currentParent
->qName
),TagNode
::$blocks)) {
546 $this->lastSibling
= $this->currentParent
;
548 $this->lastSibling
= new DummyNode();
550 $this->currentParent
= $this->currentParent
->parent
;
551 $this->whiteSpaceBeforeThis
= false;
552 if (!$this->notInPre
&& strcasecmp($name, 'pre') == 0) {
553 $this->notInPre
= true;
556 $this->endDocument();
560 const regex
= '/([\s\.\,\"\\\'\(\)\?\:\;\!\{\}\-\+\*\=\_\[\]\&\|\$]{1})/';
561 const whitespace
= '/^[\s]{1}$/';
562 const delimiter
= '/^[\s\.\,\"\\\'\(\)\?\:\;\!\{\}\-\+\*\=\_\[\]\&\|\$]{1}$/';
564 public function characters($parser, $data) {
565 $matches = preg_split(self
::regex
, $data, -1, PREG_SPLIT_DELIM_CAPTURE
);
567 foreach($matches as &$word) {
568 if (preg_match(self
::whitespace
, $word) && $this->notInPre
) {
570 $this->lastSibling
->whiteAfter
= true;
571 $this->whiteSpaceBeforeThis
= true;
572 } else if (preg_match(self
::delimiter
, $word)) {
574 $textNode = new TextNode($this->currentParent
, $word);
575 $this->currentParent
->children
[] = $textNode;
576 $textNode->whiteBefore
= $this->whiteSpaceBeforeThis
;
577 $this->whiteSpaceBeforeThis
= false;
578 $this->lastSibling
= $textNode;
579 $this->textNodes
[] = $textNode;
581 $this->newWord
.= $word;
586 private function endWord() {
587 if (!empty($this->newWord
)) {
588 $node = new TextNode($this->currentParent
, $this->newWord
);
589 $this->currentParent
->children
[] = $node;
590 $node->whiteBefore
= $this->whiteSpaceBeforeThis
;
591 $this->whiteSpaceBeforeThis
= false;
592 $this->lastSibling
= $node;
593 $this->textNodes
[] = $node;
598 public function getDiffLines() {
599 return array_map(array('TextNode','toDiffLine'), $this->textNodes
);
603 class TextNodeDiffer
{
608 private $oldTextNodes;
609 private $oldBodyNode;
611 private $lastModified = array();
615 private $changedID = 0;
617 private $changedIDUsed = false;
619 // used to remove the whitespace between a red and green block
620 private $whiteAfterLastChangedPart = false;
622 private $deletedID = 0;
624 function __construct(DomTreeBuilder
$tree, DomTreeBuilder
$oldTree) {
625 $this->textNodes
= $tree->textNodes
;
626 $this->bodyNode
= $tree->bodyNode
;
627 $this->oldTextNodes
= $oldTree->textNodes
;
628 $this->oldBodyNode
= $oldTree->bodyNode
;
631 public function markAsNew($start, $end) {
632 if ($end <= $start) {
636 if ($this->whiteAfterLastChangedPart
) {
637 $this->textNodes
[$start]->whiteBefore
= false;
640 $nextLastModified = array();
642 for ($i = $start; $i < $end; ++
$i) {
643 $mod = new Modification(Modification
::ADDED
);
644 $mod->id
= $this->newID
;
645 if (count($this->lastModified
) > 0) {
646 $mod->prevMod
= $this->lastModified
[0];
647 if (is_null($this->lastModified
[0]->nextMod
)) {
648 foreach ($this->lastModified
as &$lastMod) {
649 $lastMod->nextMod
= $mod;
653 $nextLastModified[] = $mod;
654 $this->textNodes
[$i]->modification
= $mod;
657 $this->textNodes
[$start]->modification
->firstOfID
= true;
660 $this->lastModified
= $nextLastModified;
663 public function handlePossibleChangedPart($leftstart, $leftend, $rightstart, $rightend) {
667 if ($this->changedIDUsed
) {
669 $this->changedIDUsed
= false;
672 $nextLastModified = array();
675 while ($i < $rightend) {
676 $acthis = new AncestorComparator($this->textNodes
[$i]->getParentTree());
677 $acother = new AncestorComparator($this->oldTextNodes
[$j]->getParentTree());
678 $result = $acthis->getResult($acother);
679 unset($acthis, $acother);
681 $nbLastModified = count($this->lastModified
);
682 if ($result->changed
) {
683 $mod = new Modification(Modification
::CHANGED
);
685 if (!$this->changedIDUsed
) {
686 $mod->firstOfID
= true;
687 if (count($nextLastModified) > 0) {
688 $this->lastModified
= $nextLastModified;
689 $nextLastModified = array();
691 } else if (!is_null($result->changes
) && $result->changes
!== $this->changes
) {
693 $mod->firstOfID
= true;
694 if (count($nextLastModified) > 0) {
695 $this->lastModified
= $nextLastModified;
696 $nextLastModified = array();
700 if ($nbLastModified > 0) {
701 $mod->prevMod
= $this->lastModified
[0];
702 if (is_null($this->lastModified
[0]->nextMod
)) {
703 foreach ($this->lastModified
as &$lastMod) {
704 $lastMod->nextMod
= $mod;
708 $nextLastModified[] = $mod;
710 $mod->changes
= $result->changes
;
711 $mod->id
= $this->changedID
;
713 $this->textNodes
[$i]->modification
= $mod;
714 $this->changes
= $result->changes
;
715 $this->changedIDUsed
= true;
716 } else if ($this->changedIDUsed
) {
718 $this->changedIDUsed
= false;
723 if (count($nextLastModified) > 0) {
724 $this->lastModified
= $nextLastModified;
728 public function markAsDeleted($start, $end, $before) {
730 if ($end <= $start) {
734 if ($before > 0 && $this->textNodes
[$before - 1]->whiteAfter
) {
735 $this->whiteAfterLastChangedPart
= true;
737 $this->whiteAfterLastChangedPart
= false;
740 $nextLastModified = array();
742 for ($i = $start; $i < $end; ++
$i) {
743 $mod = new Modification(Modification
::REMOVED
);
744 $mod->id
= $this->deletedID
;
745 if (count($this->lastModified
) > 0) {
746 $mod->prevMod
= $this->lastModified
[0];
747 if (is_null($this->lastModified
[0]->nextMod
)) {
748 foreach ($this->lastModified
as &$lastMod) {
749 $lastMod->nextMod
= $mod;
753 $nextLastModified[] = $mod;
755 // oldTextNodes is used here because we're going to move its deleted
756 // elements to this tree!
757 $this->oldTextNodes
[$i]->modification
= $mod;
759 $this->oldTextNodes
[$start]->modification
->firstOfID
= true;
761 $root = $this->oldTextNodes
[$start]->getLastCommonParent($this->oldTextNodes
[$end-1])->parent
;
763 $junk1 = $junk2 = null;
764 $deletedNodes = $root->getMinimalDeletedSet($this->deletedID
, $junk1, $junk2);
766 //wfDebug("Minimal set of deleted nodes of size " . count($deletedNodes));
768 // Set prevLeaf to the leaf after which the old HTML needs to be
771 $prevLeaf = $this->textNodes
[$before - 1];
773 // Set nextLeaf to the leaf before which the old HTML needs to be
775 if ($before < count($this->textNodes
)) {
776 $nextLeaf = $this->textNodes
[$before];
779 while (count($deletedNodes) > 0) {
780 if (isset($prevLeaf)) {
781 $prevResult = $prevLeaf->getLastCommonParent($deletedNodes[0]);
783 $prevResult = new LastCommonParentResult();
784 $prevResult->parent
= $this->bodyNode
;
785 $prevResult->indexInLastCommonParent
= 0;
787 if (isset($nextleaf)) {
788 $nextResult = $nextLeaf->getLastCommonParent($deletedNodes[count($deletedNodes) - 1]);
790 $nextResult = new LastCommonParentResult();
791 $nextResult->parent
= $this->bodyNode
;
792 $nextResult->indexInLastCommonParent
= $this->bodyNode
->getNbChildren();
795 if ($prevResult->lastCommonParentDepth
== $nextResult->lastCommonParentDepth
) {
796 // We need some metric to choose which way to add-...
797 if ($deletedNodes[0]->parent
=== $deletedNodes[count($deletedNodes) - 1]->parent
798 && $prevResult->parent
=== $nextResult->parent
) {
799 // The difference is not in the parent
800 $prevResult->lastCommonParentDepth
= $prevResult->lastCommonParentDepth +
1;
802 // The difference is in the parent, so compare them
803 // now THIS is tricky
804 $distancePrev = $deletedNodes[0]->parent
->getMatchRatio($prevResult->parent
);
805 $distanceNext = $deletedNodes[count($deletedNodes) - 1]->parent
->getMatchRatio($nextResult->parent
);
807 if ($distancePrev <= $distanceNext) {
808 $prevResult->lastCommonParentDepth
= $prevResult->lastCommonParentDepth +
1;
810 $nextResult->lastCommonParentDepth
= $nextResult->lastCommonParentDepth +
1;
816 if ($prevResult->lastCommonParentDepth
> $nextResult->lastCommonParentDepth
) {
817 // Inserting at the front
818 if ($prevResult->splittingNeeded
) {
819 $prevLeaf->parent
->splitUntil($prevResult->parent
, $prevLeaf, true);
821 $prevLeaf = $deletedNodes[0]->copyTree();
822 unset($deletedNodes[0]);
823 $deletedNodes = array_values($deletedNodes);
824 $prevLeaf->setParent($prevResult->parent
);
825 $prevResult->parent
->addChildAbsolute($prevLeaf,$prevResult->indexInLastCommonParent +
1);
826 } else if ($prevResult->lastCommonParentDepth
< $nextResult->lastCommonParentDepth
) {
827 // Inserting at the back
828 if ($nextResult->splittingNeeded
) {
829 $splitOccured = $nextLeaf->parent
->splitUntil($nextResult->parent
, $nextLeaf, false);
831 // The place where to insert is shifted one place to the
833 $nextResult->indexInLastCommonParent
= $nextResult->indexInLastCommonParent +
1;
836 $nextLeaf = $deletedNodes[count(deletedNodes
) - 1]->copyTree();
837 unset($deletedNodes[count(deletedNodes
) - 1]);
838 $deletedNodes = array_values($deletedNodes);
839 $nextLeaf->setParent($nextResult->parent
);
840 $nextResult->parent
->addChildAbsolute($nextLeaf,$nextResult->indexInLastCommonParent
);
842 throw new Exception("Uh?");
845 $this->lastModified
= $nextLastModified;
849 public function expandWhiteSpace() {
850 $this->bodyNode
->expandWhiteSpace();
853 public function lengthNew(){
854 return count($this->textNodes
);
857 public function lengthOld(){
858 return count($this->oldTextNodes
);
866 function __construct($output) {
867 $this->output
= $output;
870 function htmlDiff($from, $to) {
871 wfProfileIn( __METHOD__
);
872 // Create an XML parser
873 $xml_parser = xml_parser_create('');
875 $domfrom = new DomTreeBuilder();
877 // Set the functions to handle opening and closing tags
878 xml_set_element_handler($xml_parser, array($domfrom, "startElement"), array($domfrom, "endElement"));
880 // Set the function to handle blocks of character data
881 xml_set_character_data_handler($xml_parser, array($domfrom, "characters"));
883 //wfDebug('Parsing '.strlen($from)." characters worth of HTML\n");
884 if (!xml_parse($xml_parser, '<?xml version="1.0" encoding="UTF-8"?>'.Sanitizer
::hackDocType().'<body>', false)
885 ||
!xml_parse($xml_parser, $from, false)
886 ||
!xml_parse($xml_parser, '</body>', true)){
887 $error = xml_error_string(xml_get_error_code($xml_parser));
888 $line = xml_get_current_line_number($xml_parser);
889 wfDebug("XML error: $error at line $line\n");
891 xml_parser_free($xml_parser);
894 $xml_parser = xml_parser_create('');
896 $domto = new DomTreeBuilder();
898 // Set the functions to handle opening and closing tags
899 xml_set_element_handler($xml_parser, array($domto, "startElement"), array($domto, "endElement"));
901 // Set the function to handle blocks of character data
902 xml_set_character_data_handler($xml_parser, array($domto, "characters"));
904 //wfDebug('Parsing '.strlen($to)." characters worth of HTML\n");
905 if (!xml_parse($xml_parser, '<?xml version="1.0" encoding="UTF-8"?>'.Sanitizer
::hackDocType().'<body>', false)
906 ||
!xml_parse($xml_parser, $to, false)
907 ||
!xml_parse($xml_parser, '</body>', true)){
908 $error = xml_error_string(xml_get_error_code($xml_parser));
909 $line = xml_get_current_line_number($xml_parser);
910 wfDebug("XML error: $error at line $line\n");
912 xml_parser_free($xml_parser);
915 $diffengine = new WikiDiff3();
916 $differences = $this->preProcess($diffengine->diff_range($domfrom->getDiffLines(), $domto->getDiffLines()));
917 unset($xml_parser, $diffengine);
919 $domdiffer = new TextNodeDiffer($domto, $domfrom);
921 $currentIndexLeft = 0;
922 $currentIndexRight = 0;
923 foreach ($differences as &$d) {
924 if ($d->leftstart
> $currentIndexLeft) {
925 $domdiffer->handlePossibleChangedPart($currentIndexLeft, $d->leftstart
,
926 $currentIndexRight, $d->rightstart
);
928 if ($d->leftlength
> 0) {
929 $domdiffer->markAsDeleted($d->leftstart
, $d->leftend
, $d->rightstart
);
931 $domdiffer->markAsNew($d->rightstart
, $d->rightend
);
933 $currentIndexLeft = $d->leftend
;
934 $currentIndexRight = $d->rightend
;
936 $oldLength = $domdiffer->lengthOld();
937 if ($currentIndexLeft < $oldLength) {
938 $domdiffer->handlePossibleChangedPart($currentIndexLeft, $oldLength, $currentIndexRight, $domdiffer->lengthNew());
940 $domdiffer->expandWhiteSpace();
941 $output = new HTMLOutput('htmldiff', $this->output
);
942 $output->parse($domdiffer->bodyNode
);
943 wfProfileOut( __METHOD__
);
946 private function preProcess(/*array*/ $differences) {
947 $newRanges = array();
949 $nbDifferences = count($differences);
950 for ($i = 0; $i < $nbDifferences; ++
$i) {
951 $leftStart = $differences[$i]->leftstart
;
952 $leftEnd = $differences[$i]->leftend
;
953 $rightStart = $differences[$i]->rightstart
;
954 $rightEnd = $differences[$i]->rightend
;
956 $leftLength = $leftEnd - $leftStart;
957 $rightLength = $rightEnd - $rightStart;
959 while ($i +
1 < $nbDifferences && self
::score($leftLength,
960 $differences[$i +
1]->leftlength
,
962 $differences[$i +
1]->rightlength
)
963 > ($differences[$i +
1]->leftstart
- $leftEnd)) {
964 $leftEnd = $differences[$i +
1]->leftend
;
965 $rightEnd = $differences[$i +
1]->rightend
;
966 $leftLength = $leftEnd - $leftStart;
967 $rightLength = $rightEnd - $rightStart;
970 $newRanges[] = new RangeDifference($leftStart, $leftEnd, $rightStart, $rightEnd);
976 * Heuristic to merge differences for readability.
978 public static function score($ll, $nll, $rl, $nrl) {
979 if (($ll == 0 && $nll == 0)
980 ||
($rl == 0 && $nrl == 0)) {
983 $numbers = array($ll, $nll, $rl, $nrl);
985 foreach ($numbers as &$number) {
986 while ($number > 3) {
994 return $d / (1.5 * count($numbers));
999 class TextOnlyComparator
{
1001 public $leafs = array();
1003 function _construct(TagNode
$tree) {
1004 $this->addRecursive($tree);
1005 $this->leafs
= array_map(array('TextNode','toDiffLine'), $this->leafs
);
1008 private function addRecursive(TagNode
$tree) {
1009 foreach ($tree->children
as &$child) {
1010 if ($child instanceof TagNode
) {
1011 $this->addRecursive($child);
1012 } else if ($child instanceof TextNode
) {
1013 $this->leafs
[] = $node;
1018 public function getMatchRatio(TextOnlyComparator
$other) {
1019 $nbOthers = count($other->leafs
);
1020 $nbThis = count($this->leafs
);
1021 if($nbOthers == 0 ||
$nbThis == 0){
1025 $diffengine = new WikiDiff3(25000, 1.35);
1026 $diffengine->diff($this->leafs
, $other->leafs
);
1028 $lcsLength = $diffengine->getLcsLength();
1030 $distanceThis = $nbThis-$lcsLength;
1032 return (2.0 - $lcsLength/$nbOthers - $lcsLength/$nbThis) / 2.0;
1036 class AncestorComparatorResult
{
1038 public $changed = false;
1040 public $changes = "";
1044 * A comparator used when calculating the difference in ancestry of two Nodes.
1046 class AncestorComparator
{
1049 public $ancestorsText;
1051 function __construct(/*array*/ $ancestors) {
1052 $this->ancestors
= $ancestors;
1053 $this->ancestorsText
= array_map(array('TagNode','toDiffLine'), $ancestors);
1056 public $compareTxt = "";
1058 public function getResult(AncestorComparator
$other) {
1059 $result = new AncestorComparatorResult();
1061 $diffengine = new WikiDiff3(10000, 1.35);
1062 $differences = $diffengine->diff_range($this->ancestorsText
, $other->ancestorsText
);
1064 if (count($differences) == 0){
1067 $changeTxt = new ChangeTextGenerator($this, $other);
1069 $result->changed
= true;
1070 $result->changes
= $changeTxt->getChanged($differences)->toString();
1076 class ChangeTextGenerator
{
1083 function __construct(AncestorComparator
$old, AncestorComparator
$new) {
1086 $this->factory
= new TagToStringFactory();
1089 public function getChanged(/*array*/ $differences) {
1090 $txt = new ChangeText
;
1092 $rootlistopened = false;
1094 if (count($differences) > 1) {
1095 $txt->addHtml('<ul class="changelist">');
1096 $rootlistopened = true;
1099 $nbDifferences = count($differences);
1100 for ($j = 0; $j < $nbDifferences; ++
$j) {
1101 $d = $differences[$j];
1103 $lvl1listopened = false;
1105 if ($rootlistopened) {
1106 $txt->addHtml('<li>');
1109 if ($d->leftlength +
$d->rightlength
> 1) {
1110 $txt->addHtml('<ul class="changelist">');
1111 $lvl1listopened = true;
1114 // left are the old ones
1115 for ($i = $d->leftstart
; $i < $d->leftend
; ++
$i) {
1116 if ($lvl1listopened){
1117 $txt->addHtml('<li>');
1119 // add a bullet for a old tag
1120 $this->addTagOld($txt, $this->old
->ancestors
[$i]);
1122 if ($lvl1listopened){
1123 $txt->addHtml('</li>');
1127 // right are the new ones
1128 for ($i = $d->rightstart
; $i < $d->rightend
; ++
$i) {
1129 if ($lvl1listopened){
1130 $txt->addHtml('<li>');
1133 // add a bullet for a new tag
1134 $this->addTagNew($txt, $this->new->ancestors
[$i]);
1136 if ($lvl1listopened){
1137 $txt->addHtml('</li>');
1142 if ($lvl1listopened) {
1143 $txt->addHtml('</ul>');
1146 if ($rootlistopened) {
1147 $txt->addHtml('</li>');
1151 if ($rootlistopened) {
1152 $txt->addHtml('</ul>');
1159 private function addTagOld(ChangeText
$txt, TagNode
$ancestor) {
1160 $this->factory
->create($ancestor)->getRemovedDescription($txt);
1163 private function addTagNew(ChangeText
$txt, TagNode
$ancestor) {
1164 $this->factory
->create($ancestor)->getAddedDescription($txt);
1172 const newLine
= "<br/>";
1174 public function addText($s) {
1175 $s = $this->clean($s);
1179 public function addHtml($s) {
1183 public function addNewLine() {
1184 $this->addHtml(self
::newLine
);
1187 public function toString() {
1191 private function clean($s) {
1192 return htmlspecialchars($s);
1196 class TagToStringFactory
{
1198 private static $containerTags = array('html', 'body', 'p', 'blockquote',
1199 'h1', 'h2', 'h3', 'h4', 'h5', 'pre', 'div', 'ul', 'ol', 'li',
1200 'table', 'tbody', 'tr', 'td', 'th', 'br', 'hr', 'code', 'dl',
1201 'dt', 'dd', 'input', 'form', 'img', 'span', 'a');
1203 private static $styleTags = array('i', 'b', 'strong', 'em', 'font',
1204 'big', 'del', 'tt', 'sub', 'sup', 'strike');
1210 public function create(TagNode
$node) {
1211 $sem = $this->getChangeSemantic($node->qName
);
1212 if (strcasecmp($node->qName
,'a') == 0) {
1213 return new AnchorToString($node, $sem);
1215 if (strcasecmp($node->qName
,'img') == 0) {
1216 return new NoContentTagToString($node, $sem);
1218 return new TagToString($node, $sem);
1221 protected function getChangeSemantic($qname) {
1222 if (in_array(strtolower($qname),self
::$containerTags)) {
1225 if (in_array(strtolower($qname),self
::$styleTags)) {
1228 return self
::UNKNOWN
;
1238 function __construct(TagNode
$node, $sem) {
1239 $this->node
= $node;
1243 public function getDescription() {
1244 return $this->getString('diff-' . $this->node
->qName
);
1247 public function getRemovedDescription(ChangeText
$txt) {
1248 if ($this->sem
== TagToStringFactory
::MOVED
) {
1249 $txt->addText($this->getMovedOutOf() . ' ' . strtolower($this->getArticle()) . ' ');
1250 $txt->addHtml('<b>');
1251 $txt->addText(strtolower($this->getDescription()));
1252 $txt->addHtml('</b>');
1253 } else if ($this->sem
== TagToStringFactory
::STYLE
) {
1254 $txt->addHtml('<b>');
1255 $txt->addText($this->getDescription());
1256 $txt->addHtml('</b>');
1257 $txt->addText(' ' . strtolower($this->getStyleRemoved()));
1259 $txt->addHtml('<b>');
1260 $txt->addText($this->getDescription());
1261 $txt->addHtml('</b>');
1262 $txt->addText(' ' . strtolower($this->getRemoved()));
1264 $this->addAttributes($txt, $this->node
->attributes
);
1268 public function getAddedDescription(ChangeText
$txt) {
1269 if ($this->sem
== TagToStringFactory
::MOVED
) {
1270 $txt->addText($this->getMovedTo() . ' ' . strtolower($this->getArticle()) . ' ');
1271 $txt->addHtml('<b>');
1272 $txt->addText(strtolower($this->getDescription()));
1273 $txt->addHtml('</b>');
1274 } else if ($this->sem
== TagToStringFactory
::STYLE
) {
1275 $txt->addHtml('<b>');
1276 $txt->addText($this->getDescription());
1277 $txt->addHtml('</b>');
1278 $txt->addText(' ' . strtolower($this->getStyleAdded()));
1280 $txt->addHtml('<b>');
1281 $txt->addText($this->getDescription());
1282 $txt->addHtml('</b>');
1283 $txt->addText(' ' . strtolower($this->getAdded()));
1285 $this->addAttributes($txt, $this->node
->attributes
);
1289 protected function getMovedTo() {
1290 return $this->getString('diff-movedto');
1293 protected function getStyleAdded() {
1294 return $this->getString('diff-styleadded');
1297 protected function getAdded() {
1298 return $this->getString('diff-added');
1301 protected function getMovedOutOf() {
1302 return $this->getString('diff-movedoutof');
1305 protected function getStyleRemoved() {
1306 return $this->getString('diff-styleremoved');
1309 protected function getRemoved() {
1310 return $this->getString('diff-removed');
1313 protected function addAttributes(ChangeText
$txt, array $attributes) {
1314 if (count($attributes) < 1) {
1320 foreach ($attributes as $key => &$attr) {
1324 $txt->addText(' ' . strtolower($this->getWith())
1325 . ' ' . $this->translateArgument($key) . ' '
1329 $txt->addText(', ' . $this->translateArgument($key) . ' '
1333 if (count($attributes) > 1) {
1335 . strtolower($this->getAnd())
1337 . $this->translateArgument($lastKey) . ' '
1338 . $attributes[$lastKey]);
1342 private function getAnd() {
1343 return $this->getString('diff-and');
1346 private function getWith() {
1347 return $this->getString('diff-with');
1350 protected function translateArgument($name) {
1351 if (strcasecmp($name, 'src') == 0) {
1352 return strtolower($this->getSource());
1354 if (strcasecmp($name, 'width') == 0) {
1355 return strtolower($this->getWidth());
1357 if (strcasecmp($name, 'height') == 0) {
1358 return strtolower($this->getHeight());
1363 private function getHeight() {
1364 return $this->getString('diff-height');
1367 private function getWidth() {
1368 return $this->getString('diff-width');
1371 protected function getSource() {
1372 return $this->getString('diff-source');
1375 protected function getArticle() {
1376 return $this->getString('diff-' . $this->node
->qName
. '-article');
1379 // FIXME: Use messages for this
1380 public static $bundle = array(
1381 'diff-movedto' => 'Moved to',
1382 'diff-styleadded' => 'Style added',
1383 'diff-added' => 'Added',
1384 'diff-changedto' => 'Changed to',
1385 'diff-movedoutof' => 'Moved out of',
1386 'diff-styleremoved' => 'Style removed',
1387 'diff-removed' => 'Removed',
1388 'diff-changedfrom' => 'Changed from',
1389 'diff-source' => 'Source',
1390 'diff-withdestination' => 'With destination',
1391 'diff-and' => 'And',
1392 'diff-with' => 'With',
1393 'diff-width' => 'Width',
1394 'diff-height' => 'Height',
1395 'diff-html-article' => 'A',
1396 'diff-html' => 'Html page',
1397 'diff-body-article' => 'A',
1398 'diff-body' => 'Html document',
1399 'diff-p-article' => 'A',
1400 'diff-p' => 'Paragraph',
1401 'diff-blockquote-article' => 'A',
1402 'diff-blockquote' => 'Quote',
1403 'diff-h1-article' => 'A',
1404 'diff-h1' => 'Heading (level 1)',
1405 'diff-h2-article' => 'A',
1406 'diff-h2' => 'Heading (level 2)',
1407 'diff-h3-article' => 'A',
1408 'diff-h3' => 'Heading (level 3)',
1409 'diff-h4-article' => 'A',
1410 'diff-h4' => 'Heading (level 4)',
1411 'diff-h5-article' => 'A',
1412 'diff-h5' => 'Heading (level 5)',
1413 'diff-pre-article' => 'A',
1414 'diff-pre' => 'Preformatted block',
1415 'diff-div-article' => 'A',
1416 'diff-div' => 'Division',
1417 'diff-ul-article' => 'An',
1418 'diff-ul' => 'Unordered list',
1419 'diff-ol-article' => 'An',
1420 'diff-ol' => 'Ordered list',
1421 'diff-li-article' => 'A',
1422 'diff-li' => 'List item',
1423 'diff-table-article' => 'A',
1424 'diff-table' => 'Table',
1425 'diff-tbody-article' => 'A',
1426 'diff-tbody' => "Table's content",
1427 'diff-tr-article' => 'A',
1429 'diff-td-article' => 'A',
1430 'diff-td' => 'Cell',
1431 'diff-th-article' => 'A',
1432 'diff-th' => 'Header',
1433 'diff-br-article' => 'A',
1434 'diff-br' => 'Break',
1435 'diff-hr-article' => 'A',
1436 'diff-hr' => 'Horizontal rule',
1437 'diff-code-article' => 'A',
1438 'diff-code' => 'Computer code block',
1439 'diff-dl-article' => 'A',
1440 'diff-dl' => 'Definition list',
1441 'diff-dt-article' => 'A',
1442 'diff-dt' => 'Definition term',
1443 'diff-dd-article' => 'A',
1444 'diff-dd' => 'Definition',
1445 'diff-input-article' => 'An',
1446 'diff-input' => 'Input',
1447 'diff-form-article' => 'A',
1448 'diff-form' => 'Form',
1449 'diff-img-article' => 'An',
1450 'diff-img' => 'Image',
1451 'diff-span-article' => 'A',
1452 'diff-span' => 'Span',
1453 'diff-a-article' => 'A',
1455 'diff-i' => 'Italics',
1457 'diff-strong' => 'Strong',
1458 'diff-em' => 'Emphasis',
1459 'diff-font' => 'Font',
1460 'diff-big' => 'Big',
1461 'diff-del' => 'Deleted',
1462 'diff-tt' => 'Fixed width',
1463 'diff-sub' => 'Subscript',
1464 'diff-sup' => 'Superscript',
1465 'diff-strike' => 'Strikethrough'
1468 public function getString($key) {
1469 return self
::$bundle[$key];
1473 class NoContentTagToString
extends TagToString
{
1475 function __construct(TagNode
$node, $sem) {
1476 parent
::__construct($node, $sem);
1479 public function getAddedDescription(ChangeText
$txt) {
1480 $txt.addText($this->getChangedTo() . ' ' +
strtolower($this->getArticle()) . ' ');
1481 $txt.addHtml('<b>');
1482 $txt.addText(strtolower($this->getDescription()));
1483 $txt.addHtml('</b>');
1485 $this->addAttributes($txt, $this->node
->attributes
);
1489 private function getChangedTo() {
1490 return $this->getString('diff-changedto');
1493 public function getRemovedDescription(ChangeText
$txt) {
1494 $txt.addText($this->getChangedFrom() . ' ' +
strtolower($this->getArticle()) . ' ');
1495 $txt.addHtml('<b>');
1496 $txt.addText(strtolower($this->getDescription()));
1497 $txt.addHtml('</b>');
1499 $this->addAttributes($txt, $this->node
->attributes
);
1503 private function getChangedFrom() {
1504 return $this->getString('diff-changedfrom');
1508 class AnchorToString
extends TagToString
{
1510 function __construct(TagNode
$node, $sem) {
1511 parent
::__construct($node, $sem);
1514 protected function addAttributes(ChangeText
$txt, array $attributes) {
1515 if (array_key_exists('href', $attributes)) {
1516 $txt->addText(' ' . strtolower($this->getWithDestination()) . ' ' . $attributes['href']);
1517 unset($attributes['href']);
1519 parent
::addAttributes($txt, $attributes);
1522 private function getWithDestination() {
1523 return $this->getString('diff-withdestination');
1529 * Takes a branch root and creates an HTML file for it.
1536 function __construct($prefix, $handler) {
1537 $this->prefix
= $prefix;
1538 $this->handler
= $handler;
1541 public function parse(TagNode
$node) {
1542 $handler = &$this->handler
;
1544 if (strcasecmp($node->qName
, 'img') != 0 && strcasecmp($node->qName
, 'body') != 0) {
1545 $handler->startElement($node->qName
, $node->attributes
);
1548 $newStarted = false;
1549 $remStarted = false;
1550 $changeStarted = false;
1553 foreach ($node->children
as &$child) {
1554 if ($child instanceof TagNode
) {
1556 $handler->endElement('span');
1557 $newStarted = false;
1558 } else if ($changeStarted) {
1559 $handler->endElement('span');
1560 $changeStarted = false;
1561 } else if ($remStarted) {
1562 $handler->endElement('span');
1563 $remStarted = false;
1565 $this->parse($child);
1566 } else if ($child instanceof TextNode
) {
1567 $mod = $child->modification
;
1569 if ($newStarted && ($mod->type
!= Modification
::ADDED ||
$mod->firstOfID
)) {
1570 $handler->endElement('span');
1571 $newStarted = false;
1572 } else if ($changeStarted && ($mod->type
!= Modification
::CHANGED
1573 ||
$mod->changes
!= $changeTXT ||
$mod->firstOfID
)) {
1574 $handler->endElement('span');
1575 $changeStarted = false;
1576 } else if ($remStarted && ($mod->type
!= Modification
::REMOVED ||
$mod ->firstOfID
)) {
1577 $handler->endElement('span');
1578 $remStarted = false;
1581 // no else because a removed part can just be closed and a new
1583 if (!$newStarted && $mod->type
== Modification
::ADDED
) {
1584 $attrs = array('class' => 'diff-html-added');
1585 if ($mod->firstOfID
) {
1586 $attrs['id'] = "added-{$this->prefix}-{$mod->id}";
1588 $this->addAttributes($mod, $attrs);
1589 $attrs['onclick'] = 'return tipA(constructToolTipA(this));';
1590 $handler->startElement('span', $attrs);
1592 } else if (!$changeStarted && $mod->type
== Modification
::CHANGED
) {
1593 $attrs = array('class' => 'diff-html-changed');
1594 if ($mod->firstOfID
) {
1595 $attrs['id'] = "changed-{$this->prefix}-{$mod->id}";
1597 $this->addAttributes($mod, $attrs);
1598 $attrs['onclick'] = 'return tipC(constructToolTipC(this));';
1599 $handler->startElement('span', $attrs);
1602 $handler->startElement('span', array('class' => 'tip'));
1603 $handler->characters($mod->changes
);
1604 $handler->endElement('span');
1606 $changeStarted = true;
1607 $changeTXT = $mod->changes
;
1608 } else if (!$remStarted && $mod->type
== Modification
::REMOVED
) {
1609 $attrs = array('class'=>'diff-html-removed');
1610 if ($mod->firstOfID
) {
1611 $attrs['id'] = "removed-{$this->prefix}-{$mod->id}";
1613 $this->addAttributes($mod, $attrs);
1614 $attrs['onclick'] = 'return tipR(constructToolTipR(this));';
1615 $handler->startElement('span', $attrs);
1619 $chars = $child->text
;
1621 if ($child instanceof ImageNode
) {
1622 $this->writeImage($child);
1624 $handler->characters($chars);
1631 $handler->endElement('span');
1632 $newStarted = false;
1633 } else if ($changeStarted) {
1634 $handler->endElement('span');
1635 $changeStarted = false;
1636 } else if ($remStarted) {
1637 $handler->endElement('span');
1638 $remStarted = false;
1641 if (strcasecmp($node->qName
, 'img') != 0
1642 && strcasecmp($node->qName
, 'body') != 0) {
1643 $handler->endElement($node->qName
);
1647 private function writeImage(ImageNode
$imgNode) {
1648 $attrs = $imgNode->attributes
;
1649 if ($imgNode->modification
->type
== Modification
::REMOVED
) {
1650 $attrs['changeType']='diff-removed-image';
1651 } else if ($imgNode->modification
->type
== Modification
::ADDED
) {
1652 $attrs['changeType'] = 'diff-added-image';
1654 $attrs['onload'] = 'updateOverlays()';
1655 $attrs['onError'] = 'updateOverlays()';
1656 $attrs['onAbort'] = 'updateOverlays()';
1657 $this->handler
->startElement('img', $attrs);
1658 $this->handler
->endElement('img');
1661 private function addAttributes(Modification
$mod, /*array*/ &$attrs) {
1662 if (is_null($mod->prevMod
)) {
1663 $previous = 'first-' . $this->prefix
;
1665 $type = Modification
::typeToString($mod->prevMod
->type
);
1666 $previous = "$type-{$this->prefix}-{$mod->prevMod->id}";
1668 $attrs['previous'] = $previous;
1670 $type = Modification
::typeToString($mod->type
);
1671 $changeId = "$type-{$this->prefix}-{$mod->id}";
1672 $attrs['changeId'] = $changeId;
1674 if (is_null($mod->nextMod
)) {
1675 $next = "last-{$this->prefix}";
1677 $type = Modification
::typeToString($mod->nextMod
->type
);
1678 $next = "$type-{$this->prefix}-{$mod->nextMod->id}";
1680 $attrs['next'] = $next;
1684 class EchoingContentHandler
{
1686 function startElement($qname, /*array*/ $arguments) {
1687 echo Xml
::openElement($qname, $arguments);
1690 function endElement($qname){
1691 echo Xml
::closeElement($qname);
1694 function characters($chars){
1700 class DelegatingContentHandler
{
1704 function __construct($delegate) {
1705 $this->delegate
= $delegate;
1708 function startElement($qname, /*array*/ $arguments) {
1709 $this->delegate
->addHtml(Xml
::openElement($qname, $arguments));
1712 function endElement($qname){
1713 $this->delegate
->addHtml(Xml
::closeElement($qname));
1716 function characters($chars){
1717 $this->delegate
->addHtml($chars);