From 3712ab270db601ad79f3b138ab43fd70fb6411e4 Mon Sep 17 00:00:00 2001 From: Guy Van den Broeck Date: Mon, 18 Aug 2008 00:16:32 +0000 Subject: [PATCH] Performance improvements to diff algorithms --- includes/AutoLoader.php | 3 +- includes/Diff.php | 179 ++++--- includes/DifferenceEngine.php | 73 +-- includes/HTMLDiff.php | 868 ++++++++++++---------------------- 4 files changed, 443 insertions(+), 680 deletions(-) diff --git a/includes/AutoLoader.php b/includes/AutoLoader.php index ab42a19b74..c8be601472 100644 --- a/includes/AutoLoader.php +++ b/includes/AutoLoader.php @@ -154,7 +154,7 @@ $wgAutoloadLocalClasses = array( 'ProtectionForm' => 'includes/ProtectionForm.php', 'QueryPage' => 'includes/QueryPage.php', 'QuickTemplate' => 'includes/SkinTemplate.php', - 'RangeDifference' => 'includes/DifferenceEngine.php', + 'RangeDifference' => 'includes/Diff.php', 'RawPage' => 'includes/RawPage.php', 'RCCacheEntry' => 'includes/ChangesList.php', 'RecentChange' => 'includes/RecentChange.php', @@ -212,6 +212,7 @@ $wgAutoloadLocalClasses = array( 'WatchlistEditor' => 'includes/WatchlistEditor.php', 'WebRequest' => 'includes/WebRequest.php', 'WebResponse' => 'includes/WebResponse.php', + 'WikiDiff3' => 'includes/Diff.php', 'WikiError' => 'includes/WikiError.php', 'WikiErrorMsg' => 'includes/WikiError.php', 'WikiExporter' => 'includes/Export.php', diff --git a/includes/Diff.php b/includes/Diff.php index df8293b5a0..e6b430e24d 100644 --- a/includes/Diff.php +++ b/includes/Diff.php @@ -43,32 +43,35 @@ class WikiDiff3{ //State variables private $maxDifferences; - + private $lcsLengthCorrectedForHeuristic = false; + //Output variables public $length; - public $added; public $removed; + public $added; public $heuristicUsed; - + function __construct($tooLong = 2000000, $powLimit = 1.45){ $this->tooLong = $tooLong; $this->powLimit = $powLimit; } public function diff(/*array*/ $from, /*array*/ $to){ + //remember initial lengths $m = sizeof($from); $n = sizeof($to); - + $this->heuristicUsed = FALSE; - $result_from = $m>0?array_fill(0,$m,true):array(); - $result_to = $n>0?array_fill(0,$n,true):array(); + //output + $removed = $m>0?array_fill(0,$m,true):array(); + $added = $n>0?array_fill(0,$n,true):array(); //reduce the complexity for the next step (intentionally done twice) //remove common tokens at the start $i=0; while($i<$m && $i<$n && $from[$i]===$to[$i]){ - $result_from[$i] = $result_to[$i] = false; + $removed[$i] = $added[$i] = false; unset($from[$i],$to[$i]); ++$i; } @@ -76,12 +79,12 @@ class WikiDiff3{ //remove common tokens at the end $j=1; while($i+$j<=$m && $i+$j<=$n && $from[$m-$j]===$to[$n-$j]){ - $result_from[$m-$j] = $result_to[$n-$j] = false; + $removed[$m-$j] = $added[$n-$j] = false; unset($from[$m-$j],$to[$n-$j]); ++$j; } - $newFrom = $newFromIndex = $newTo = $newToIndex = array(); + $this->from = $newFromIndex = $this->to = $newToIndex = array(); //remove tokens not in both sequences $shared = array_fill_keys($from,false); @@ -101,71 +104,100 @@ class WikiDiff3{ } } - unset($shared); + unset($shared, $from, $to); $this->m = sizeof($this->from); $this->n = sizeof($this->to); + $this->removed = $this->m>0?array_fill(0,$this->m,true):array(); $this->added = $this->n>0?array_fill(0,$this->n,true):array(); - $this->longestCommonSubsequence(); + if ($this->m == 0 || $this->n == 0) { + $this->length = 0; + }else{ + $this->maxDifferences = ceil(($this->m + $this->n) / 2.0); + if ($this->m * $this->n > $this->tooLong) { + // limit complexity to D^POW_LIMIT for long sequences + $this->maxDifferences = floor(pow($this->maxDifferences, $this->powLimit - 1.0)); + wfDebug("Limiting max number of differences to $this->maxDifferences\n"); + } + + /* + * The common prefixes and suffixes are always part of some LCS, include + * them now to reduce our search space + */ + $max = min($this->m, $this->n); + for ($forwardBound = 0; $forwardBound < $max + && $this->from[$forwardBound]===$this->to[$forwardBound]; ++$forwardBound) { + $this->removed[$forwardBound] = $this->added[$forwardBound] = false; + } + + $backBoundL1 = $this->m - 1; + $backBoundL2 = $this->n - 1; + + while ($backBoundL1 >= $forwardBound && $backBoundL2 >= $forwardBound + && $this->from[$backBoundL1] === $this->to[$backBoundL2]) { + $this->removed[$backBoundL1--] = $this->added[$backBoundL2--] = false; + } + + $temp = array_fill(0,$this->m + $this->n + 1, 0); + $V = array($temp,$temp); + $snake = array(0,0,0); + + $this->length = $forwardBound + $this->m - $backBoundL1 - 1 + + $this->lcs_rec($forwardBound, $backBoundL1, $forwardBound, $backBoundL2, + $V, $snake); + } $this->m = $m; $this->n = $n; + $this->length += $i+$j-1; - foreach($this->removed as $key => $removed){ - if(!$removed){ - $result_from[$newFromIndex[$key]] = false; + foreach($this->removed as $key => $removed_elem){ + if(!$removed_elem){ + $removed[$newFromIndex[$key]] = false; } } - foreach($this->added as $key => $added){ - if(!$added){ - $result_to[$newToIndex[$key]] = false; + foreach($this->added as $key => $added_elem){ + if(!$added_elem){ + $added[$newToIndex[$key]] = false; } } - unset($this->added, $this->removed); - return array($result_from, $result_to); + $this->removed = $removed; + $this->added = $added; } - public function longestCommonSubsequence() { - if ($this->m == 0 || $this->n == 0) { - $this->length = 0; - return; - } - - $this->maxDifferences = ceil(($this->m + $this->n) / 2.0); - if ($this->m * $this->n > $this->tooLong) { - // limit complexity to D^POW_LIMIT for long sequences - $this->maxDifferences = floor(pow($this->maxDifferences, $this->powLimit - 1.0)); - wfDebug("Limiting max number of differences to $this->maxDifferences"); - } - - /* - * The common prefixes and suffixes are always part of some LCS, include - * them now to reduce our search space - */ - $max = min($this->m, $this->n); - for ($forwardBound = 0; $forwardBound < $max - && $this->from[$forwardBound]===$this->to[$forwardBound]; ++$forwardBound) { - $this->removed[$forwardBound] = $this->added[$forwardBound] = false; - } + function diff_range ($from_lines, $to_lines){ + // Diff and store locally + $this->diff($from_lines, $to_lines); + unset($from_lines, $to_lines); + + $ranges = array(); + $xi = $yi = 0; + while ($xi < $this->m || $yi < $this->n) { + // Matching "snake". + while ( $xi < $this->m && $yi < $this->n + && !$this->removed[$xi] && !$this->added[$yi]) { + ++$xi; + ++$yi; + } + // Find deletes & adds. + $xstart = $xi; + while ($xi < $this->m && $this->removed[$xi]){ + ++$xi; + } - $backBoundL1 = $this->m - 1; - $backBoundL2 = $this->n - 1; + $ystart = $yi; + while ($yi < $this->n && $this->added[$yi]){ + ++$yi; + } - while ($backBoundL1 >= $forwardBound && $backBoundL2 >= $forwardBound - && $this->from[$backBoundL1] === $this->to[$backBoundL2]) { - $this->removed[$backBoundL1--] = $this->added[$backBoundL2--] = false; + if ($xi>$xstart || $yi>$ystart){ + $ranges[] = new RangeDifference($xstart,$xi,$ystart,$yi); + } } - - $temp = array_fill(0,$this->m + $this->n + 1, 0); - $V = array($temp,$temp); - $snake = array(0,0,0); - - $this->length = $forwardBound + $this->m - $backBoundL1 - 1 - + $this->lcs_rec($forwardBound, $backBoundL1, $forwardBound, $backBoundL2, - $V, $snake); + return $ranges; } private function lcs_rec($bottoml1, $topl1, $bottoml2, $topl2, &$V, &$snake) { @@ -267,13 +299,13 @@ class WikiDiff3{ $absx = $snake0 = $x + $bottoml1; $absy = $snake1 = $x - $k + $bottoml2; - + while ($absx < $maxabsx && $absy < $maxabsy && $from[$absx] === $to[$absy]) { ++$absx; ++$absy; } $x = $absx-$bottoml1; - + $snake2 = $absx -$snake0; $V0[$limit + $k] = $x; if ($k >= $delta - $d + 1 && $k <= $delta + $d - 1 @@ -340,7 +372,7 @@ class WikiDiff3{ $absx = $snake0 = $x + $bottoml1; $absy = $snake1 = $x - $k + $bottoml2; - + while ($absx < $maxabsx && $absy < $maxabsy && $from[$absx] === $to[$absy]) { ++$absx; ++$absy; @@ -410,7 +442,7 @@ class WikiDiff3{ $snake0 = $bottoml1 + $most_progress[0]; $snake1 = $bottoml2 + $most_progress[1]; $snake2 = 0; - wfDebug('Computing the LCS is too expensive. Using a heuristic.'); + wfDebug('Computing the LCS is too expensive. Using a heuristic.\n'); $this->heuristicUsed = true; return 5; /* * HACK: since we didn't really finish the LCS computation @@ -501,5 +533,36 @@ class WikiDiff3{ return $max_progress[floor($num_progress / 2)]; } + public function getLcsLength(){ + if($this->heuristicUsed && !$this->lcsLengthCorrectedForHeuristic){ + $this->lcsLengthCorrectedForHeuristic = true; + $this->length = $this->m-array_sum($this->added); + } + return $this->length; + } + } +/** + * Alternative representation of a set of changes, by the index + * ranges that are changed. + */ +class RangeDifference { + + public $leftstart; + public $leftend; + public $leftlength; + + public $rightstart; + public $rightend; + public $rightlength; + + function __construct($leftstart, $leftend, $rightstart, $rightend){ + $this->leftstart = $leftstart; + $this->leftend = $leftend; + $this->leftlength = $leftend-$leftstart; + $this->rightstart = $rightstart; + $this->rightend = $rightend; + $this->rightlength = $rightend-$rightstart; + } +} diff --git a/includes/DifferenceEngine.php b/includes/DifferenceEngine.php index a49fdba031..d3bf205be6 100644 --- a/includes/DifferenceEngine.php +++ b/includes/DifferenceEngine.php @@ -376,6 +376,8 @@ CONTROL; $newHtml = $parserOutput->getText(); wfRunHooks( 'OutputPageBeforeHTML',array( &$wgOut, &$newHtml ) ); + unset($parserOutput,$popts); + $differ = new HTMLDiffer(new DelegatingContentHandler($wgOut)); $differ->htmlDiff($oldHtml, $newHtml); @@ -955,30 +957,6 @@ class _DiffOp_Change extends _DiffOp { } } -/** - * Alternative representation of a set of changes, by the index - * ranges that are changed. - */ -class RangeDifference { - - public $leftstart; - public $leftend; - public $leftlength; - - public $rightstart; - public $rightend; - public $rightlength; - - function __construct($leftstart, $leftend, $rightstart, $rightend){ - $this->leftstart = $leftstart; - $this->leftend = $leftend; - $this->leftlength = $leftend-$leftstart; - $this->rightstart = $rightstart; - $this->rightend = $rightend; - $this->rightlength = $rightend-$rightstart; - } -} - /** * Class used internally by Diff to actually compute the diffs. * @@ -1057,54 +1035,17 @@ class _DiffEngine { return $edits; } - function diff_range ($from_lines, $to_lines){ - wfProfileIn( __METHOD__ ); - - // Diff and store locally - $this->diff_local($from_lines, $to_lines); - - // Compute the ranges operations. - $n_from = sizeof($from_lines); - $n_to = sizeof($to_lines); - - $ranges = array(); - $xi = $yi = 0; - while ($xi < $n_from || $yi < $n_to) { - // Matching "snake". - while ( $xi < $n_from && $yi < $n_to - && !$this->xchanged[$xi] && !$this->ychanged[$yi]) { - ++$xi; - ++$yi; - } - // Find deletes & adds. - $xstart = $xi; - while ($xi < $n_from && $this->xchanged[$xi]){ - ++$xi; - } - - $ystart = $yi; - while ($yi < $n_to && $this->ychanged[$yi]){ - ++$yi; - } - - if ($xi>$xstart || $yi>$ystart){ - $ranges[] = new RangeDifference($xstart,$xi,$ystart,$yi); - } - } - wfProfileOut( __METHOD__ ); - return $ranges; - } - function diff_local ($from_lines, $to_lines) { global $wgExternalDiffEngine; wfProfileIn( __METHOD__); if($wgExternalDiffEngine == 'wikidiff3'){ // wikidiff3 - global $IP; - require_once( "$IP/includes/Diff.php" ); $wikidiff3 = new WikiDiff3(); - list($this->xchanged, $this->ychanged) = $wikidiff3->diff($from_lines, $to_lines); + $wikidiff3->diff($from_lines, $to_lines); + $this->xchanged = $wikidiff3->removed; + $this->ychanged = $wikidiff3->added; + unset($wikidiff3); }else{ // old diff $n_from = sizeof($from_lines); @@ -2145,4 +2086,4 @@ class TableDiffFormatter extends DiffFormatter { } wfProfileOut( __METHOD__ ); } -} +} \ No newline at end of file diff --git a/includes/HTMLDiff.php b/includes/HTMLDiff.php index 7e1cc16547..ac6d5ae7ee 100644 --- a/includes/HTMLDiff.php +++ b/includes/HTMLDiff.php @@ -17,46 +17,42 @@ * or see http://www.gnu.org/ */ +/** + * The HTML differ depends on WikiDiff3 + */ +global $IP; +require_once( "$IP/includes/Diff.php" ); /** * Any element in the DOM tree of an HTML document. */ -abstract class Node{ +class Node{ + + public $parent; - protected $parent; + protected $parentTree; + public $whiteBefore = false; + + public $whiteAfter = false; function __construct($parent){ $this->parent = $parent; - if(!is_null($parent)){ - $parent->addChild($this); - } - } - - public function getParent(){ - return $this->parent; } public function getParentTree(){ - if(!is_null($this->parent)){ - $parentTree = $this->parent->getParentTree(); - $parentTree[] = $this->parent; - return $parentTree; - }else{ - return array(); + if(!isset($this->parentTree)){ + if(!is_null($this->parent)){ + $this->parentTree = $this->parent->getParentTree(); + $this->parentTree[] = $this->parent; + }else{ + $this->parentTree = array(); + } } - } - - public abstract function getMinimalDeletedSet($id); - - public function detectIgnorableWhiteSpace(){ - // no op + return $this->parentTree; } public function getLastCommonParent(Node $other){ - if(is_null($other)) - throw new Exception('The given node is NULL'); - $result = new LastCommonParentResult(); $myParents = $this->getParentTree(); @@ -64,8 +60,10 @@ abstract class Node{ $i = 1; $isSame = true; - while ($isSame && $i < sizeof($myParents) && $i < sizeof($otherParents)) { - if (!$myParents[$i]->isSameTag($otherParents[$i])) { + $nbMyParents = sizeof($myParents); + $nbOtherParents = sizeof($otherParents); + while ($isSame && $i < $nbMyParents && $i < $nbOtherParents) { + if (!$myParents[$i]->openingTag === $otherParents[$i]->openingTag) { $isSame = false; } else { // After the while, the index i-1 must be the last common parent @@ -73,32 +71,31 @@ abstract class Node{ } } - $result->setLastCommonParentDepth($i - 1); - $result->setLastCommonParent($myParents[$i - 1]); + $result->lastCommonParentDepth = $i - 1; + $result->parent = $myParents[$i - 1]; if (!$isSame) { - $result->setIndexInLastCommonParent($myParents[$i - 1]->getIndexOf($myParents[$i])); - $result->setSplittingNeeded(); - } else if (sizeof($myParents) < sizeof($otherParents)) { - $result->setIndexInLastCommonParent($myParents[$i - 1]->getIndexOf($this)); - } else if (sizeof($myParents) > sizeof($otherParents)) { + $result->indexInLastCommonParent = $myParents[$i - 1]->getIndexOf($myParents[$i]); + $result->splittingNeeded = true; + } else if ($nbMyParents < $nbOtherParents) { + $result->indexInLastCommonParent = $myParents[$i - 1]->getIndexOf($this); + } else if ($nbMyParents > $nbOtherParents) { // All tags matched but there are tags left in this tree - $result->setIndexInLastCommonParent($myParents[$i - 1]->getIndexOf($myParents[$i])); - $result->setSplittingNeeded(); + $result->indexInLastCommonParent = $myParents[$i - 1]->getIndexOf($myParents[$i]); + $result->splittingNeeded = true; } else { // All tags matched untill the very last one in both trees // or there were no tags besides the BODY - $result->setIndexInLastCommonParent($myParents[$i - 1]->getIndexOf($this)); + $result->indexInLastCommonParent = $myParents[$i - 1]->getIndexOf($this); } return $result; } public function setParent($parent) { $this->parent = $parent; + unset($this->parentTree); } - public abstract function copyTree(); - public function inPre() { $tree = $this->getParentTree(); foreach ($tree as $ancestor) { @@ -108,30 +105,6 @@ abstract class Node{ } return false; } - - private $whiteBefore = false; - - private $whiteAfter = false; - - public function isWhiteBefore() { - return $this->whiteBefore; - } - - public function setWhiteBefore($whiteBefore) { - $this->whiteBefore = $whiteBefore; - } - - public function isWhiteAfter() { - return $this->whiteAfter; - } - - public function setWhiteAfter($whiteAfter) { - $this->whiteAfter = $whiteAfter; - } - - public abstract function getLeftMostChild(); - - public abstract function getRightMostChild(); } /** @@ -141,9 +114,11 @@ class TagNode extends Node{ public $children = array(); - protected $qName; + public $qName; + + public $attributes = array(); - protected $attributes = array(); + public $openingTag; function __construct($parent, $qName, /*array*/ $attributes) { parent::__construct($parent); @@ -151,17 +126,15 @@ class TagNode extends Node{ foreach($attributes as $key => $value){ $this->attributes[strtolower($key)] = $value; } + $this->openingTag = '<'.$this->qName; + foreach ($this->attributes as $attribute => $value) { + $this->openingTag .= ' ' . $attribute . '="' . $value . '"'; + } + return $this->openingTag .= '>'; } - public function addChild(Node $node, $index=-1) { - if ($node->getParent() !== $this) - throw new Exception( - 'The new child must have this node as a parent.'); - if($index == -1){ - $this->children[] = $node; - }else{ - array_splice($this->children,$index,0,array($node)); - } + public function addChildAbsolute(Node $node, $index) { + array_splice($this->children,$index,0,array($node)); } public function getIndexOf(Node $child) { @@ -174,105 +147,75 @@ class TagNode extends Node{ return NULL; } - public function getChild($i) { - return $this->children[$i]; - } - public function getNbChildren() { return count($this->children); } - public function getQName() { - return $this->qName; - } - - public function getAttributes() { - return $this->attributes; - } - - public function isSameTag(TagNode $other) { - if (is_null($other)) - return false; - return $this->getOpeningTag() === $other->getOpeningTag(); - } - - public function getOpeningTag() { - $s = '<'.$this->getQName(); - foreach ($this->attributes as $attribute => $value) { - $s .= ' ' . $attribute . '="' . $value . '"'; - } - return $s .= '>'; - } - - public function getEndTag() { - return 'getQName() + '>"'; - } - - public function getMinimalDeletedSet($id) { + public function getMinimalDeletedSet($id, &$allDeleted, &$somethingDeleted) { $nodes = array(); + if (empty($this->children)){ + $allDeleted = false; + $somethingDeleted = false; + return $nodes; + } - if ($this->getNbChildren() == 0) - return $nodes; - + $allDeleted = false; + $somethingDeleted = false; $hasNotDeletedDescendant = false; foreach ($this->children as $child) { - $childrenChildren = $child->getMinimalDeletedSet($id); - $nodes = array_merge($nodes, $childrenChildren); - if (!$hasNotDeletedDescendant - && !(count($childrenChildren) == 1 && $childrenChildren[0]===$child)) { - // This child is not entirely deleted - $hasNotDeletedDescendant = true; + $childrenChildren = $child->getMinimalDeletedSet($id, $allDeleted_local, $somethingDeleted_local); + if($somethingDeleted_local){ + $nodes = array_merge($nodes, $childrenChildren); + $somethingDeleted = true; } + $hasNotDeletedDescendant |= !$allDeleted_local; } if (!$hasNotDeletedDescendant) { $nodes = array($this); + $allDeleted = true; } return $nodes; } - public function toString() { - return $this->getOpeningTag(); - } - public function splitUntill(TagNode $parent, Node $split, $includeLeft) { $splitOccured = false; if ($parent !== $this) { - $part1 = new TagNode(NULL, $this->getQName(), $this->getAttributes()); - $part2 = new TagNode(NULL, $this->getQName(), $this->getAttributes()); - $part1->setParent($this->getParent()); - $part2->setParent($this->getParent()); + $part1 = new TagNode(NULL, $this->qName, $this->attributes); + $part2 = new TagNode(NULL, $this->qName, $this->attributes); + $part1->setParent($this->parent); + $part2->setParent($this->parent); $i = 0; $nbChildren = $this->getNbChildren(); while ($i < $nbChildren && $this->children[$i] !== $split) { $this->children[$i]->setParent($part1); - $part1->addChild($this->children[$i]); + $part1->children[] = $this->children[$i]; ++$i; } if ($i < $nbChildren) { if ($includeLeft) { $this->children[$i]->setParent($part1); - $part1->addChild($this->children[$i]); + $part1->children[] = $this->children[$i]; } else { $this->children[$i]->setParent($part2); - $part2->addChild($this->children[$i]); + $part2->children[] = $this->children[$i]; } ++$i; } while ($i < $nbChildren) { $this->children[$i]->setParent($part2); - $part2->addChild($this->children[$i]); + $part2->children[] = $this->children[$i]; ++$i; } $myindexinparent = $this->parent->getIndexOf($this); - if ($part1->getNbChildren() > 0) - $this->parent->addChild($part1,$myindexinparent); + if (!empty($part1->children)) + $this->parent->addChildAbsolute($part1,$myindexinparent); - if ($part2->getNbChildren() > 0) - $this->parent->addChild($part2,$myindexinparent); + if (!empty($part2->children)) + $this->parent->addChildAbsolute($part2,$myindexinparent); - if ($part1->getNbChildren() > 0 && $part2->getNbChildren() > 0) { + if (!empty($part1->children) && !empty($part2->children)) { $splitOccured = true; } @@ -296,40 +239,14 @@ class TagNode extends Node{ 'h1'=>TRUE,'h2'=>TRUE,'h3'=>TRUE,'h4'=>TRUE,'h5'=>TRUE,'pre'=>TRUE,'div'=>TRUE,'ul'=>TRUE,'ol'=>TRUE,'li'=>TRUE, 'table'=>TRUE,'tbody'=>TRUE,'tr'=>TRUE,'td'=>TRUE,'th'=>TRUE,'br'=>TRUE); - public static function isBlockLevelName($qName) { - return array_key_exists(strtolower($qName),self::$blocks); - } - - public static function isBlockLevelNode(Node $node) { - if(! $node instanceof TagNode) - return false; - return self::isBlockLevelName($node->getQName()); - } - - public function isBlockLevel() { - return  self::isBlockLevelNode($this); - } - - public static function isInlineName($qName) { - return !self::isBlockLevelName($qName); - } - - public static function isInlineNode(Node $node) { - return !self::isBlockLevelNode($node); - } - - public function isInline() { - return self::isInlineNode($this); - } - public function copyTree() { - $newThis = new TagNode(NULL, $this->getQName(), $this->getAttributes()); - $newThis->setWhiteBefore($this->isWhiteBefore()); - $newThis->setWhiteAfter($this->isWhiteAfter()); + $newThis = new TagNode(NULL, $this->qName, $this->attributes); + $newThis->whiteBefore = $this->whiteBefore; + $newThis->whiteAfter = $this->whiteAfter; foreach($this->children as $child) { $newChild = $child->copyTree(); $newChild->setParent($newThis); - $newThis->addChild($newChild); + $newThis->children[] = $newChild; } return $newThis; } @@ -345,22 +262,22 @@ class TagNode extends Node{ $nbOriginalChildren = $this->getNbChildren(); for ($i = 0; $i < $nbOriginalChildren; ++$i) { - $child = $this->getChild($i + $shift); + $child = $this->children[$i + $shift]; if($child instanceof TagNode){ if (!$child->isPre()) { $child->expandWhiteSpace(); } } - if (!$spaceAdded && $child->isWhiteBefore()) { + if (!$spaceAdded && $child->whiteBefore) { $ws = new WhiteSpaceNode(NULL, ' ', $child->getLeftMostChild()); $ws->setParent($this); - $this->addChild($ws,$i + ($shift++)); + $this->addChildAbsolute($ws,$i + ($shift++)); } - if ($child->isWhiteAfter()) { + if ($child->whiteAfter) { $ws = new WhiteSpaceNode(NULL, ' ', $child->getRightMostChild()); $ws->setParent($this); - $this->addChild($ws,$i + 1 + ($shift++)); + $this->addChildAbsolute($ws,$i + 1 + ($shift++)); $spaceAdded = true; } else { $spaceAdded = false; @@ -370,24 +287,24 @@ class TagNode extends Node{ } public function getLeftMostChild() { - if ($this->getNbChildren() < 1) + if (empty($this->children)) return $this; - return $this->getChild(0)->getLeftMostChild(); + return $this->children[0]->getLeftMostChild(); } public function getRightMostChild() { - if ($this->getNbChildren() < 1) + if (empty($this->children)) return $this; - return $this->getChild($this->getNbChildren() - 1)->getRightMostChild(); + return $this->children[$this->getNbChildren() - 1]->getRightMostChild(); } public function isPre() { - return 0 == strcasecmp($this->getQName(),'pre'); + return 0 == strcasecmp($this->qName,'pre'); } public static function toDiffLine(TagNode $node){ - return $node->getOpeningTag(); + return $node->openingTag; } } @@ -396,14 +313,14 @@ class TagNode extends Node{ */ class TextNode extends Node{ - private $s; + public $text; - private $modification; + public $modification; - function __construct($parent, $s) { + function __construct($parent, $text) { parent::__construct($parent); $this->modification = new Modification(Modification::NONE); - $this->s = $s; + $this->text = $text; } public function copyTree() { @@ -420,40 +337,25 @@ class TextNode extends Node{ return $this; } - public function getMinimalDeletedSet($id) { - if ($this->getModification()->getType() == Modification::REMOVED - && $this->getModification()->getID() == $id){ + public function getMinimalDeletedSet($id, &$allDeleted, &$somethingDeleted) { + if ($this->modification->type == Modification::REMOVED + && $this->modification->id == $id){ + $somethingDeleted = true; + $allDeleted = true; return array($this); } return array(); } - public function getModification() { - return $this->modification; - } - - public function getText() { - return $this->s; - } - public function isSameText($other) { if (is_null($other) || ! $other instanceof TextNode){ return false; } - return str_replace('\n', ' ',$this->getText()) === str_replace('\n', ' ',$other->getText()); - } - - public function setModification(Modification $m) { - //wfDebug("Registered modification for node '$this->s' as ".Modification::typeToString($m->getType())); - $this->modification = $m; - } - - public function toString() { - return $this->getText(); + return str_replace('\n', ' ',$this->text) === str_replace('\n', ' ',$other->text); } public static function toDiffLine(TextNode $node){ - return str_replace('\n', ' ',$node->getText()); + return str_replace('\n', ' ',$node->text); } } @@ -462,21 +364,9 @@ class WhiteSpaceNode extends TextNode { function __construct($parent, $s, Node $like = NULL) { parent::__construct($parent, $s); if(!is_null($like) && $like instanceof TextNode){ - $newModification = clone $like->getModification(); - $newModification->setFirstOfID(false); - $this->setModification($newModification); - } - } - - public static function isWhiteSpace($c) { - switch ($c) { - case ' ': - case '\t': - case '\r': - case '\n': - return true; - default: - return false; + $newModification = clone $like->modification; + $newModification->firstOfID = false; + $this->modification = $newModification; } } } @@ -495,15 +385,15 @@ class BodyNode extends TagNode { foreach ($this->children as $child) { $newChild = $child->copyTree(); $newChild->setParent($newThis); - $newThis->addChild($newChild); + $newThis->children[] = $newChild; } return $newThis; } - public function getMinimalDeletedSet($id) { + public function getMinimalDeletedSet($id, &$allDeleted, &$somethingDeleted) { $nodes = array(); foreach ($this->children as $child) { - $childrenChildren = $child->getMinimalDeletedSet($id); + $childrenChildren = $child->getMinimalDeletedSet($id,$allDeleted,$somethingDeleted); $nodes = array_merge($nodes, $childrenChildren); } return $nodes; @@ -535,11 +425,15 @@ class ImageNode extends TextNode { public function isSameText($other) { if (is_null($other) || ! $other instanceof ImageNode) return false; - return $this->getText() === $other->getText(); + return $this->text === $other->text; } - public function getAttributes() { - return $this->attributes; +} + +class DummyNode extends Node{ + + function __construct(){ + // no op } } @@ -551,48 +445,16 @@ class ImageNode extends TextNode { class LastCommonParentResult { // Parent - private $parent; - - public function getLastCommonParent() { - return $this->parent; - } - - public function setLastCommonParent(TagNode $parent) { - $this->parent = $parent; - } + public $parent; // Splitting - private $splittingNeeded = false; - - public function isSplittingNeeded() { - return $this->splittingNeeded; - } - - public function setSplittingNeeded() { - $this->splittingNeeded = true; - } + public $splittingNeeded = false; // Depth - private $lastCommonParentDepth = -1; - - public function getLastCommonParentDepth() { - return $this->lastCommonParentDepth; - } - - public function setLastCommonParentDepth($depth) { - $this->lastCommonParentDepth = $depth; - } + public $lastCommonParentDepth = -1; // Index - private $indexInLastCommonParent = -1; - - public function getIndexInLastCommonParent() { - return $this->indexInLastCommonParent; - } - - public function setIndexInLastCommonParent($index) { - $this->indexInLastCommonParent = $index; - } + public $indexInLastCommonParent = -1; } class Modification{ @@ -602,76 +464,22 @@ class Modification{ const ADDED = 4; const CHANGED = 8; - private $type; + public $type; + + public $id = -1; - private $id = -1; + public $prevMod; - private $prevMod; + public $nextMod; - private $nextMod; + public $firstOfID = false; - private $firstOfID = false; + public $changes; function __construct($type) { $this->type = $type; } - public function copy() { - $newM = new Modification($this->getType()); - $newM->setID($this->getID()); - $newM->setChanges($this->getChanges()); - $newM->setFirstOfID($this->isFirstOfID()); - $newM->setNext($this->getNext()); - $newM->setPrevious($this->getPrevious()); - return $newM; - } - - public function getType() { - return $this->type; - } - - public function setID($id) { - $this->id = $id; - } - - public function getID() { - return $this->id; - } - - public function setPrevious($m) { - $this->prevMod = $m; - } - - public function getPrevious() { - return $this->prevMod; - } - - public function setNext($m) { - $this->nextMod = $m; - } - - public function getNext() { - return $this->nextMod; - } - - private $changes; - - public function setChanges($changes) { - $this->changes = $changes; - } - - public function getChanges() { - return $this->changes; - } - - public function isFirstOfID() { - return $this->firstOfID; - } - - public function setFirstOfID($firstOfID) { - $this->firstOfID = $firstOfID; - } - public static function typeToString($type){ switch($type){ case self::NONE: return 'none'; @@ -684,9 +492,9 @@ class Modification{ class DomTreeBuilder { - private $textNodes = array(); + public $textNodes = array(); - private $bodyNode; + public $bodyNode; private $currentParent; @@ -700,16 +508,11 @@ class DomTreeBuilder { private $lastSibling; + private $notInPre = true; + function __construct(){ $this->bodyNode = $this->currentParent = new BodyNode(); - } - - public function getBodyNode() { - return $this->bodyNode; - } - - public function getTextNodes() { - return $this->textNodes; + $this->lastSibling = new DummyNode(); } /** @@ -725,13 +528,17 @@ class DomTreeBuilder { //wfDebug("Starting $name node."); $this->endWord(); - $newTagNode = new TagNode($this->currentParent, $name, $attributes); - $this->currentParent = $newTagNode; - $this->lastSibling = NULL; - if ($this->whiteSpaceBeforeThis && $newTagNode->isInline()) { - $newTagNode->setWhiteBefore(true); + $newNode = new TagNode($this->currentParent, $name, $attributes); + $this->currentParent->children[] = $newNode; + $this->currentParent = $newNode; + $this->lastSibling = new DummyNode(); + if ($this->whiteSpaceBeforeThis && !array_key_exists(strtolower($this->currentParent->qName),TagNode::$blocks)) { + $this->currentParent->whiteBefore = true; } $this->whiteSpaceBeforeThis = false; + if(strcasecmp($name, 'pre')==0){ + $this->notInPre = false; + } } } @@ -740,54 +547,59 @@ class DomTreeBuilder { //wfDebug("Ending $name node."); if (0 == strcasecmp($name,'img')) { // Insert a dummy leaf for the image - $img = new ImageNode($this->currentParent, $this->currentParent->getAttributes()); - $img->setWhiteBefore($this->whiteSpaceBeforeThis); + $img = new ImageNode($this->currentParent, $this->currentParent->attributes); + $this->currentParent->children[] = $img; + $img->whiteBefore = $this->whiteSpaceBeforeThis; $this->lastSibling = $img; $this->textNodes[] = $img; } $this->endWord(); - if ($this->currentParent->isInline()) { + if (!array_key_exists(strtolower($this->currentParent->qName),TagNode::$blocks)) { $this->lastSibling = $this->currentParent; } else { - $this->lastSibling = NULL; + $this->lastSibling = new DummyNode(); } - $this->currentParent = $this->currentParent->getParent(); + $this->currentParent = $this->currentParent->parent; $this->whiteSpaceBeforeThis = false; + if(!$this->notInPre && strcasecmp($name, 'pre')==0){ + $this->notInPre = true; + } }else{ $this->endDocument(); } } + const regex = '/([\s\.\,\"\\\'\(\)\?\:\;\!\{\}\-\+\*\=\_\[\]\&\|\$]{1})/'; + const whitespace = '/^[\s]{1}$/'; + const delimiter = '/^[\s\.\,\"\\\'\(\)\?\:\;\!\{\}\-\+\*\=\_\[\]\&\|\$]{1}$/'; + public function characters($parser, $data){ - //wfDebug('Parsing '. strlen($data).' characters.'); - $array = str_split($data); - foreach($array as $c) { - if (self::isDelimiter($c)) { + $matches = preg_split(self::regex,$data,-1,PREG_SPLIT_DELIM_CAPTURE); + + foreach($matches as $word){ + if(preg_match(self::whitespace, $word) && $this->notInPre){ $this->endWord(); - if (WhiteSpaceNode::isWhiteSpace($c) && !$this->currentParent->isPre() - && !$this->currentParent->inPre()) { - if (!is_null($this->lastSibling)){ - $this->lastSibling->setWhiteAfter(true); - } - $this->whiteSpaceBeforeThis = true; - } else { - $textNode = new TextNode($this->currentParent, $c); - $textNode->setWhiteBefore($this->whiteSpaceBeforeThis); - $this->whiteSpaceBeforeThis = false; - $this->lastSibling = $textNode; - $this->textNodes[] = $textNode; - } - } else { - $this->newWord .= $c; + $this->lastSibling->whiteAfter = true; + $this->whiteSpaceBeforeThis = true; + }else if(preg_match(self::delimiter, $word)){ + $this->endWord(); + $textNode = new TextNode($this->currentParent, $word); + $this->currentParent->children[] = $textNode; + $textNode->whiteBefore = $this->whiteSpaceBeforeThis; + $this->whiteSpaceBeforeThis = false; + $this->lastSibling = $textNode; + $this->textNodes[] = $textNode; + }else{ + $this->newWord .= $word; } - } } private function endWord() { - if (strlen($this->newWord) > 0) { + if (!empty($this->newWord)) { $node = new TextNode($this->currentParent, $this->newWord); - $node->setWhiteBefore($this->whiteSpaceBeforeThis); + $this->currentParent->children[] = $node; + $node->whiteBefore = $this->whiteSpaceBeforeThis; $this->whiteSpaceBeforeThis = false; $this->lastSibling = $node; $this->textNodes[] = $node; @@ -795,39 +607,6 @@ class DomTreeBuilder { } } - public static function isDelimiter($c) { - switch ($c) { - // Basic Delimiters - case '/': - case '.': - case '!': - case ',': - case ';': - case '?': - case '=': - case '\'': - case '"': - // Extra Delimiters - case '[': - case ']': - case '{': - case '}': - case '(': - case ')': - case '&': - case '|': - case '\\': - case '-': - case '_': - case '+': - case '*': - case ':': - return true; - default: - return WhiteSpaceNode::isWhiteSpace($c); - } - } - public function getDiffLines(){ return array_map(array('TextNode','toDiffLine'), $this->textNodes); } @@ -836,7 +615,7 @@ class DomTreeBuilder { class TextNodeDiffer{ private $textNodes; - private $bodyNode; + public $bodyNode; private $oldTextNodes; private $oldBodyNode; @@ -844,14 +623,10 @@ class TextNodeDiffer{ private $lastModified = array(); function __construct(DomTreeBuilder $tree, DomTreeBuilder $oldTree) { - $this->textNodes = $tree->getTextNodes(); - $this->bodyNode = $tree->getBodyNode(); - $this->oldTextNodes = $oldTree->getTextNodes(); - $this->oldBodyNode = $oldTree->getBodyNode(); - } - - public function getBodyNode() { - return $this->bodyNode; + $this->textNodes = $tree->textNodes; + $this->bodyNode = $tree->bodyNode; + $this->oldTextNodes = $oldTree->textNodes; + $this->oldBodyNode = $oldTree->bodyNode; } private $newID = 0; @@ -861,26 +636,26 @@ class TextNodeDiffer{ return; if ($this->whiteAfterLastChangedPart) - $this->textNodes[$start]->setWhiteBefore(false); + $this->textNodes[$start]->whiteBefore = false; $nextLastModified = array(); for ($i = $start; $i < $end; ++$i) { $mod = new Modification(Modification::ADDED); - $mod->setID($this->newID); + $mod->id = $this->newID; if (sizeof($this->lastModified) > 0) { - $mod->setPrevious($this->lastModified[0]); - if (is_null($this->lastModified[0]->getNext())) { + $mod->prevMod = $this->lastModified[0]; + if (is_null($this->lastModified[0]->nextMod )) { foreach ($this->lastModified as $lastMod) { - $lastMod->setNext($mod); + $lastMod->nextMod = $mod; } } } $nextLastModified[] = $mod; - $this->textNodes[$i]->setModification($mod); + $this->textNodes[$i]->modification = $mod; } if ($start < $end) { - $this->textNodes[$start]->getModification()->setFirstOfID(true); + $this->textNodes[$start]->modification->firstOfID = true; } ++$this->newID; $this->lastModified = $nextLastModified; @@ -909,18 +684,18 @@ class TextNodeDiffer{ unset($acthis, $acother); $nbLastModified = sizeof($this->lastModified); - if ($result->isChanged()) { + if ($result->changed) { $mod = new Modification(Modification::CHANGED); if (!$this->changedIDUsed) { - $mod->setFirstOfID(true); + $mod->firstOfID = true; if (sizeof($nextLastModified) > 0) { $this->lastModified = $nextLastModified; $nextLastModified = array(); } - } else if (!is_null($result->getChanges()) && $result->getChanges() != $this->changes) { + } else if (!is_null($result->changes) && $result->changes !== $this->changes) { ++$this->changedID; - $mod->setFirstOfID(true); + $mod->firstOfID = true; if (sizeof($nextLastModified) > 0) { $this->lastModified = $nextLastModified; $nextLastModified = array(); @@ -928,20 +703,20 @@ class TextNodeDiffer{ } if ($nbLastModified > 0) { - $mod->setPrevious($this->lastModified[0]); - if (is_null($this->lastModified[0]->getNext())) { + $mod->prevMod = $this->lastModified[0]; + if (is_null($this->lastModified[0]->nextMod )) { foreach ($this->lastModified as $lastMod) { - $lastMod->setNext($mod); + $lastMod->nextMod = $mod; } } } $nextLastModified[] = $mod; - $mod->setChanges($result->getChanges()); - $mod->setID($this->changedID); + $mod->changes = $result->changes; + $mod->id = $this->changedID; - $this->textNodes[$i]->setModification($mod); - $this->changes = $result->getChanges(); + $this->textNodes[$i]->modification = $mod; + $this->changes = $result->changes; $this->changedIDUsed = true; } else if ($this->changedIDUsed) { ++$this->changedID; @@ -965,7 +740,7 @@ class TextNodeDiffer{ if ($end <= $start) return; - if ($before > 0 && $this->textNodes[$before - 1]->isWhiteAfter()) { + if ($before > 0 && $this->textNodes[$before - 1]->whiteAfter) { $this->whiteAfterLastChangedPart = true; } else { $this->whiteAfterLastChangedPart = false; @@ -975,12 +750,12 @@ class TextNodeDiffer{ for ($i = $start; $i < $end; ++$i) { $mod = new Modification(Modification::REMOVED); - $mod->setID($this->deletedID); + $mod->id = $this->deletedID; if (sizeof($this->lastModified) > 0) { - $mod->setPrevious($this->lastModified[0]); - if (is_null($this->lastModified[0]->getNext())) { + $mod->prevMod = $this->lastModified[0]; + if (is_null($this->lastModified[0]->nextMod )) { foreach ($this->lastModified as $lastMod) { - $lastMod->setNext($mod); + $lastMod->nextMod = $mod; } } } @@ -989,11 +764,13 @@ class TextNodeDiffer{ // oldTextNodes is used here because we're going to move its deleted // elements // to this tree! - $this->oldTextNodes[$i]->setModification($mod); + $this->oldTextNodes[$i]->modification = $mod; } - $this->oldTextNodes[$start]->getModification()->setFirstOfID(true); + $this->oldTextNodes[$start]->modification->firstOfID = true; + + $root = $this->oldTextNodes[$start]->getLastCommonParent($this->oldTextNodes[$end-1])->parent; - $deletedNodes = $this->oldBodyNode->getMinimalDeletedSet($this->deletedID); + $deletedNodes = $root->getMinimalDeletedSet($this->deletedID, $junk1, $junk2); //wfDebug("Minimal set of deleted nodes of size " . sizeof($deletedNodes)); @@ -1013,63 +790,63 @@ class TextNodeDiffer{ $prevResult = $prevLeaf->getLastCommonParent($deletedNodes[0]); } else { $prevResult = new LastCommonParentResult(); - $prevResult->setLastCommonParent($this->getBodyNode()); - $prevResult->setIndexInLastCommonParent(0); + $prevResult->parent = $this->bodyNode; + $prevResult->indexInLastCommonParent = 0; } if (isset($nextleaf)) { $nextResult = $nextLeaf->getLastCommonParent($deletedNodes[sizeof($deletedNodes) - 1]); } else { $nextResult = new LastCommonParentResult(); - $nextResult->setLastCommonParent($this->getBodyNode()); - $nextResult->setIndexInLastCommonParent($this->getBodyNode()->getNbChildren()); + $nextResult->parent = $this->bodyNode; + $nextResult->indexInLastCommonParent = $this->bodyNode->getNbChildren(); } - if ($prevResult->getLastCommonParentDepth() == $nextResult->getLastCommonParentDepth()) { + if ($prevResult->lastCommonParentDepth == $nextResult->lastCommonParentDepth) { // We need some metric to choose which way to add-... - if ($deletedNodes[0]->getParent() === $deletedNodes[sizeof($deletedNodes) - 1]->getParent() - && $prevResult->getLastCommonParent() === $nextResult->getLastCommonParent()) { + if ($deletedNodes[0]->parent === $deletedNodes[sizeof($deletedNodes) - 1]->parent + && $prevResult->parent === $nextResult->parent) { // The difference is not in the parent - $prevResult->setLastCommonParentDepth($prevResult->getLastCommonParentDepth() + 1); + $prevResult->lastCommonParentDepth = $prevResult->lastCommonParentDepth + 1; } else { // The difference is in the parent, so compare them // now THIS is tricky - $distancePrev = $deletedNodes[0]->getParent()->getMatchRatio($prevResult->getLastCommonParent()); - $distanceNext = $deletedNodes[sizeof($deletedNodes) - 1]->getParent()->getMatchRatio($nextResult->getLastCommonParent()); + $distancePrev = $deletedNodes[0]->parent->getMatchRatio($prevResult->parent); + $distanceNext = $deletedNodes[sizeof($deletedNodes) - 1]->parent->getMatchRatio($nextResult->parent); if ($distancePrev <= $distanceNext) { - $prevResult->setLastCommonParentDepth($prevResult->getLastCommonParentDepth() + 1); + $prevResult->lastCommonParentDepth = $prevResult->lastCommonParentDepth + 1; } else { - $nextResult->setLastCommonParentDepth($nextResult->getLastCommonParentDepth() + 1); + $nextResult->lastCommonParentDepth = $nextResult->lastCommonParentDepth + 1; } } } - if ($prevResult->getLastCommonParentDepth() > $nextResult->getLastCommonParentDepth()) { + if ($prevResult->lastCommonParentDepth > $nextResult->lastCommonParentDepth) { // Inserting at the front - if ($prevResult->isSplittingNeeded()) { - $prevLeaf->getParent()->splitUntill($prevResult->getLastCommonParent(), $prevLeaf, true); + if ($prevResult->splittingNeeded) { + $prevLeaf->parent->splitUntill($prevResult->parent, $prevLeaf, true); } $prevLeaf = $deletedNodes[0]->copyTree(); unset($deletedNodes[0]); $deletedNodes = array_values($deletedNodes); - $prevLeaf->setParent($prevResult->getLastCommonParent()); - $prevResult->getLastCommonParent()->addChild($prevLeaf,$prevResult->getIndexInLastCommonParent() + 1); - } else if ($prevResult->getLastCommonParentDepth() < $nextResult->getLastCommonParentDepth()) { + $prevLeaf->setParent($prevResult->parent); + $prevResult->parent->addChildAbsolute($prevLeaf,$prevResult->indexInLastCommonParent + 1); + } else if ($prevResult->lastCommonParentDepth < $nextResult->lastCommonParentDepth) { // Inserting at the back - if ($nextResult->isSplittingNeeded()) { - $splitOccured = $nextLeaf->getParent()->splitUntill($nextResult->getLastCommonParent(), $nextLeaf, false); + if ($nextResult->splittingNeeded) { + $splitOccured = $nextLeaf->parent->splitUntill($nextResult->parent, $nextLeaf, false); if ($splitOccured) { // The place where to insert is shifted one place to the // right - $nextResult->setIndexInLastCommonParent($nextResult->getIndexInLastCommonParent() + 1); + $nextResult->indexInLastCommonParent = $nextResult->indexInLastCommonParent + 1; } } $nextLeaf = $deletedNodes[sizeof(deletedNodes) - 1]->copyTree(); unset($deletedNodes[sizeof(deletedNodes) - 1]); $deletedNodes = array_values($deletedNodes); - $nextLeaf->setParent($nextResult->getLastCommonParent()); - $nextResult->getLastCommonParent()->addChild($nextLeaf,$nextResult->getIndexInLastCommonParent()); + $nextLeaf->setParent($nextResult->parent); + $nextResult->parent->addChildAbsolute($nextLeaf,$nextResult->indexInLastCommonParent); } else throw new Exception("Uh?"); } @@ -1078,7 +855,7 @@ class TextNodeDiffer{ } public function expandWhiteSpace() { - $this->getBodyNode()->expandWhiteSpace(); + $this->bodyNode->expandWhiteSpace(); } public function lengthNew(){ @@ -1091,14 +868,15 @@ class TextNodeDiffer{ } class HTMLDiffer{ - + private $output; - + function __construct($output){ $this->output = $output; } function htmlDiff($from, $to){ + wfProfileIn( __METHOD__ ); // Create an XML parser $xml_parser = xml_parser_create(''); @@ -1110,12 +888,11 @@ class HTMLDiffer{ // Set the function to handle blocks of character data xml_set_character_data_handler($xml_parser, array($domfrom,"characters")); - ; - //wfDebug('Parsing '.strlen($from)." characters worth of HTML"); + //wfDebug('Parsing '.strlen($from)." characters worth of HTML\n"); if (!xml_parse($xml_parser, ''.Sanitizer::hackDocType().'', FALSE) || !xml_parse($xml_parser, $from, FALSE) || !xml_parse($xml_parser, '', TRUE)){ - wfDebug(sprintf("XML error: %s at line %d",xml_error_string(xml_get_error_code($xml_parser)),xml_get_current_line_number($xml_parser))); + wfDebug(sprintf("XML error: %s at line %d\n",xml_error_string(xml_get_error_code($xml_parser)),xml_get_current_line_number($xml_parser))); } xml_parser_free($xml_parser); unset($from); @@ -1130,19 +907,20 @@ class HTMLDiffer{ // Set the function to handle blocks of character data xml_set_character_data_handler($xml_parser, array($domto,"characters")); - //wfDebug('Parsing '.strlen($to)." characters worth of HTML"); + //wfDebug('Parsing '.strlen($to)." characters worth of HTML\n"); if (!xml_parse($xml_parser, ''.Sanitizer::hackDocType().'', FALSE) || !xml_parse($xml_parser, $to, FALSE) || !xml_parse($xml_parser, '', TRUE)){ - wfDebug(sprintf("XML error in HTML diff: %s at line %d",xml_error_string(xml_get_error_code($xml_parser)),xml_get_current_line_number($xml_parser))); + wfDebug(sprintf("XML error in HTML diff: %s at line %d\n",xml_error_string(xml_get_error_code($xml_parser)),xml_get_current_line_number($xml_parser))); } xml_parser_free($xml_parser); unset($to); - $diffengine = new _DiffEngine(); + $diffengine = new WikiDiff3(); $differences = $this->preProcess($diffengine->diff_range($domfrom->getDiffLines(), $domto->getDiffLines())); unset($xml_parser,$diffengine); + $domdiffer = new TextNodeDiffer($domto, $domfrom); $currentIndexLeft = 0; @@ -1160,13 +938,14 @@ class HTMLDiffer{ $currentIndexLeft = $d->leftend; $currentIndexRight = $d->rightend; } - if ($currentIndexLeft < $domdiffer->lengthOld()) { - $domdiffer->handlePossibleChangedPart($currentIndexLeft,$domdiffer->lengthOld(), $currentIndexRight,$domdiffer->lengthNew()); + $oldLength = $domdiffer->lengthOld(); + if ($currentIndexLeft < $oldLength) { + $domdiffer->handlePossibleChangedPart($currentIndexLeft,$oldLength, $currentIndexRight,$domdiffer->lengthNew()); } - $domdiffer->expandWhiteSpace(); $output = new HTMLOutput('htmldiff', $this->output); - $output->parse($domdiffer->getBodyNode()); + $output->parse($domdiffer->bodyNode); + wfProfileOut( __METHOD__ ); } private function preProcess(/*array*/ $differences){ @@ -1244,39 +1023,23 @@ class TextOnlyComparator{ if($nbOthers == 0 || $nbThis == 0){ return -log(0); } - - $diffengine = new _DiffEngine(); - $diffengine->diff_local($this->leafs, $other->leafs); - - $distanceThis = array_sum($diffengine->xchanged); - $distanceOther = array_sum($diffengine->ychanged); - return ((0.0 + $distanceOther) / $nbOthers + (0.0 + $distanceThis) - / $nbThis) / 2.0; - } -} + $diffengine = new WikiDiff3(25000,1.35); + $diffengine->diff($this->leafs, $other->leafs); -class AncestorComparatorResult { + $lcsLength = $diffengine->getLcsLength(); - private $changed = false; + $distanceThis = $nbThis-$lcsLength; - private $changes = ""; - - public function isChanged() { - return $this->changed; + return (2.0 - $lcsLength/$nbOthers - $lcsLength/$nbThis) / 2.0; } +} - public function setChanged($changed) { - $this->changed = $changed; - } +class AncestorComparatorResult { - public function getChanges() { - return $this->changes; - } + public $changed = false; - public function setChanges($changes) { - $this->changes = $changes; - } + public $changes = ""; } /** @@ -1292,16 +1055,12 @@ class AncestorComparator{ $this->ancestorsText = array_map(array('TagNode','toDiffLine'), $ancestors); } - private $compareTxt = ""; - - public function getCompareTxt() { - return $this->compareTxt; - } + public $compareTxt = ""; public function getResult(AncestorComparator $other) { $result = new AncestorComparatorResult(); - $diffengine = new _DiffEngine(); + $diffengine = new WikiDiff3(10000,1.35); $differences = $diffengine->diff_range($this->ancestorsText, $other->ancestorsText); if (sizeof($differences) == 0){ @@ -1309,8 +1068,8 @@ class AncestorComparator{ } $changeTxt = new ChangeTextGenerator($this, $other); - $result->setChanged(true); - $result->setChanges($changeTxt->getChanged($differences)->toString()); + $result->changed = true; + $result->changes = $changeTxt->getChanged($differences)->toString(); return $result; } @@ -1491,11 +1250,11 @@ class TagToStringFactory { const UNKNOWN = 4; public function create(TagNode $node) { - $sem = $this->getChangeSemantic($node->getQName()); - if (0 == strcasecmp($node->getQName(),'a')){ + $sem = $this->getChangeSemantic($node->qName); + if (0 == strcasecmp($node->qName,'a')){ return new AnchorToString($node, $sem); } - if (0 == strcasecmp($node->getQName(),'img')){ + if (0 == strcasecmp($node->qName,'img')){ return new NoContentTagToString($node, $sem); } return new TagToString($node, $sem); @@ -1524,11 +1283,10 @@ class TagToString { } public function getDescription() { - return $this->getString('diff-' . $this->node->getQName()); + return $this->getString('diff-' . $this->node->qName); } public function getRemovedDescription(ChangeText $txt) { - if ($this->sem == TagToStringFactory::MOVED) { $txt->addText($this->getMovedOutOf() . ' ' . strtolower($this->getArticle()) . ' '); $txt->addHtml(''); @@ -1545,12 +1303,11 @@ class TagToString { $txt->addHtml(''); $txt->addText(' ' . strtolower($this->getRemoved())); } - $this->addAttributes($txt, $this->node->getAttributes()); + $this->addAttributes($txt, $this->node->attributes); $txt->addText('.'); } public function getAddedDescription(ChangeText $txt) { - if ($this->sem == TagToStringFactory::MOVED) { $txt->addText($this->getMovedTo() . ' ' . strtolower($this->getArticle()) . ' '); $txt->addHtml(''); @@ -1567,7 +1324,7 @@ class TagToString { $txt->addHtml(''); $txt->addText(' ' . strtolower($this->getAdded())); } - $this->addAttributes($txt, $this->node->getAttributes()); + $this->addAttributes($txt, $this->node->attributes); $txt->addText('.'); } @@ -1648,7 +1405,7 @@ class TagToString { } protected function getArticle() { - return $this->getString('diff-' . $this->node->getQName() . '-article'); + return $this->getString('diff-' . $this->node->qName . '-article'); } public static $bundle = array( @@ -1756,7 +1513,7 @@ class NoContentTagToString extends TagToString { $txt.addText(strtolower($this->getDescription())); $txt.addHtml(''); - $this->addAttributes($txt, $this->node->getAttributes()); + $this->addAttributes($txt, $this->node->attributes); $txt.addText('.'); } @@ -1770,7 +1527,7 @@ class NoContentTagToString extends TagToString { $txt.addText(strtolower($this->getDescription())); $txt.addHtml(''); - $this->addAttributes($txt, $this->node->getAttributes()); + $this->addAttributes($txt, $this->node->attributes); $txt.addText('.'); } @@ -1813,9 +1570,10 @@ class HTMLOutput{ } public function parse(TagNode $node) { + $handler = &$this->handler; - if (0 != strcasecmp($node->getQName(),'img') && 0 != strcasecmp($node->getQName(),'body')) { - $this->handler->startElement($node->getQName(), $node->getAttributes()); + if (0 != strcasecmp($node->qName,'img') && 0 != strcasecmp($node->qName,'body')) { + $handler->startElement($node->qName, $node->attributes); } $newStarted = false; @@ -1826,100 +1584,100 @@ class HTMLOutput{ foreach ($node->children as $child) { if ($child instanceof TagNode) { if ($newStarted) { - $this->handler->endElement('span'); + $handler->endElement('span'); $newStarted = false; } else if ($changeStarted) { - $this->handler->endElement('span'); + $handler->endElement('span'); $changeStarted = false; } else if ($remStarted) { - $this->handler->endElement('span'); + $handler->endElement('span'); $remStarted = false; } $this->parse($child); } else if ($child instanceof TextNode) { - $mod = $child->getModification(); + $mod = $child->modification; - if ($newStarted && ($mod->getType() != Modification::ADDED || $mod->isFirstOfID())) { - $this->handler->endElement('span'); + if ($newStarted && ($mod->type != Modification::ADDED || $mod->firstOfID)) { + $handler->endElement('span'); $newStarted = false; - } else if ($changeStarted && ($mod->getType() != Modification::CHANGED || $mod->getChanges() != $changeTXT || $mod->isFirstOfID())) { - $this->handler->endElement('span'); + } else if ($changeStarted && ($mod->type != Modification::CHANGED || $mod->changes != $changeTXT || $mod->firstOfID)) { + $handler->endElement('span'); $changeStarted = false; - } else if ($remStarted && ($mod->getType() != Modification::REMOVED || $mod ->isFirstOfID())) { - $this->handler->endElement('span'); + } else if ($remStarted && ($mod->type != Modification::REMOVED || $mod ->firstOfID)) { + $handler->endElement('span'); $remStarted = false; } // no else because a removed part can just be closed and a new // part can start - if (!$newStarted && $mod->getType() == Modification::ADDED) { + if (!$newStarted && $mod->type == Modification::ADDED) { $attrs = array('class'=>'diff-html-added'); - if ($mod->isFirstOfID()) { - $attrs['id'] = 'added-' . $this->prefix . '-' . $mod->getID(); + if ($mod->firstOfID) { + $attrs['id'] = 'added-' . $this->prefix . '-' . $mod->id; } $this->addAttributes($mod, $attrs); $attrs['onclick'] = 'return tipA(constructToolTipA(this));'; - $this->handler->startElement('span', $attrs); + $handler->startElement('span', $attrs); $newStarted = true; - } else if (!$changeStarted && $mod->getType() == Modification::CHANGED) { + } else if (!$changeStarted && $mod->type == Modification::CHANGED) { $attrs = array('class'=>'diff-html-changed'); - if ($mod->isFirstOfID()) { - $attrs['id'] = 'changed-' . $this->prefix . '-' . $mod->getID(); + if ($mod->firstOfID) { + $attrs['id'] = 'changed-' . $this->prefix . '-' . $mod->id; } $this->addAttributes($mod, $attrs); $attrs['onclick'] = 'return tipC(constructToolTipC(this));'; - $this->handler->startElement('span', $attrs); - + $handler->startElement('span', $attrs); + //tooltip - $this->handler->startElement('span', array('class'=>'tip')); - $this->handler->characters($mod->getChanges()); - $this->handler->endElement('span'); - + $handler->startElement('span', array('class'=>'tip')); + $handler->characters($mod->changes); + $handler->endElement('span'); + $changeStarted = true; - $changeTXT = $mod->getChanges(); - } else if (!$remStarted && $mod->getType() == Modification::REMOVED) { + $changeTXT = $mod->changes; + } else if (!$remStarted && $mod->type == Modification::REMOVED) { $attrs = array('class'=>'diff-html-removed'); - if ($mod->isFirstOfID()) { - $attrs['id'] = 'removed-' . $this->prefix . '-' . $mod->getID(); + if ($mod->firstOfID) { + $attrs['id'] = 'removed-' . $this->prefix . '-' . $mod->id; } $this->addAttributes($mod, $attrs); $attrs['onclick'] = 'return tipR(constructToolTipR(this));'; - $this->handler->startElement('span', $attrs); + $handler->startElement('span', $attrs); $remStarted = true; } - $chars = $child->getText(); + $chars = $child->text; if ($child instanceof ImageNode) { $this->writeImage($child); } else { - $this->handler->characters($chars); + $handler->characters($chars); } } } if ($newStarted) { - $this->handler->endElement('span'); + $handler->endElement('span'); $newStarted = false; } else if ($changeStarted) { - $this->handler->endElement('span'); + $handler->endElement('span'); $changeStarted = false; } else if ($remStarted) { - $this->handler->endElement('span'); + $handler->endElement('span'); $remStarted = false; } - if (0 != strcasecmp($node->getQName(),'img') - && 0 != strcasecmp($node->getQName(),'body')) - $this->handler->endElement($node->getQName()); + if (0 != strcasecmp($node->qName,'img') + && 0 != strcasecmp($node->qName,'body')) + $handler->endElement($node->qName); } private function writeImage(ImageNode $imgNode){ - $attrs = $imgNode->getAttributes(); - if ($imgNode->getModification()->getType() == Modification::REMOVED) + $attrs = $imgNode->attributes; + if ($imgNode->modification->type == Modification::REMOVED) $attrs['changeType']='diff-removed-image'; - else if ($imgNode->getModification()->getType() == Modification::ADDED) + else if ($imgNode->modification->type == Modification::ADDED) $attrs['changeType'] = 'diff-added-image'; $attrs['onload'] = 'updateOverlays()'; $attrs['onError'] = 'updateOverlays()'; @@ -1929,22 +1687,22 @@ class HTMLOutput{ } private function addAttributes(Modification $mod, /*array*/ &$attrs) { - if (is_null($mod->getPrevious())) { + if (is_null($mod->prevMod)) { $previous = 'first-' . $this->prefix; } else { - $previous = Modification::typeToString($mod->getPrevious()->getType()) . '-' . $this->prefix . '-' - . $mod->getPrevious()->getID(); + $previous = Modification::typeToString($mod->prevMod->type) . '-' . $this->prefix . '-' + . $mod->prevMod->id; } $attrs['previous'] = $previous; - $changeId = Modification::typeToString($mod->getType()) . '-' + $this->prefix . '-' . $mod->getID(); + $changeId = Modification::typeToString($mod->type) . '-' + $this->prefix . '-' . $mod->id; $attrs['changeId'] = $changeId; - if (is_null($mod->getNext())) { + if (is_null($mod->nextMod )) { $next = 'last-' . $this->prefix; } else { - $next = Modification::typeToString($mod->getNext()->getType()) . '-' . $this->prefix . '-' - . $mod->getNext()->getID(); + $next = Modification::typeToString($mod->nextMod ->type) . '-' . $this->prefix . '-' + . $mod->nextMod ->id; } $attrs['next'] = $next; } @@ -1971,9 +1729,9 @@ class EchoingContentHandler{ } class DelegatingContentHandler{ - + private $delegate; - + function __construct($delegate){ $this->delegate=$delegate; } -- 2.20.1