3 * A PHP diff engine for phpwiki. (Taken from phpwiki-1.3.3)
5 * Copyright © 2000, 2001 Geoffrey T. Dairiki <dairiki@dairiki.org>
6 * You may copy this code freely under the conditions of the GPL.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
24 * @ingroup DifferenceEngine
25 * @defgroup DifferenceEngine DifferenceEngine
/**
 * The base class for all other DiffOp classes.
 *
 * The classes that extend DiffOp are: DiffOpCopy, DiffOpDelete, DiffOpAdd and
 * DiffOpChange. FakeDiffOp also extends DiffOp, but it is not located in this file.
 *
 * NOTE(review): the property declarations and the bodies of getType(),
 * getOrig() and parts of getClosing() were not visible in the reviewed
 * listing and are reconstructed from their names and call sites -- confirm.
 *
 * @ingroup DifferenceEngine
 */
abstract class DiffOp
{
    /**
     * Kind of edit. The subclasses in this file use 'copy', 'delete', 'add'
     * and 'change'.
     *
     * @var string
     */
    public $type;

    /**
     * Lines taken from the "from" sequence, or false when the op has none.
     *
     * @var string[]|false
     */
    public $orig;

    /**
     * Lines taken from the "to" sequence, or false when the op has none.
     *
     * @var string[]|false
     */
    public $closing;

    /**
     * @return string
     */
    public function getType() {
        return $this->type;
    }

    /**
     * @return string[]|false
     */
    public function getOrig() {
        return $this->orig;
    }

    /**
     * Get all closing lines, or a single one by index.
     *
     * @param int|null $i Index of the wanted line; null returns the whole set.
     *
     * @return string[]|string|false|null The whole closing set when $i is
     *   null; otherwise the line at $i, or null when there is no such line.
     */
    public function getClosing( $i = null ) {
        if ( $i === null ) {
            return $this->closing;
        }

        // $closing is false for ops without a "to" side (e.g. DiffOpDelete);
        // array_key_exists() rejects non-arrays, so check the type first.
        if ( is_array( $this->closing ) && array_key_exists( $i, $this->closing ) ) {
            return $this->closing[$i];
        }

        return null;
    }

    /**
     * Build the op that undoes this one (from/to sides swapped).
     *
     * @return DiffOp
     */
    abstract public function reverse();

    /**
     * @return int Number of lines on the "from" side (0 when there is none).
     */
    public function norig() {
        return $this->orig ? count( $this->orig ) : 0;
    }

    /**
     * @return int Number of lines on the "to" side (0 when there is none).
     */
    public function nclosing() {
        return $this->closing ? count( $this->closing ) : 0;
    }
}
/**
 * Extends DiffOp. Used to mark strings that have been
 * copied from one string array to the other.
 *
 * @ingroup DifferenceEngine
 */
class DiffOpCopy extends DiffOp
{
    public $type = 'copy';

    /**
     * @param string[] $orig Lines on the "from" side.
     * @param string[]|false $closing Lines on the "to" side. Any non-array
     *   value (the default) means "same as $orig".
     */
    public function __construct( $orig, $closing = false ) {
        $this->orig = $orig;
        $this->closing = is_array( $closing ) ? $closing : $orig;
    }

    /**
     * @return DiffOpCopy The same copy with its two sides swapped.
     */
    public function reverse() {
        return new DiffOpCopy( $this->closing, $this->orig );
    }
}
/**
 * Extends DiffOp. Used to mark strings that have been
 * deleted from the first string array.
 *
 * @ingroup DifferenceEngine
 */
class DiffOpDelete extends DiffOp
{
    public $type = 'delete';

    /**
     * @param string[] $lines The deleted lines.
     */
    public function __construct( $lines ) {
        $this->orig = $lines;
        // A deletion has no "to" side.
        $this->closing = false;
    }

    /**
     * @return DiffOpAdd Undoing a deletion means adding the same lines back.
     */
    public function reverse() {
        return new DiffOpAdd( $this->orig );
    }
}
/**
 * Extends DiffOp. Used to mark strings that have been
 * added in the second string array.
 *
 * @ingroup DifferenceEngine
 */
class DiffOpAdd extends DiffOp
{
    public $type = 'add';

    /**
     * @param string[] $lines The added lines.
     */
    public function __construct( $lines ) {
        $this->closing = $lines;
        // An addition has no "from" side (mirrors DiffOpDelete).
        $this->orig = false;
    }

    /**
     * @return DiffOpDelete Undoing an addition means deleting the same lines.
     */
    public function reverse() {
        return new DiffOpDelete( $this->closing );
    }
}
/**
 * Extends DiffOp. Used to mark strings that have been
 * changed from the first string array (both added and subtracted).
 *
 * @ingroup DifferenceEngine
 */
class DiffOpChange extends DiffOp
{
    public $type = 'change';

    /**
     * @param string[] $orig Lines removed from the "from" side.
     * @param string[] $closing Lines that replace them on the "to" side.
     */
    public function __construct( $orig, $closing ) {
        $this->orig = $orig;
        $this->closing = $closing;
    }

    /**
     * @return DiffOpChange The same change with its two sides swapped.
     */
    public function reverse() {
        return new DiffOpChange( $this->closing, $this->orig );
    }
}
195 * Class used internally by Diff to actually compute the diffs.
197 * The algorithm used here is mostly lifted from the perl module
198 * Algorithm::Diff (version 1.06) by Ned Konz, which is available at:
199 * http://www.perl.com/CPAN/authors/id/N/NE/NEDKONZ/Algorithm-Diff-1.06.zip
201 * More ideas are taken from:
202 * http://www.ics.uci.edu/~eppstein/161/960229.html
204 * Some ideas (and a bit of code) are from analyze.c, from GNU
205 * diffutils-2.7, which can be found at:
206 * ftp://gnudist.gnu.org/pub/gnu/diffutils/diffutils-2.7.tar.gz
208 * Finally, some ideas (subdivision by NCHUNKS > 2, and some optimizations)
211 * Line length limits for robustness added by Tim Starling, 2005-08-31
212 * Alternative implementation added by Guy Van den Broeck, 2008-07-30
214 * @author Geoffrey T. Dairiki, Tim Starling, Guy Van den Broeck
216 * @ingroup DifferenceEngine
// Not referenced by any method visible in this part of the file.
const MAX_XREF_LENGTH = 10000;

// Per-line "changed" flags for the from / to sequences. They are filled by
// diffLocal() and then adjusted in place by shiftBoundaries().
protected $xchanged, $ychanged;

// NOTE(review): the remaining members are not read or written by any method
// visible in this part of the file -- confirm whether they are still needed.
protected $xv = [], $yv = [];
protected $xind = [], $yind = [];

protected $seq = [], $in_seq = [];
/**
 * Compute the list of edit operations that turns $from_lines into $to_lines.
 *
 * NOTE(review): the initialisers ($edits, $xi, $yi, $copy, $delete, $add),
 * the `++$yi` step, the `if ( $copy )` / `elseif ( $add )` guards and the
 * final return were not visible in the reviewed listing and are
 * reconstructed -- confirm.
 *
 * @param string[] $from_lines
 * @param string[] $to_lines
 *
 * @return DiffOp[] One DiffOpCopy per run of matching lines and one
 *   DiffOpChange, DiffOpDelete or DiffOpAdd per changed region, in order.
 */
public function diff( $from_lines, $to_lines ) {
    // Diff and store locally
    $this->diffLocal( $from_lines, $to_lines );

    // Merge edits when possible
    $this->shiftBoundaries( $from_lines, $this->xchanged, $this->ychanged );
    $this->shiftBoundaries( $to_lines, $this->ychanged, $this->xchanged );

    // Compute the edit operations.
    $n_from = count( $from_lines );
    $n_to = count( $to_lines );

    $edits = [];
    $xi = 0;
    $yi = 0;
    while ( $xi < $n_from || $yi < $n_to ) {
        // Once one side is exhausted, the rest of the other must be changed.
        assert( $yi < $n_to || $this->xchanged[$xi] );
        assert( $xi < $n_from || $this->ychanged[$yi] );

        // Skip matching "snake".
        $copy = [];
        while ( $xi < $n_from && $yi < $n_to
            && !$this->xchanged[$xi] && !$this->ychanged[$yi]
        ) {
            $copy[] = $from_lines[$xi++];
            ++$yi;
        }
        if ( $copy ) {
            $edits[] = new DiffOpCopy( $copy );
        }

        // Find deletes & adds.
        $delete = [];
        while ( $xi < $n_from && $this->xchanged[$xi] ) {
            $delete[] = $from_lines[$xi++];
        }

        $add = [];
        while ( $yi < $n_to && $this->ychanged[$yi] ) {
            $add[] = $to_lines[$yi++];
        }

        if ( $delete && $add ) {
            $edits[] = new DiffOpChange( $delete, $add );
        } elseif ( $delete ) {
            $edits[] = new DiffOpDelete( $delete );
        } elseif ( $add ) {
            $edits[] = new DiffOpAdd( $add );
        }
    }

    return $edits;
}
/**
 * Run the line-level comparison and store its per-line change flags in
 * $this->xchanged (from side) and $this->ychanged (to side).
 *
 * The comparison itself is delegated to WikiDiff3, which is not defined in
 * this part of the file.
 *
 * @param string[] $from_lines
 * @param string[] $to_lines
 */
private function diffLocal( $from_lines, $to_lines ) {
    $wikidiff3 = new WikiDiff3();
    $wikidiff3->diff( $from_lines, $to_lines );

    $this->xchanged = $wikidiff3->removed;
    $this->ychanged = $wikidiff3->added;
}
/**
 * Adjust inserts/deletes of identical lines to join changes
 * as much as possible.
 *
 * We do something when a run of changed lines include a
 * line at one end and has an excluded, identical line at the other.
 * We are free to choose which identical line is included.
 * `compareseq' usually chooses the one at the beginning,
 * but usually it is cleaner to consider the following identical line
 * to be the "change".
 *
 * This is extracted verbatim from analyze.c (GNU diffutils-2.7).
 *
 * NOTE(review): the counter initialisation, the outer loop, the simple
 * increment/decrement statements and the `assert( $j > 0 )` checks were not
 * visible in the reviewed listing; they are reconstructed from the visible
 * loop conditions and the referenced C routine -- confirm.
 *
 * @param string[] $lines Lines of the sequence being adjusted.
 * @param array &$changed Per-line change flags for $lines; edited in place.
 * @param array $other_changed Per-line change flags of the other sequence.
 */
private function shiftBoundaries( $lines, &$changed, $other_changed ) {
    $i = 0;
    $j = 0;

    assert( count( $lines ) == count( $changed ) );
    $len = count( $lines );
    $other_len = count( $other_changed );

    while ( 1 ) {
        /*
         * Scan forwards to find beginning of another run of changes.
         * Also keep track of the corresponding point in the other file.
         *
         * Throughout this code, $i and $j are adjusted together so that
         * the first $i elements of $changed and the first $j elements
         * of $other_changed both contain the same number of zeros
         * (unchanged lines).
         * Furthermore, $j is always kept so that $j == $other_len or
         * $other_changed[$j] == false.
         */
        while ( $j < $other_len && $other_changed[$j] ) {
            $j++;
        }

        while ( $i < $len && !$changed[$i] ) {
            assert( $j < $other_len && ! $other_changed[$j] );
            $i++;
            $j++;
            while ( $j < $other_len && $other_changed[$j] ) {
                $j++;
            }
        }

        if ( $i == $len ) {
            break;
        }

        $start = $i;

        // Find the end of this run of changes.
        while ( ++$i < $len && $changed[$i] ) {
            continue;
        }

        do {
            /*
             * Record the length of this run of changes, so that
             * we can later determine whether the run has grown.
             */
            $runlength = $i - $start;

            /*
             * Move the changed region back, so long as the
             * previous unchanged line matches the last changed one.
             * This merges with previous changed regions.
             */
            while ( $start > 0 && $lines[$start - 1] == $lines[$i - 1] ) {
                $changed[--$start] = 1;
                $changed[--$i] = false;
                while ( $start > 0 && $changed[$start - 1] ) {
                    $start--;
                }
                assert( $j > 0 );
                while ( $other_changed[--$j] ) {
                    continue;
                }
                assert( $j >= 0 && !$other_changed[$j] );
            }

            /*
             * Set CORRESPONDING to the end of the changed run, at the last
             * point where it corresponds to a changed run in the other file.
             * CORRESPONDING == LEN means no such point has been found.
             */
            $corresponding = $j < $other_len ? $i : $len;

            /*
             * Move the changed region forward, so long as the
             * first changed line matches the following unchanged one.
             * This merges with following changed regions.
             * Do this second, so that if there are no merges,
             * the changed region is moved forward as far as possible.
             */
            while ( $i < $len && $lines[$start] == $lines[$i] ) {
                $changed[$start++] = false;
                $changed[$i++] = 1;
                while ( $i < $len && $changed[$i] ) {
                    $i++;
                }

                assert( $j < $other_len && ! $other_changed[$j] );
                $j++;
                if ( $j < $other_len && $other_changed[$j] ) {
                    $corresponding = $i;
                    while ( $j < $other_len && $other_changed[$j] ) {
                        $j++;
                    }
                }
            }
        } while ( $runlength != $i - $start );

        /*
         * If possible, move the fully-merged run of changes
         * back to a corresponding run in the other file.
         */
        while ( $corresponding < $i ) {
            $changed[--$start] = 1;
            $changed[--$i] = 0;
            assert( $j > 0 );
            while ( $other_changed[--$j] ) {
                continue;
            }
            assert( $j >= 0 && !$other_changed[$j] );
        }
    }
}
433 * Class representing a 'diff' between two sequences of strings.
436 * @ingroup DifferenceEngine
/**
 * Computes diff between sequences of strings.
 *
 * @param string[] $from_lines An array of strings.
 *   Typically these are lines from a file.
 * @param string[] $to_lines An array of strings.
 */
public function __construct( $from_lines, $to_lines ) {
    $eng = new DiffEngine();
    $this->edits = $eng->diff( $from_lines, $to_lines );
}
/**
 * @return DiffOp[] The edit operations computed by the constructor.
 */
public function getEdits() {
    return $this->edits;
}
/**
 * Compute reversed Diff.
 *
 * Example:
 *
 *     $diff = new Diff( $lines1, $lines2 );
 *     $rev = $diff->reverse();
 *
 * The result is a clone of this object whose edits are each replaced by
 * their DiffOp::reverse(); this object itself is left unmodified.
 *
 * @return Diff A Diff object representing the inverse of this diff.
 */
public function reverse() {
    // Clone rather than alias $this: resetting the edits of an alias would
    // empty the very list being iterated below.
    $rev = clone $this;
    $rev->edits = [];

    /** @var DiffOp $edit */
    foreach ( $this->edits as $edit ) {
        $rev->edits[] = $edit->reverse();
    }

    return $rev;
}
/**
 * Check for empty diff.
 *
 * @return bool True if two sequences were identical, i.e. every edit is
 *   a 'copy'.
 */
public function isEmpty() {
    foreach ( $this->edits as $edit ) {
        if ( $edit->type !== 'copy' ) {
            return false;
        }
    }

    return true;
}
/**
 * Compute the length of the Longest Common Subsequence (LCS).
 *
 * This is mostly for diagnostic purposes.
 *
 * @return int The length of the LCS: the total number of lines covered
 *   by 'copy' edits.
 */
public function lcs() {
    $lcs = 0;
    foreach ( $this->edits as $edit ) {
        if ( $edit->type === 'copy' ) {
            $lcs += count( $edit->orig );
        }
    }

    return $lcs;
}
/**
 * Get the original set of lines.
 *
 * This reconstructs the $from_lines parameter passed to the
 * constructor by concatenating the "orig" side of every edit.
 *
 * @return string[] The original sequence of strings.
 */
public function orig() {
    $lines = [];

    foreach ( $this->edits as $edit ) {
        // Ops without a "from" side (additions) carry false here.
        if ( $edit->orig ) {
            array_splice( $lines, count( $lines ), 0, $edit->orig );
        }
    }

    return $lines;
}
/**
 * Get the closing set of lines.
 *
 * This reconstructs the $to_lines parameter passed to the
 * constructor by concatenating the "closing" side of every edit.
 *
 * @return string[] The sequence of strings.
 */
public function closing() {
    $lines = [];

    foreach ( $this->edits as $edit ) {
        // Ops without a "to" side (deletions) carry false here.
        if ( $edit->closing ) {
            array_splice( $lines, count( $lines ), 0, $edit->closing );
        }
    }

    return $lines;
}
/**
 * A Diff that is computed on one pair of sequences (the "mapped" lines) but
 * reports its edits in terms of another, parallel pair.
 *
 * @todo document, bad name.
 *
 * @ingroup DifferenceEngine
 */
class MappedDiff extends Diff
{
    /**
     * Computes diff between sequences of strings.
     *
     * This can be used to compute things like
     * case-insensitive diffs, or diffs which ignore
     * changes in white-space.
     *
     * @param string[] $from_lines An array of strings.
     *   Typically these are lines from a file.
     * @param string[] $to_lines An array of strings.
     * @param string[] $mapped_from_lines This array should
     *   have the same number of elements as $from_lines.
     *   The elements in $mapped_from_lines and
     *   $mapped_to_lines are what is actually compared
     *   when computing the diff.
     * @param string[] $mapped_to_lines This array should
     *   have the same number of elements as $to_lines.
     */
    public function __construct( $from_lines, $to_lines,
        $mapped_from_lines, $mapped_to_lines
    ) {
        assert( count( $from_lines ) == count( $mapped_from_lines ) );
        assert( count( $to_lines ) == count( $mapped_to_lines ) );

        parent::__construct( $mapped_from_lines, $mapped_to_lines );

        // Walk the edits in order and swap each run of mapped lines for the
        // run of real lines at the same position. $xi / $yi track how far
        // into $from_lines / $to_lines the previous edits have reached.
        $xi = 0;
        $yi = 0;
        foreach ( $this->edits as $edit ) {
            if ( is_array( $edit->orig ) ) {
                $edit->orig = array_slice( $from_lines, $xi, count( $edit->orig ) );
                $xi += count( $edit->orig );
            }

            if ( is_array( $edit->closing ) ) {
                $edit->closing = array_slice( $to_lines, $yi, count( $edit->closing ) );
                $yi += count( $edit->closing );
            }
        }
    }
}
614 * Additions by Axel Boldt follow, partly taken from diff.php, phpwiki-1.3.3
/**
 * Collects words tagged as unchanged, inserted ('ins') or deleted ('del')
 * and renders them as HTML-escaped lines, wrapping each run of inserted or
 * deleted words in an <ins> / <del> element.
 *
 * NOTE(review): the four private state properties, the resets of $group and
 * $line, the empty-word skip in addWords() and the return of getLines() were
 * not visible in the reviewed listing and are reconstructed -- confirm.
 *
 * @ingroup DifferenceEngine
 */
class HWLDFWordAccumulator
{
    public $insClass = ' class="diffchange diffchange-inline"';
    public $delClass = ' class="diffchange diffchange-inline"';

    /** @var string[] Finished output lines. */
    private $lines = [];

    /** @var string HTML of the line currently being built. */
    private $line = '';

    /** @var string Raw (unescaped) text of the pending run of words. */
    private $group = '';

    /** @var string Tag the pending run of words belongs to. */
    private $tag = '';

    /**
     * Append the pending run of words to the current line, escaped and
     * wrapped according to the current tag, then switch to $new_tag.
     *
     * @param string $new_tag
     */
    private function flushGroup( $new_tag ) {
        if ( $this->group !== '' ) {
            $escaped = htmlspecialchars( $this->group );
            if ( $this->tag == 'ins' ) {
                $this->line .= "<ins{$this->insClass}>" . $escaped . '</ins>';
            } elseif ( $this->tag == 'del' ) {
                $this->line .= "<del{$this->delClass}>" . $escaped . '</del>';
            } else {
                $this->line .= $escaped;
            }
        }

        $this->group = '';
        $this->tag = $new_tag;
    }

    /**
     * Flush the pending run of words and close the current line.
     *
     * @param string $new_tag
     */
    private function flushLine( $new_tag ) {
        $this->flushGroup( $new_tag );

        if ( $this->line != '' ) {
            array_push( $this->lines, $this->line );
        } else {
            # make empty lines visible by inserting an NBSP
            # (written as a character reference so it does not depend on
            # the encoding of this file)
            array_push( $this->lines, '&#160;' );
        }

        $this->line = '';
    }

    /**
     * Add a run of words that all carry the same tag.
     *
     * @param string[] $words
     * @param string $tag 'ins', 'del', or '' for unchanged text.
     */
    public function addWords( $words, $tag = '' ) {
        if ( $tag != $this->tag ) {
            $this->flushGroup( $tag );
        }

        foreach ( $words as $word ) {
            // new-line should only come as first char of word.
            if ( $word == '' ) {
                continue;
            }
            if ( $word[0] == "\n" ) {
                $this->flushLine( $tag );
                $word = substr( $word, 1 );
            }
            assert( strpos( (string)$word, "\n" ) === false );
            $this->group .= $word;
        }
    }

    /**
     * Close the line in progress and return everything accumulated.
     *
     * @return string[] The rendered lines.
     */
    public function getLines() {
        $this->flushLine( '~done' );

        return $this->lines;
    }
}
700 * @ingroup DifferenceEngine
702 class WordLevelDiff
extends MappedDiff
{
703 const MAX_LINE_LENGTH
= 10000;
/**
 * Split both sets of lines into words and diff them word by word.
 *
 * The stripped form of each word is what gets compared; the full word is
 * what the resulting edits carry.
 *
 * @param string[] $orig_lines
 * @param string[] $closing_lines
 */
public function __construct( $orig_lines, $closing_lines ) {
    list( $orig_words, $orig_stripped ) = $this->split( $orig_lines );
    list( $closing_words, $closing_stripped ) = $this->split( $closing_lines );

    parent::__construct(
        $orig_words, $closing_words,
        $orig_stripped, $closing_stripped
    );
}
/**
 * Break lines into words.
 *
 * NOTE(review): the initialisers, the line-separator handling ($first and
 * the "\n" entries) and the over-long-line branch body were not visible in
 * the reviewed listing and are reconstructed -- confirm.
 *
 * @param string[] $lines
 *
 * @return array[] Two parallel lists: [ $words, $stripped ]. $words holds
 *   each full regex match, $stripped holds its first capture group.
 */
private function split( $lines ) {
    $words = [];
    $stripped = [];
    $first = true;

    foreach ( $lines as $line ) {
        // Every line after the first is preceded by a "\n" entry.
        if ( $first ) {
            $first = false;
        } else {
            $words[] = "\n";
            $stripped[] = "\n";
        }

        # If the line is too long, just pretend the entire line is one big word
        # This prevents resource exhaustion problems
        if ( strlen( $line ) > self::MAX_LINE_LENGTH ) {
            $words[] = $line;
            $stripped[] = $line;
            continue;
        }

        // NOTE(review): "(?!< \n)" parses as a negative look-ahead for "<\n"
        // under /x; a negative look-behind "(?<! \n)" may have been intended.
        // The pattern is left untouched here.
        $m = [];
        if ( preg_match_all( '/ ( [^\S\n]+ | [0-9_A-Za-z\x80-\xff]+ | . ) (?: (?!< \n) [^\S\n])? /xs',
            $line, $m )
        ) {
            foreach ( $m[0] as $word ) {
                $words[] = $word;
            }
            foreach ( $m[1] as $stripped_word ) {
                $stripped[] = $stripped_word;
            }
        }
    }

    return [ $words, $stripped ];
}
/**
 * Render the original side as HTML lines: unchanged words are passed
 * through untagged, words from any other edit are tagged 'del'.
 *
 * @return string[] Lines as produced by HWLDFWordAccumulator::getLines().
 */
public function orig() {
    $orig = new HWLDFWordAccumulator();

    foreach ( $this->edits as $edit ) {
        if ( $edit->type == 'copy' ) {
            $orig->addWords( $edit->orig );
        } elseif ( $edit->orig ) {
            $orig->addWords( $edit->orig, 'del' );
        }
    }

    $lines = $orig->getLines();

    return $lines;
}
779 public function closing() {
780 $closing = new HWLDFWordAccumulator
;
782 foreach ( $this->edits
as $edit ) {
783 if ( $edit->type
== 'copy' ) {
784 $closing->addWords( $edit->closing
);
785 } elseif ( $edit->closing
) {
786 $closing->addWords( $edit->closing
, 'ins' );
789 $lines = $closing->getLines();