Per Tim Starling, follow-up r76252: move WikiDiff.php to DairikiDiff.php to not confu...
[lhc/web/wiklou.git] / includes / diff / DairikiDiff.php
1 <?php
2 /**
3 * A PHP diff engine for phpwiki. (Taken from phpwiki-1.3.3)
4 *
5 * Copyright © 2000, 2001 Geoffrey T. Dairiki <dairiki@dairiki.org>
6 * You may copy this code freely under the conditions of the GPL.
7 *
8 * @file
9 * @ingroup DifferenceEngine
10 * @defgroup DifferenceEngine DifferenceEngine
11 */
12
13 /**
14 * @todo document
15 * @private
16 * @ingroup DifferenceEngine
17 */
18 class _DiffOp {
19 var $type;
20 var $orig;
21 var $closing;
22
23 function reverse() {
24 trigger_error( 'pure virtual', E_USER_ERROR );
25 }
26
27 function norig() {
28 return $this->orig ? sizeof( $this->orig ) : 0;
29 }
30
31 function nclosing() {
32 return $this->closing ? sizeof( $this->closing ) : 0;
33 }
34 }
35
36 /**
37 * @todo document
38 * @private
39 * @ingroup DifferenceEngine
40 */
41 class _DiffOp_Copy extends _DiffOp {
42 var $type = 'copy';
43
44 function __construct ( $orig, $closing = false ) {
45 if ( !is_array( $closing ) )
46 $closing = $orig;
47 $this->orig = $orig;
48 $this->closing = $closing;
49 }
50
51 function reverse() {
52 return new _DiffOp_Copy( $this->closing, $this->orig );
53 }
54 }
55
56 /**
57 * @todo document
58 * @private
59 * @ingroup DifferenceEngine
60 */
61 class _DiffOp_Delete extends _DiffOp {
62 var $type = 'delete';
63
64 function __construct ( $lines ) {
65 $this->orig = $lines;
66 $this->closing = false;
67 }
68
69 function reverse() {
70 return new _DiffOp_Add( $this->orig );
71 }
72 }
73
74 /**
75 * @todo document
76 * @private
77 * @ingroup DifferenceEngine
78 */
79 class _DiffOp_Add extends _DiffOp {
80 var $type = 'add';
81
82 function __construct ( $lines ) {
83 $this->closing = $lines;
84 $this->orig = false;
85 }
86
87 function reverse() {
88 return new _DiffOp_Delete( $this->closing );
89 }
90 }
91
92 /**
93 * @todo document
94 * @private
95 * @ingroup DifferenceEngine
96 */
97 class _DiffOp_Change extends _DiffOp {
98 var $type = 'change';
99
100 function __construct ( $orig, $closing ) {
101 $this->orig = $orig;
102 $this->closing = $closing;
103 }
104
105 function reverse() {
106 return new _DiffOp_Change( $this->closing, $this->orig );
107 }
108 }
109
110 /**
111 * Class used internally by Diff to actually compute the diffs.
112 *
113 * The algorithm used here is mostly lifted from the perl module
114 * Algorithm::Diff (version 1.06) by Ned Konz, which is available at:
115 * http://www.perl.com/CPAN/authors/id/N/NE/NEDKONZ/Algorithm-Diff-1.06.zip
116 *
117 * More ideas are taken from:
118 * http://www.ics.uci.edu/~eppstein/161/960229.html
119 *
120 * Some ideas are (and a bit of code) are from from analyze.c, from GNU
121 * diffutils-2.7, which can be found at:
122 * ftp://gnudist.gnu.org/pub/gnu/diffutils/diffutils-2.7.tar.gz
123 *
124 * closingly, some ideas (subdivision by NCHUNKS > 2, and some optimizations)
125 * are my own.
126 *
127 * Line length limits for robustness added by Tim Starling, 2005-08-31
128 * Alternative implementation added by Guy Van den Broeck, 2008-07-30
129 *
130 * @author Geoffrey T. Dairiki, Tim Starling, Guy Van den Broeck
131 * @private
132 * @ingroup DifferenceEngine
133 */
134 class _DiffEngine {
135
136 const MAX_XREF_LENGTH = 10000;
137
138 protected $xchanged, $ychanged;
139
140 protected $xv = array(), $yv = array();
141 protected $xind = array(), $yind = array();
142
143 protected $seq = array(), $in_seq = array();
144
145 protected $lcs = 0;
146
147 function diff ( $from_lines, $to_lines ) {
148 wfProfileIn( __METHOD__ );
149
150 // Diff and store locally
151 $this->diff_local( $from_lines, $to_lines );
152
153 // Merge edits when possible
154 $this->_shift_boundaries( $from_lines, $this->xchanged, $this->ychanged );
155 $this->_shift_boundaries( $to_lines, $this->ychanged, $this->xchanged );
156
157 // Compute the edit operations.
158 $n_from = sizeof( $from_lines );
159 $n_to = sizeof( $to_lines );
160
161 $edits = array();
162 $xi = $yi = 0;
163 while ( $xi < $n_from || $yi < $n_to ) {
164 assert( $yi < $n_to || $this->xchanged[$xi] );
165 assert( $xi < $n_from || $this->ychanged[$yi] );
166
167 // Skip matching "snake".
168 $copy = array();
169 while ( $xi < $n_from && $yi < $n_to
170 && !$this->xchanged[$xi] && !$this->ychanged[$yi] ) {
171 $copy[] = $from_lines[$xi++];
172 ++$yi;
173 }
174 if ( $copy ) {
175 $edits[] = new _DiffOp_Copy( $copy );
176 }
177
178 // Find deletes & adds.
179 $delete = array();
180 while ( $xi < $n_from && $this->xchanged[$xi] ) {
181 $delete[] = $from_lines[$xi++];
182 }
183
184 $add = array();
185 while ( $yi < $n_to && $this->ychanged[$yi] ) {
186 $add[] = $to_lines[$yi++];
187 }
188
189 if ( $delete && $add ) {
190 $edits[] = new _DiffOp_Change( $delete, $add );
191 } elseif ( $delete ) {
192 $edits[] = new _DiffOp_Delete( $delete );
193 } elseif ( $add ) {
194 $edits[] = new _DiffOp_Add( $add );
195 }
196 }
197 wfProfileOut( __METHOD__ );
198 return $edits;
199 }
200
201 function diff_local ( $from_lines, $to_lines ) {
202 global $wgExternalDiffEngine;
203 wfProfileIn( __METHOD__ );
204
205 if ( $wgExternalDiffEngine == 'wikidiff3' ) {
206 // wikidiff3
207 $wikidiff3 = new WikiDiff3();
208 $wikidiff3->diff( $from_lines, $to_lines );
209 $this->xchanged = $wikidiff3->removed;
210 $this->ychanged = $wikidiff3->added;
211 unset( $wikidiff3 );
212 } else {
213 // old diff
214 $n_from = sizeof( $from_lines );
215 $n_to = sizeof( $to_lines );
216 $this->xchanged = $this->ychanged = array();
217 $this->xv = $this->yv = array();
218 $this->xind = $this->yind = array();
219 unset( $this->seq );
220 unset( $this->in_seq );
221 unset( $this->lcs );
222
223 // Skip leading common lines.
224 for ( $skip = 0; $skip < $n_from && $skip < $n_to; $skip++ ) {
225 if ( $from_lines[$skip] !== $to_lines[$skip] )
226 break;
227 $this->xchanged[$skip] = $this->ychanged[$skip] = false;
228 }
229 // Skip trailing common lines.
230 $xi = $n_from; $yi = $n_to;
231 for ( $endskip = 0; --$xi > $skip && --$yi > $skip; $endskip++ ) {
232 if ( $from_lines[$xi] !== $to_lines[$yi] )
233 break;
234 $this->xchanged[$xi] = $this->ychanged[$yi] = false;
235 }
236
237 // Ignore lines which do not exist in both files.
238 for ( $xi = $skip; $xi < $n_from - $endskip; $xi++ ) {
239 $xhash[$this->_line_hash( $from_lines[$xi] )] = 1;
240 }
241
242 for ( $yi = $skip; $yi < $n_to - $endskip; $yi++ ) {
243 $line = $to_lines[$yi];
244 if ( ( $this->ychanged[$yi] = empty( $xhash[$this->_line_hash( $line )] ) ) )
245 continue;
246 $yhash[$this->_line_hash( $line )] = 1;
247 $this->yv[] = $line;
248 $this->yind[] = $yi;
249 }
250 for ( $xi = $skip; $xi < $n_from - $endskip; $xi++ ) {
251 $line = $from_lines[$xi];
252 if ( ( $this->xchanged[$xi] = empty( $yhash[$this->_line_hash( $line )] ) ) )
253 continue;
254 $this->xv[] = $line;
255 $this->xind[] = $xi;
256 }
257
258 // Find the LCS.
259 $this->_compareseq( 0, sizeof( $this->xv ), 0, sizeof( $this->yv ) );
260 }
261 wfProfileOut( __METHOD__ );
262 }
263
264 /**
265 * Returns the whole line if it's small enough, or the MD5 hash otherwise
266 */
267 function _line_hash( $line ) {
268 if ( strlen( $line ) > self::MAX_XREF_LENGTH ) {
269 return md5( $line );
270 } else {
271 return $line;
272 }
273 }
274
275 /* Divide the Largest Common Subsequence (LCS) of the sequences
276 * [XOFF, XLIM) and [YOFF, YLIM) into NCHUNKS approximately equally
277 * sized segments.
278 *
279 * Returns (LCS, PTS). LCS is the length of the LCS. PTS is an
280 * array of NCHUNKS+1 (X, Y) indexes giving the diving points between
281 * sub sequences. The first sub-sequence is contained in [X0, X1),
282 * [Y0, Y1), the second in [X1, X2), [Y1, Y2) and so on. Note
283 * that (X0, Y0) == (XOFF, YOFF) and
284 * (X[NCHUNKS], Y[NCHUNKS]) == (XLIM, YLIM).
285 *
286 * This function assumes that the first lines of the specified portions
287 * of the two files do not match, and likewise that the last lines do not
288 * match. The caller must trim matching lines from the beginning and end
289 * of the portions it is going to specify.
290 */
291 function _diag ( $xoff, $xlim, $yoff, $ylim, $nchunks ) {
292 $flip = false;
293
294 if ( $xlim - $xoff > $ylim - $yoff ) {
295 // Things seems faster (I'm not sure I understand why)
296 // when the shortest sequence in X.
297 $flip = true;
298 list ( $xoff, $xlim, $yoff, $ylim ) = array( $yoff, $ylim, $xoff, $xlim );
299 }
300
301 if ( $flip ) {
302 for ( $i = $ylim - 1; $i >= $yoff; $i-- ) {
303 $ymatches[$this->xv[$i]][] = $i;
304 }
305 } else {
306 for ( $i = $ylim - 1; $i >= $yoff; $i-- ) {
307 $ymatches[$this->yv[$i]][] = $i;
308 }
309 }
310
311 $this->lcs = 0;
312 $this->seq[0] = $yoff - 1;
313 $this->in_seq = array();
314 $ymids[0] = array();
315
316 $numer = $xlim - $xoff + $nchunks - 1;
317 $x = $xoff;
318 for ( $chunk = 0; $chunk < $nchunks; $chunk++ ) {
319 if ( $chunk > 0 ) {
320 for ( $i = 0; $i <= $this->lcs; $i++ ) {
321 $ymids[$i][$chunk -1] = $this->seq[$i];
322 }
323 }
324
325 $x1 = $xoff + (int)( ( $numer + ( $xlim -$xoff ) * $chunk ) / $nchunks );
326 for ( ; $x < $x1; $x++ ) {
327 $line = $flip ? $this->yv[$x] : $this->xv[$x];
328 if ( empty( $ymatches[$line] ) ) {
329 continue;
330 }
331 $matches = $ymatches[$line];
332 reset( $matches );
333 while ( list( , $y ) = each( $matches ) ) {
334 if ( empty( $this->in_seq[$y] ) ) {
335 $k = $this->_lcs_pos( $y );
336 assert( $k > 0 );
337 $ymids[$k] = $ymids[$k -1];
338 break;
339 }
340 }
341 while ( list ( , $y ) = each( $matches ) ) {
342 if ( $y > $this->seq[$k -1] ) {
343 assert( $y < $this->seq[$k] );
344 // Optimization: this is a common case:
345 // next match is just replacing previous match.
346 $this->in_seq[$this->seq[$k]] = false;
347 $this->seq[$k] = $y;
348 $this->in_seq[$y] = 1;
349 } else if ( empty( $this->in_seq[$y] ) ) {
350 $k = $this->_lcs_pos( $y );
351 assert( $k > 0 );
352 $ymids[$k] = $ymids[$k -1];
353 }
354 }
355 }
356 }
357
358 $seps[] = $flip ? array( $yoff, $xoff ) : array( $xoff, $yoff );
359 $ymid = $ymids[$this->lcs];
360 for ( $n = 0; $n < $nchunks - 1; $n++ ) {
361 $x1 = $xoff + (int)( ( $numer + ( $xlim - $xoff ) * $n ) / $nchunks );
362 $y1 = $ymid[$n] + 1;
363 $seps[] = $flip ? array( $y1, $x1 ) : array( $x1, $y1 );
364 }
365 $seps[] = $flip ? array( $ylim, $xlim ) : array( $xlim, $ylim );
366
367 return array( $this->lcs, $seps );
368 }
369
370 function _lcs_pos ( $ypos ) {
371 $end = $this->lcs;
372 if ( $end == 0 || $ypos > $this->seq[$end] ) {
373 $this->seq[++$this->lcs] = $ypos;
374 $this->in_seq[$ypos] = 1;
375 return $this->lcs;
376 }
377
378 $beg = 1;
379 while ( $beg < $end ) {
380 $mid = (int)( ( $beg + $end ) / 2 );
381 if ( $ypos > $this->seq[$mid] )
382 $beg = $mid + 1;
383 else
384 $end = $mid;
385 }
386
387 assert( $ypos != $this->seq[$end] );
388
389 $this->in_seq[$this->seq[$end]] = false;
390 $this->seq[$end] = $ypos;
391 $this->in_seq[$ypos] = 1;
392 return $end;
393 }
394
395 /* Find LCS of two sequences.
396 *
397 * The results are recorded in the vectors $this->{x,y}changed[], by
398 * storing a 1 in the element for each line that is an insertion
399 * or deletion (ie. is not in the LCS).
400 *
401 * The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
402 *
403 * Note that XLIM, YLIM are exclusive bounds.
404 * All line numbers are origin-0 and discarded lines are not counted.
405 */
406 function _compareseq ( $xoff, $xlim, $yoff, $ylim ) {
407 // Slide down the bottom initial diagonal.
408 while ( $xoff < $xlim && $yoff < $ylim
409 && $this->xv[$xoff] == $this->yv[$yoff] ) {
410 ++$xoff;
411 ++$yoff;
412 }
413
414 // Slide up the top initial diagonal.
415 while ( $xlim > $xoff && $ylim > $yoff
416 && $this->xv[$xlim - 1] == $this->yv[$ylim - 1] ) {
417 --$xlim;
418 --$ylim;
419 }
420
421 if ( $xoff == $xlim || $yoff == $ylim ) {
422 $lcs = 0;
423 } else {
424 // This is ad hoc but seems to work well.
425 // $nchunks = sqrt(min($xlim - $xoff, $ylim - $yoff) / 2.5);
426 // $nchunks = max(2,min(8,(int)$nchunks));
427 $nchunks = min( 7, $xlim - $xoff, $ylim - $yoff ) + 1;
428 list ( $lcs, $seps )
429 = $this->_diag( $xoff, $xlim, $yoff, $ylim, $nchunks );
430 }
431
432 if ( $lcs == 0 ) {
433 // X and Y sequences have no common subsequence:
434 // mark all changed.
435 while ( $yoff < $ylim ) {
436 $this->ychanged[$this->yind[$yoff++]] = 1;
437 }
438 while ( $xoff < $xlim ) {
439 $this->xchanged[$this->xind[$xoff++]] = 1;
440 }
441 } else {
442 // Use the partitions to split this problem into subproblems.
443 reset( $seps );
444 $pt1 = $seps[0];
445 while ( $pt2 = next( $seps ) ) {
446 $this->_compareseq ( $pt1[0], $pt2[0], $pt1[1], $pt2[1] );
447 $pt1 = $pt2;
448 }
449 }
450 }
451
452 /* Adjust inserts/deletes of identical lines to join changes
453 * as much as possible.
454 *
455 * We do something when a run of changed lines include a
456 * line at one end and has an excluded, identical line at the other.
457 * We are free to choose which identical line is included.
458 * `compareseq' usually chooses the one at the beginning,
459 * but usually it is cleaner to consider the following identical line
460 * to be the "change".
461 *
462 * This is extracted verbatim from analyze.c (GNU diffutils-2.7).
463 */
464 function _shift_boundaries ( $lines, &$changed, $other_changed ) {
465 wfProfileIn( __METHOD__ );
466 $i = 0;
467 $j = 0;
468
469 assert( 'sizeof($lines) == sizeof($changed)' );
470 $len = sizeof( $lines );
471 $other_len = sizeof( $other_changed );
472
473 while ( 1 ) {
474 /*
475 * Scan forwards to find beginning of another run of changes.
476 * Also keep track of the corresponding point in the other file.
477 *
478 * Throughout this code, $i and $j are adjusted together so that
479 * the first $i elements of $changed and the first $j elements
480 * of $other_changed both contain the same number of zeros
481 * (unchanged lines).
482 * Furthermore, $j is always kept so that $j == $other_len or
483 * $other_changed[$j] == false.
484 */
485 while ( $j < $other_len && $other_changed[$j] ) {
486 $j++;
487 }
488
489 while ( $i < $len && ! $changed[$i] ) {
490 assert( '$j < $other_len && ! $other_changed[$j]' );
491 $i++; $j++;
492 while ( $j < $other_len && $other_changed[$j] )
493 $j++;
494 }
495
496 if ( $i == $len ) {
497 break;
498 }
499
500 $start = $i;
501
502 // Find the end of this run of changes.
503 while ( ++$i < $len && $changed[$i] ) {
504 continue;
505 }
506
507 do {
508 /*
509 * Record the length of this run of changes, so that
510 * we can later determine whether the run has grown.
511 */
512 $runlength = $i - $start;
513
514 /*
515 * Move the changed region back, so long as the
516 * previous unchanged line matches the last changed one.
517 * This merges with previous changed regions.
518 */
519 while ( $start > 0 && $lines[$start - 1] == $lines[$i - 1] ) {
520 $changed[--$start] = 1;
521 $changed[--$i] = false;
522 while ( $start > 0 && $changed[$start - 1] ) {
523 $start--;
524 }
525 assert( '$j > 0' );
526 while ( $other_changed[--$j] ) {
527 continue;
528 }
529 assert( '$j >= 0 && !$other_changed[$j]' );
530 }
531
532 /*
533 * Set CORRESPONDING to the end of the changed run, at the last
534 * point where it corresponds to a changed run in the other file.
535 * CORRESPONDING == LEN means no such point has been found.
536 */
537 $corresponding = $j < $other_len ? $i : $len;
538
539 /*
540 * Move the changed region forward, so long as the
541 * first changed line matches the following unchanged one.
542 * This merges with following changed regions.
543 * Do this second, so that if there are no merges,
544 * the changed region is moved forward as far as possible.
545 */
546 while ( $i < $len && $lines[$start] == $lines[$i] ) {
547 $changed[$start++] = false;
548 $changed[$i++] = 1;
549 while ( $i < $len && $changed[$i] ) {
550 $i++;
551 }
552
553 assert( '$j < $other_len && ! $other_changed[$j]' );
554 $j++;
555 if ( $j < $other_len && $other_changed[$j] ) {
556 $corresponding = $i;
557 while ( $j < $other_len && $other_changed[$j] ) {
558 $j++;
559 }
560 }
561 }
562 } while ( $runlength != $i - $start );
563
564 /*
565 * If possible, move the fully-merged run of changes
566 * back to a corresponding run in the other file.
567 */
568 while ( $corresponding < $i ) {
569 $changed[--$start] = 1;
570 $changed[--$i] = 0;
571 assert( '$j > 0' );
572 while ( $other_changed[--$j] ) {
573 continue;
574 }
575 assert( '$j >= 0 && !$other_changed[$j]' );
576 }
577 }
578 wfProfileOut( __METHOD__ );
579 }
580 }
581
582 /**
583 * Class representing a 'diff' between two sequences of strings.
584 * @todo document
585 * @private
586 * @ingroup DifferenceEngine
587 */
588 class Diff
589 {
590 var $edits;
591
592 /**
593 * Constructor.
594 * Computes diff between sequences of strings.
595 *
596 * @param $from_lines array An array of strings.
597 * (Typically these are lines from a file.)
598 * @param $to_lines array An array of strings.
599 */
600 function __construct( $from_lines, $to_lines ) {
601 $eng = new _DiffEngine;
602 $this->edits = $eng->diff( $from_lines, $to_lines );
603 // $this->_check($from_lines, $to_lines);
604 }
605
606 /**
607 * Compute reversed Diff.
608 *
609 * SYNOPSIS:
610 *
611 * $diff = new Diff($lines1, $lines2);
612 * $rev = $diff->reverse();
613 * @return object A Diff object representing the inverse of the
614 * original diff.
615 */
616 function reverse () {
617 $rev = $this;
618 $rev->edits = array();
619 foreach ( $this->edits as $edit ) {
620 $rev->edits[] = $edit->reverse();
621 }
622 return $rev;
623 }
624
625 /**
626 * Check for empty diff.
627 *
628 * @return bool True iff two sequences were identical.
629 */
630 function isEmpty () {
631 foreach ( $this->edits as $edit ) {
632 if ( $edit->type != 'copy' ) {
633 return false;
634 }
635 }
636 return true;
637 }
638
639 /**
640 * Compute the length of the Longest Common Subsequence (LCS).
641 *
642 * This is mostly for diagnostic purposed.
643 *
644 * @return int The length of the LCS.
645 */
646 function lcs () {
647 $lcs = 0;
648 foreach ( $this->edits as $edit ) {
649 if ( $edit->type == 'copy' ) {
650 $lcs += sizeof( $edit->orig );
651 }
652 }
653 return $lcs;
654 }
655
656 /**
657 * Get the original set of lines.
658 *
659 * This reconstructs the $from_lines parameter passed to the
660 * constructor.
661 *
662 * @return array The original sequence of strings.
663 */
664 function orig() {
665 $lines = array();
666
667 foreach ( $this->edits as $edit ) {
668 if ( $edit->orig ) {
669 array_splice( $lines, sizeof( $lines ), 0, $edit->orig );
670 }
671 }
672 return $lines;
673 }
674
675 /**
676 * Get the closing set of lines.
677 *
678 * This reconstructs the $to_lines parameter passed to the
679 * constructor.
680 *
681 * @return array The sequence of strings.
682 */
683 function closing() {
684 $lines = array();
685
686 foreach ( $this->edits as $edit ) {
687 if ( $edit->closing ) {
688 array_splice( $lines, sizeof( $lines ), 0, $edit->closing );
689 }
690 }
691 return $lines;
692 }
693
694 /**
695 * Check a Diff for validity.
696 *
697 * This is here only for debugging purposes.
698 */
699 function _check ( $from_lines, $to_lines ) {
700 wfProfileIn( __METHOD__ );
701 if ( serialize( $from_lines ) != serialize( $this->orig() ) ) {
702 trigger_error( "Reconstructed original doesn't match", E_USER_ERROR );
703 }
704 if ( serialize( $to_lines ) != serialize( $this->closing() ) ) {
705 trigger_error( "Reconstructed closing doesn't match", E_USER_ERROR );
706 }
707
708 $rev = $this->reverse();
709 if ( serialize( $to_lines ) != serialize( $rev->orig() ) ) {
710 trigger_error( "Reversed original doesn't match", E_USER_ERROR );
711 }
712 if ( serialize( $from_lines ) != serialize( $rev->closing() ) ) {
713 trigger_error( "Reversed closing doesn't match", E_USER_ERROR );
714 }
715
716
717 $prevtype = 'none';
718 foreach ( $this->edits as $edit ) {
719 if ( $prevtype == $edit->type ) {
720 trigger_error( "Edit sequence is non-optimal", E_USER_ERROR );
721 }
722 $prevtype = $edit->type;
723 }
724
725 $lcs = $this->lcs();
726 trigger_error( 'Diff okay: LCS = ' . $lcs, E_USER_NOTICE );
727 wfProfileOut( __METHOD__ );
728 }
729 }
730
731 /**
732 * @todo document, bad name.
733 * @private
734 * @ingroup DifferenceEngine
735 */
736 class MappedDiff extends Diff
737 {
738 /**
739 * Constructor.
740 *
741 * Computes diff between sequences of strings.
742 *
743 * This can be used to compute things like
744 * case-insensitve diffs, or diffs which ignore
745 * changes in white-space.
746 *
747 * @param $from_lines array An array of strings.
748 * (Typically these are lines from a file.)
749 *
750 * @param $to_lines array An array of strings.
751 *
752 * @param $mapped_from_lines array This array should
753 * have the same size number of elements as $from_lines.
754 * The elements in $mapped_from_lines and
755 * $mapped_to_lines are what is actually compared
756 * when computing the diff.
757 *
758 * @param $mapped_to_lines array This array should
759 * have the same number of elements as $to_lines.
760 */
761 function __construct( $from_lines, $to_lines,
762 $mapped_from_lines, $mapped_to_lines ) {
763 wfProfileIn( __METHOD__ );
764
765 assert( sizeof( $from_lines ) == sizeof( $mapped_from_lines ) );
766 assert( sizeof( $to_lines ) == sizeof( $mapped_to_lines ) );
767
768 parent::__construct( $mapped_from_lines, $mapped_to_lines );
769
770 $xi = $yi = 0;
771 for ( $i = 0; $i < sizeof( $this->edits ); $i++ ) {
772 $orig = &$this->edits[$i]->orig;
773 if ( is_array( $orig ) ) {
774 $orig = array_slice( $from_lines, $xi, sizeof( $orig ) );
775 $xi += sizeof( $orig );
776 }
777
778 $closing = &$this->edits[$i]->closing;
779 if ( is_array( $closing ) ) {
780 $closing = array_slice( $to_lines, $yi, sizeof( $closing ) );
781 $yi += sizeof( $closing );
782 }
783 }
784 wfProfileOut( __METHOD__ );
785 }
786 }
787
788 /**
789 * A class to format Diffs
790 *
791 * This class formats the diff in classic diff format.
792 * It is intended that this class be customized via inheritance,
793 * to obtain fancier outputs.
794 * @todo document
795 * @private
796 * @ingroup DifferenceEngine
797 */
798 class DiffFormatter {
799 /**
800 * Number of leading context "lines" to preserve.
801 *
802 * This should be left at zero for this class, but subclasses
803 * may want to set this to other values.
804 */
805 var $leading_context_lines = 0;
806
807 /**
808 * Number of trailing context "lines" to preserve.
809 *
810 * This should be left at zero for this class, but subclasses
811 * may want to set this to other values.
812 */
813 var $trailing_context_lines = 0;
814
815 /**
816 * Format a diff.
817 *
818 * @param $diff Diff A Diff object.
819 * @return string The formatted output.
820 */
821 function format( $diff ) {
822 wfProfileIn( __METHOD__ );
823
824 $xi = $yi = 1;
825 $block = false;
826 $context = array();
827
828 $nlead = $this->leading_context_lines;
829 $ntrail = $this->trailing_context_lines;
830
831 $this->_start_diff();
832
833 foreach ( $diff->edits as $edit ) {
834 if ( $edit->type == 'copy' ) {
835 if ( is_array( $block ) ) {
836 if ( sizeof( $edit->orig ) <= $nlead + $ntrail ) {
837 $block[] = $edit;
838 }
839 else {
840 if ( $ntrail ) {
841 $context = array_slice( $edit->orig, 0, $ntrail );
842 $block[] = new _DiffOp_Copy( $context );
843 }
844 $this->_block( $x0, $ntrail + $xi - $x0,
845 $y0, $ntrail + $yi - $y0,
846 $block );
847 $block = false;
848 }
849 }
850 $context = $edit->orig;
851 } else {
852 if ( ! is_array( $block ) ) {
853 $context = array_slice( $context, sizeof( $context ) - $nlead );
854 $x0 = $xi - sizeof( $context );
855 $y0 = $yi - sizeof( $context );
856 $block = array();
857 if ( $context ) {
858 $block[] = new _DiffOp_Copy( $context );
859 }
860 }
861 $block[] = $edit;
862 }
863
864 if ( $edit->orig ) {
865 $xi += sizeof( $edit->orig );
866 }
867 if ( $edit->closing ) {
868 $yi += sizeof( $edit->closing );
869 }
870 }
871
872 if ( is_array( $block ) ) {
873 $this->_block( $x0, $xi - $x0,
874 $y0, $yi - $y0,
875 $block );
876 }
877
878 $end = $this->_end_diff();
879 wfProfileOut( __METHOD__ );
880 return $end;
881 }
882
883 function _block( $xbeg, $xlen, $ybeg, $ylen, &$edits ) {
884 wfProfileIn( __METHOD__ );
885 $this->_start_block( $this->_block_header( $xbeg, $xlen, $ybeg, $ylen ) );
886 foreach ( $edits as $edit ) {
887 if ( $edit->type == 'copy' ) {
888 $this->_context( $edit->orig );
889 } elseif ( $edit->type == 'add' ) {
890 $this->_added( $edit->closing );
891 } elseif ( $edit->type == 'delete' ) {
892 $this->_deleted( $edit->orig );
893 } elseif ( $edit->type == 'change' ) {
894 $this->_changed( $edit->orig, $edit->closing );
895 } else {
896 trigger_error( 'Unknown edit type', E_USER_ERROR );
897 }
898 }
899 $this->_end_block();
900 wfProfileOut( __METHOD__ );
901 }
902
903 function _start_diff() {
904 ob_start();
905 }
906
907 function _end_diff() {
908 $val = ob_get_contents();
909 ob_end_clean();
910 return $val;
911 }
912
913 function _block_header( $xbeg, $xlen, $ybeg, $ylen ) {
914 if ( $xlen > 1 ) {
915 $xbeg .= "," . ( $xbeg + $xlen - 1 );
916 }
917 if ( $ylen > 1 ) {
918 $ybeg .= "," . ( $ybeg + $ylen - 1 );
919 }
920
921 return $xbeg . ( $xlen ? ( $ylen ? 'c' : 'd' ) : 'a' ) . $ybeg;
922 }
923
924 function _start_block( $header ) {
925 echo $header . "\n";
926 }
927
928 function _end_block() {
929 }
930
931 function _lines( $lines, $prefix = ' ' ) {
932 foreach ( $lines as $line ) {
933 echo "$prefix $line\n";
934 }
935 }
936
937 function _context( $lines ) {
938 $this->_lines( $lines );
939 }
940
941 function _added( $lines ) {
942 $this->_lines( $lines, '>' );
943 }
944 function _deleted( $lines ) {
945 $this->_lines( $lines, '<' );
946 }
947
948 function _changed( $orig, $closing ) {
949 $this->_deleted( $orig );
950 echo "---\n";
951 $this->_added( $closing );
952 }
953 }
954
955 /**
956 * A formatter that outputs unified diffs
957 * @ingroup DifferenceEngine
958 */
959
960 class UnifiedDiffFormatter extends DiffFormatter {
961 var $leading_context_lines = 2;
962 var $trailing_context_lines = 2;
963
964 function _added( $lines ) {
965 $this->_lines( $lines, '+' );
966 }
967 function _deleted( $lines ) {
968 $this->_lines( $lines, '-' );
969 }
970 function _changed( $orig, $closing ) {
971 $this->_deleted( $orig );
972 $this->_added( $closing );
973 }
974 function _block_header( $xbeg, $xlen, $ybeg, $ylen ) {
975 return "@@ -$xbeg,$xlen +$ybeg,$ylen @@";
976 }
977 }
978
979 /**
980 * A pseudo-formatter that just passes along the Diff::$edits array
981 * @ingroup DifferenceEngine
982 */
983 class ArrayDiffFormatter extends DiffFormatter {
984 function format( $diff ) {
985 $oldline = 1;
986 $newline = 1;
987 $retval = array();
988 foreach ( $diff->edits as $edit ) {
989 switch( $edit->type ) {
990 case 'add':
991 foreach ( $edit->closing as $l ) {
992 $retval[] = array(
993 'action' => 'add',
994 'new' => $l,
995 'newline' => $newline++
996 );
997 }
998 break;
999 case 'delete':
1000 foreach ( $edit->orig as $l ) {
1001 $retval[] = array(
1002 'action' => 'delete',
1003 'old' => $l,
1004 'oldline' => $oldline++,
1005 );
1006 }
1007 break;
1008 case 'change':
1009 foreach ( $edit->orig as $i => $l ) {
1010 $retval[] = array(
1011 'action' => 'change',
1012 'old' => $l,
1013 'new' => @$edit->closing[$i],
1014 'oldline' => $oldline++,
1015 'newline' => $newline++,
1016 );
1017 }
1018 break;
1019 case 'copy':
1020 $oldline += count( $edit->orig );
1021 $newline += count( $edit->orig );
1022 }
1023 }
1024 return $retval;
1025 }
1026 }
1027
1028 /**
1029 * Additions by Axel Boldt follow, partly taken from diff.php, phpwiki-1.3.3
1030 *
1031 */
1032
1033 define( 'NBSP', '&#160;' ); // iso-8859-x non-breaking space.
1034
1035 /**
1036 * @todo document
1037 * @private
1038 * @ingroup DifferenceEngine
1039 */
1040 class _HWLDF_WordAccumulator {
1041 function __construct () {
1042 $this->_lines = array();
1043 $this->_line = '';
1044 $this->_group = '';
1045 $this->_tag = '';
1046 }
1047
1048 function _flushGroup ( $new_tag ) {
1049 if ( $this->_group !== '' ) {
1050 if ( $this->_tag == 'ins' ) {
1051 $this->_line .= '<ins class="diffchange diffchange-inline">' .
1052 htmlspecialchars ( $this->_group ) . '</ins>';
1053 } elseif ( $this->_tag == 'del' ) {
1054 $this->_line .= '<del class="diffchange diffchange-inline">' .
1055 htmlspecialchars ( $this->_group ) . '</del>';
1056 } else {
1057 $this->_line .= htmlspecialchars ( $this->_group );
1058 }
1059 }
1060 $this->_group = '';
1061 $this->_tag = $new_tag;
1062 }
1063
1064 function _flushLine ( $new_tag ) {
1065 $this->_flushGroup( $new_tag );
1066 if ( $this->_line != '' ) {
1067 array_push ( $this->_lines, $this->_line );
1068 } else {
1069 # make empty lines visible by inserting an NBSP
1070 array_push ( $this->_lines, NBSP );
1071 }
1072 $this->_line = '';
1073 }
1074
1075 function addWords ( $words, $tag = '' ) {
1076 if ( $tag != $this->_tag ) {
1077 $this->_flushGroup( $tag );
1078 }
1079
1080 foreach ( $words as $word ) {
1081 // new-line should only come as first char of word.
1082 if ( $word == '' ) {
1083 continue;
1084 }
1085 if ( $word[0] == "\n" ) {
1086 $this->_flushLine( $tag );
1087 $word = substr( $word, 1 );
1088 }
1089 assert( !strstr( $word, "\n" ) );
1090 $this->_group .= $word;
1091 }
1092 }
1093
1094 function getLines() {
1095 $this->_flushLine( '~done' );
1096 return $this->_lines;
1097 }
1098 }
1099
1100 /**
1101 * @todo document
1102 * @private
1103 * @ingroup DifferenceEngine
1104 */
1105 class WordLevelDiff extends MappedDiff {
1106 const MAX_LINE_LENGTH = 10000;
1107
1108 function __construct ( $orig_lines, $closing_lines ) {
1109 wfProfileIn( __METHOD__ );
1110
1111 list ( $orig_words, $orig_stripped ) = $this->_split( $orig_lines );
1112 list ( $closing_words, $closing_stripped ) = $this->_split( $closing_lines );
1113
1114 parent::__construct( $orig_words, $closing_words,
1115 $orig_stripped, $closing_stripped );
1116 wfProfileOut( __METHOD__ );
1117 }
1118
1119 function _split( $lines ) {
1120 wfProfileIn( __METHOD__ );
1121
1122 $words = array();
1123 $stripped = array();
1124 $first = true;
1125 foreach ( $lines as $line ) {
1126 # If the line is too long, just pretend the entire line is one big word
1127 # This prevents resource exhaustion problems
1128 if ( $first ) {
1129 $first = false;
1130 } else {
1131 $words[] = "\n";
1132 $stripped[] = "\n";
1133 }
1134 if ( strlen( $line ) > self::MAX_LINE_LENGTH ) {
1135 $words[] = $line;
1136 $stripped[] = $line;
1137 } else {
1138 $m = array();
1139 if ( preg_match_all( '/ ( [^\S\n]+ | [0-9_A-Za-z\x80-\xff]+ | . ) (?: (?!< \n) [^\S\n])? /xs',
1140 $line, $m ) )
1141 {
1142 $words = array_merge( $words, $m[0] );
1143 $stripped = array_merge( $stripped, $m[1] );
1144 }
1145 }
1146 }
1147 wfProfileOut( __METHOD__ );
1148 return array( $words, $stripped );
1149 }
1150
1151 function orig () {
1152 wfProfileIn( __METHOD__ );
1153 $orig = new _HWLDF_WordAccumulator;
1154
1155 foreach ( $this->edits as $edit ) {
1156 if ( $edit->type == 'copy' ) {
1157 $orig->addWords( $edit->orig );
1158 } elseif ( $edit->orig ) {
1159 $orig->addWords( $edit->orig, 'del' );
1160 }
1161 }
1162 $lines = $orig->getLines();
1163 wfProfileOut( __METHOD__ );
1164 return $lines;
1165 }
1166
1167 function closing () {
1168 wfProfileIn( __METHOD__ );
1169 $closing = new _HWLDF_WordAccumulator;
1170
1171 foreach ( $this->edits as $edit ) {
1172 if ( $edit->type == 'copy' ) {
1173 $closing->addWords( $edit->closing );
1174 } elseif ( $edit->closing ) {
1175 $closing->addWords( $edit->closing, 'ins' );
1176 }
1177 }
1178 $lines = $closing->getLines();
1179 wfProfileOut( __METHOD__ );
1180 return $lines;
1181 }
1182 }
1183
1184 /**
1185 * Wikipedia Table style diff formatter.
1186 * @todo document
1187 * @private
1188 * @ingroup DifferenceEngine
1189 */
1190 class TableDiffFormatter extends DiffFormatter {
1191 function __construct() {
1192 $this->leading_context_lines = 2;
1193 $this->trailing_context_lines = 2;
1194 }
1195
1196 public static function escapeWhiteSpace( $msg ) {
1197 $msg = preg_replace( '/^ /m', '&#160; ', $msg );
1198 $msg = preg_replace( '/ $/m', ' &#160;', $msg );
1199 $msg = preg_replace( '/ /', '&#160; ', $msg );
1200 return $msg;
1201 }
1202
1203 function _block_header( $xbeg, $xlen, $ybeg, $ylen ) {
1204 $r = '<tr><td colspan="2" class="diff-lineno"><!--LINE ' . $xbeg . "--></td>\n" .
1205 '<td colspan="2" class="diff-lineno"><!--LINE ' . $ybeg . "--></td></tr>\n";
1206 return $r;
1207 }
1208
1209 function _start_block( $header ) {
1210 echo $header;
1211 }
1212
1213 function _end_block() {
1214 }
1215
1216 function _lines( $lines, $prefix = ' ', $color = 'white' ) {
1217 }
1218
1219 # HTML-escape parameter before calling this
1220 function addedLine( $line ) {
1221 return $this->wrapLine( '+', 'diff-addedline', $line );
1222 }
1223
1224 # HTML-escape parameter before calling this
1225 function deletedLine( $line ) {
1226 return $this->wrapLine( '−', 'diff-deletedline', $line );
1227 }
1228
1229 # HTML-escape parameter before calling this
1230 function contextLine( $line ) {
1231 return $this->wrapLine( '&#160;', 'diff-context', $line );
1232 }
1233
1234 private function wrapLine( $marker, $class, $line ) {
1235 if ( $line !== '' ) {
1236 // The <div> wrapper is needed for 'overflow: auto' style to scroll properly
1237 $line = Xml::tags( 'div', null, $this->escapeWhiteSpace( $line ) );
1238 }
1239 return "<td class='diff-marker'>$marker</td><td class='$class'>$line</td>";
1240 }
1241
1242 function emptyLine() {
1243 return '<td colspan="2">&#160;</td>';
1244 }
1245
1246 function _added( $lines ) {
1247 foreach ( $lines as $line ) {
1248 echo '<tr>' . $this->emptyLine() .
1249 $this->addedLine( '<ins class="diffchange">' .
1250 htmlspecialchars ( $line ) . '</ins>' ) . "</tr>\n";
1251 }
1252 }
1253
1254 function _deleted( $lines ) {
1255 foreach ( $lines as $line ) {
1256 echo '<tr>' . $this->deletedLine( '<del class="diffchange">' .
1257 htmlspecialchars ( $line ) . '</del>' ) .
1258 $this->emptyLine() . "</tr>\n";
1259 }
1260 }
1261
1262 function _context( $lines ) {
1263 foreach ( $lines as $line ) {
1264 echo '<tr>' .
1265 $this->contextLine( htmlspecialchars ( $line ) ) .
1266 $this->contextLine( htmlspecialchars ( $line ) ) . "</tr>\n";
1267 }
1268 }
1269
1270 function _changed( $orig, $closing ) {
1271 wfProfileIn( __METHOD__ );
1272
1273 $diff = new WordLevelDiff( $orig, $closing );
1274 $del = $diff->orig();
1275 $add = $diff->closing();
1276
1277 # Notice that WordLevelDiff returns HTML-escaped output.
1278 # Hence, we will be calling addedLine/deletedLine without HTML-escaping.
1279
1280 while ( $line = array_shift( $del ) ) {
1281 $aline = array_shift( $add );
1282 echo '<tr>' . $this->deletedLine( $line ) .
1283 $this->addedLine( $aline ) . "</tr>\n";
1284 }
1285 foreach ( $add as $line ) { # If any leftovers
1286 echo '<tr>' . $this->emptyLine() .
1287 $this->addedLine( $line ) . "</tr>\n";
1288 }
1289 wfProfileOut( __METHOD__ );
1290 }
1291 }