From e08cd0de8432f1746a06a05b458f3d47280ae4aa Mon Sep 17 00:00:00 2001 From: Max Semenik Date: Sun, 17 Apr 2016 17:39:25 -0700 Subject: [PATCH] Make wikidiff3 the only diff engine In addition to much improved worst-case performance, it also has better relevance. Bug: T128896 Change-Id: I3b52c502d7cd5923c5a02942afbe75aba9016148 --- RELEASE-NOTES-1.27 | 4 + includes/DefaultSettings.php | 4 +- includes/diff/DairikiDiff.php | 288 +---------------------------- includes/diff/DifferenceEngine.php | 6 +- 4 files changed, 13 insertions(+), 289 deletions(-) diff --git a/RELEASE-NOTES-1.27 b/RELEASE-NOTES-1.27 index 7ddcdfbae2..be15bde472 100644 --- a/RELEASE-NOTES-1.27 +++ b/RELEASE-NOTES-1.27 @@ -463,6 +463,10 @@ changes to languages because of Phabricator reports. * Skin::tooltipAndAccesskeyAttribs was removed (deprecated since 1.21). * Skin::userTalkLink was removed (deprecated since 1.21). * Skin::userToolLinksRedContribs was removed (deprecated since 1.21). +* wikidiff3 is now the default and only PHP diff engine. It provides improved diff + performance on complex changes. $wgExternalDiffEngine = 'wikidiff3' therefore + makes no difference now. Users are still recommended to use wikidiff2 if possible, + though. == Compatibility == diff --git a/includes/DefaultSettings.php b/includes/DefaultSettings.php index ccfb10afd4..13f7c4ef88 100644 --- a/includes/DefaultSettings.php +++ b/includes/DefaultSettings.php @@ -7812,9 +7812,9 @@ $wgUpdateRowsPerQuery = 100; /** * Name of the external diff engine to use. Supported values: - * * false: default PHP implementation, DairikiDiff + * * false: default PHP implementation * * 'wikidiff2': Wikimedia's fast difference engine implemented as a PHP/HHVM module - * * 'wikidiff3': newer PHP-based difference engine + * * 'wikidiff' and 'wikidiff3' are treated as false for backwards compatibility * * any other string is treated as a path to external diff executable */ $wgExternalDiffEngine = false; diff --git a/includes/diff/DairikiDiff.php b/includes/diff/DairikiDiff.php index 6272e7e992..e5e082feda 100644 --- a/includes/diff/DairikiDiff.php +++ b/includes/diff/DairikiDiff.php @@ -292,290 +292,10 @@ class DiffEngine { * @param string[] $to_lines */ private function diffLocal( $from_lines, $to_lines ) { - global $wgExternalDiffEngine; - - if ( $wgExternalDiffEngine == 'wikidiff3' ) { - // wikidiff3 - $wikidiff3 = new WikiDiff3(); - $wikidiff3->diff( $from_lines, $to_lines ); - $this->xchanged = $wikidiff3->removed; - $this->ychanged = $wikidiff3->added; - unset( $wikidiff3 ); - } else { - // old diff - $n_from = count( $from_lines ); - $n_to = count( $to_lines ); - $this->xchanged = $this->ychanged = []; - $this->xv = $this->yv = []; - $this->xind = $this->yind = []; - $this->seq = []; - $this->in_seq = []; - $this->lcs = 0; - - // Skip leading common lines. - for ( $skip = 0; $skip < $n_from && $skip < $n_to; $skip++ ) { - if ( $from_lines[$skip] !== $to_lines[$skip] ) { - break; - } - $this->xchanged[$skip] = $this->ychanged[$skip] = false; - } - // Skip trailing common lines. - $xi = $n_from; - $yi = $n_to; - for ( $endskip = 0; --$xi > $skip && --$yi > $skip; $endskip++ ) { - if ( $from_lines[$xi] !== $to_lines[$yi] ) { - break; - } - $this->xchanged[$xi] = $this->ychanged[$yi] = false; - } - - // Ignore lines which do not exist in both files. - for ( $xi = $skip; $xi < $n_from - $endskip; $xi++ ) { - $xhash[$this->lineHash( $from_lines[$xi] )] = 1; - } - - for ( $yi = $skip; $yi < $n_to - $endskip; $yi++ ) { - $line = $to_lines[$yi]; - $this->ychanged[$yi] = empty( $xhash[$this->lineHash( $line )] ); - if ( $this->ychanged[$yi] ) { - continue; - } - $yhash[$this->lineHash( $line )] = 1; - $this->yv[] = $line; - $this->yind[] = $yi; - } - for ( $xi = $skip; $xi < $n_from - $endskip; $xi++ ) { - $line = $from_lines[$xi]; - $this->xchanged[$xi] = empty( $yhash[$this->lineHash( $line )] ); - if ( $this->xchanged[$xi] ) { - continue; - } - $this->xv[] = $line; - $this->xind[] = $xi; - } - - // Find the LCS. - $this->compareSeq( 0, count( $this->xv ), 0, count( $this->yv ) ); - } - } - - /** - * Returns the whole line if it's small enough, or the MD5 hash otherwise - * - * @param string $line - * - * @return string - */ - private function lineHash( $line ) { - if ( strlen( $line ) > self::MAX_XREF_LENGTH ) { - return md5( $line ); - } else { - return $line; - } - } - - /** - * Divide the Largest Common Subsequence (LCS) of the sequences - * [XOFF, XLIM) and [YOFF, YLIM) into NCHUNKS approximately equally - * sized segments. - * - * Returns (LCS, PTS). LCS is the length of the LCS. PTS is an - * array of NCHUNKS+1 (X, Y) indexes giving the diving points between - * sub sequences. The first sub-sequence is contained in [X0, X1), - * [Y0, Y1), the second in [X1, X2), [Y1, Y2) and so on. Note - * that (X0, Y0) == (XOFF, YOFF) and - * (X[NCHUNKS], Y[NCHUNKS]) == (XLIM, YLIM). - * - * This function assumes that the first lines of the specified portions - * of the two files do not match, and likewise that the last lines do not - * match. The caller must trim matching lines from the beginning and end - * of the portions it is going to specify. - * - * @param int $xoff - * @param int $xlim - * @param int $yoff - * @param int $ylim - * @param int $nchunks - * - * @return array List of two elements, integer and array[]. - */ - private function diag( $xoff, $xlim, $yoff, $ylim, $nchunks ) { - $flip = false; - - if ( $xlim - $xoff > $ylim - $yoff ) { - // Things seems faster (I'm not sure I understand why) - // when the shortest sequence in X. - $flip = true; - list( $xoff, $xlim, $yoff, $ylim ) = [ $yoff, $ylim, $xoff, $xlim ]; - } - - if ( $flip ) { - for ( $i = $ylim - 1; $i >= $yoff; $i-- ) { - $ymatches[$this->xv[$i]][] = $i; - } - } else { - for ( $i = $ylim - 1; $i >= $yoff; $i-- ) { - $ymatches[$this->yv[$i]][] = $i; - } - } - - $this->lcs = 0; - $this->seq[0] = $yoff - 1; - $this->in_seq = []; - $ymids[0] = []; - - $numer = $xlim - $xoff + $nchunks - 1; - $x = $xoff; - for ( $chunk = 0; $chunk < $nchunks; $chunk++ ) { - if ( $chunk > 0 ) { - for ( $i = 0; $i <= $this->lcs; $i++ ) { - $ymids[$i][$chunk - 1] = $this->seq[$i]; - } - } - - $x1 = $xoff + (int)( ( $numer + ( $xlim - $xoff ) * $chunk ) / $nchunks ); - // @codingStandardsIgnoreStart Ignore Squiz.WhiteSpace.SemicolonSpacing.Incorrect - for ( ; $x < $x1; $x++ ) { - // @codingStandardsIgnoreEnd - $line = $flip ? $this->yv[$x] : $this->xv[$x]; - if ( empty( $ymatches[$line] ) ) { - continue; - } - - $k = 0; - $matches = $ymatches[$line]; - reset( $matches ); - while ( list( , $y ) = each( $matches ) ) { - if ( empty( $this->in_seq[$y] ) ) { - $k = $this->lcsPos( $y ); - assert( $k > 0 ); - $ymids[$k] = $ymids[$k - 1]; - break; - } - } - - while ( list( , $y ) = each( $matches ) ) { - if ( $y > $this->seq[$k - 1] ) { - assert( $y < $this->seq[$k] ); - // Optimization: this is a common case: - // next match is just replacing previous match. - $this->in_seq[$this->seq[$k]] = false; - $this->seq[$k] = $y; - $this->in_seq[$y] = 1; - } elseif ( empty( $this->in_seq[$y] ) ) { - $k = $this->lcsPos( $y ); - assert( $k > 0 ); - $ymids[$k] = $ymids[$k - 1]; - } - } - } - } - - $seps[] = $flip ? [ $yoff, $xoff ] : [ $xoff, $yoff ]; - $ymid = $ymids[$this->lcs]; - for ( $n = 0; $n < $nchunks - 1; $n++ ) { - $x1 = $xoff + (int)( ( $numer + ( $xlim - $xoff ) * $n ) / $nchunks ); - $y1 = $ymid[$n] + 1; - $seps[] = $flip ? [ $y1, $x1 ] : [ $x1, $y1 ]; - } - $seps[] = $flip ? [ $ylim, $xlim ] : [ $xlim, $ylim ]; - - return [ $this->lcs, $seps ]; - } - - /** - * @param int $ypos - * - * @return int - */ - private function lcsPos( $ypos ) { - $end = $this->lcs; - if ( $end == 0 || $ypos > $this->seq[$end] ) { - $this->seq[++$this->lcs] = $ypos; - $this->in_seq[$ypos] = 1; - - return $this->lcs; - } - - $beg = 1; - while ( $beg < $end ) { - $mid = (int)( ( $beg + $end ) / 2 ); - if ( $ypos > $this->seq[$mid] ) { - $beg = $mid + 1; - } else { - $end = $mid; - } - } - - assert( $ypos != $this->seq[$end] ); - - $this->in_seq[$this->seq[$end]] = false; - $this->seq[$end] = $ypos; - $this->in_seq[$ypos] = 1; - - return $end; - } - - /** - * Find LCS of two sequences. - * - * The results are recorded in the vectors $this->{x,y}changed[], by - * storing a 1 in the element for each line that is an insertion - * or deletion (ie. is not in the LCS). - * - * The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1. - * - * Note that XLIM, YLIM are exclusive bounds. - * All line numbers are origin-0 and discarded lines are not counted. - * - * @param int $xoff - * @param int $xlim - * @param int $yoff - * @param int $ylim - */ - private function compareSeq( $xoff, $xlim, $yoff, $ylim ) { - // Slide down the bottom initial diagonal. - while ( $xoff < $xlim && $yoff < $ylim && $this->xv[$xoff] == $this->yv[$yoff] ) { - ++$xoff; - ++$yoff; - } - - // Slide up the top initial diagonal. - while ( $xlim > $xoff && $ylim > $yoff - && $this->xv[$xlim - 1] == $this->yv[$ylim - 1] - ) { - --$xlim; - --$ylim; - } - - if ( $xoff == $xlim || $yoff == $ylim ) { - $lcs = 0; - } else { - // This is ad hoc but seems to work well. - // $nchunks = sqrt(min($xlim - $xoff, $ylim - $yoff) / 2.5); - // $nchunks = max(2,min(8,(int)$nchunks)); - $nchunks = min( 7, $xlim - $xoff, $ylim - $yoff ) + 1; - list( $lcs, $seps ) = $this->diag( $xoff, $xlim, $yoff, $ylim, $nchunks ); - } - - if ( $lcs == 0 ) { - // X and Y sequences have no common subsequence: - // mark all changed. - while ( $yoff < $ylim ) { - $this->ychanged[$this->yind[$yoff++]] = 1; - } - while ( $xoff < $xlim ) { - $this->xchanged[$this->xind[$xoff++]] = 1; - } - } else { - // Use the partitions to split this problem into subproblems. - reset( $seps ); - $pt1 = $seps[0]; - while ( $pt2 = next( $seps ) ) { - $this->compareSeq( $pt1[0], $pt2[0], $pt1[1], $pt2[1] ); - $pt1 = $pt2; - } - } + $wikidiff3 = new WikiDiff3(); + $wikidiff3->diff( $from_lines, $to_lines ); + $this->xchanged = $wikidiff3->removed; + $this->ychanged = $wikidiff3->added; } /** diff --git a/includes/diff/DifferenceEngine.php b/includes/diff/DifferenceEngine.php index 1508cf1e83..e2345ca2ba 100644 --- a/includes/diff/DifferenceEngine.php +++ b/includes/diff/DifferenceEngine.php @@ -870,8 +870,8 @@ class DifferenceEngine extends ContextSource { $otext = str_replace( "\r\n", "\n", $otext ); $ntext = str_replace( "\r\n", "\n", $ntext ); - if ( $wgExternalDiffEngine == 'wikidiff' ) { - wfDeprecated( 'wikidiff support', '1.27' ); + if ( $wgExternalDiffEngine == 'wikidiff' || $wgExternalDiffEngine == 'wikidiff3' ) { + wfDeprecated( "\$wgExternalDiffEngine = '{$wgExternalDiffEngine}'", '1.27' ); $wgExternalDiffEngine = false; } @@ -884,7 +884,7 @@ class DifferenceEngine extends ContextSource { return $text; } - } elseif ( $wgExternalDiffEngine != 'wikidiff3' && $wgExternalDiffEngine !== false ) { + } elseif ( $wgExternalDiffEngine !== false ) { # Diff via the shell $tmpDir = wfTempDir(); $tempName1 = tempnam( $tmpDir, 'diff_' ); -- 2.20.1