From: Max Semenik Date: Wed, 27 Apr 2016 01:10:26 +0000 (-0700) Subject: Rethink diff limits X-Git-Tag: 1.31.0-rc.0~6729 X-Git-Url: http://git.cyclocoop.org/%22%20.%20generer_url_ecrire%28%22statistiques_visites%22%2C%22%22%29%20.%20%22?a=commitdiff_plain;h=dd57ff3cce86f3055c249a414c49b74f2bcb3497;p=lhc%2Fweb%2Fwiklou.git Rethink diff limits Now, instead of "if your changed paragraphs are larger than 10Kb, you're screwed": * Instead of relying on overall length, estimate complexity after splitting to words and taking any equal head and tail out of equation. * Estimate based on words changed, which better reflects the actual complexity of generating a diff. * New limit is determined scientifically, i.e. "above that number XDebug starts complaining about recursion limits reached in Vagrant". Caveat: if new limits are hit, the consequences are more widespread as all adjacent changed paragraphs are displayed without word level diffs, as opposed to only the paragraph that's too long being affected. However, the new limit is much higher and in wikitext you're supposed to put empty lines between paragraphs anyway, negating this problem. Bug: T128697 Change-Id: I4e91c7c40f5afdd116b847a859b8517522302489 --- diff --git a/RELEASE-NOTES-1.28 b/RELEASE-NOTES-1.28 index 6348ac385c..a650a50254 100644 --- a/RELEASE-NOTES-1.28 +++ b/RELEASE-NOTES-1.28 @@ -43,7 +43,7 @@ regularly. Below only new and removed languages are listed, as well as changes to languages because of Phabricator reports. === Other changes in 1.28 === - +* (T128697) Improved handling of large diffs. == Compatibility == diff --git a/autoload.php b/autoload.php index 8ff952c51c..27da2ca050 100644 --- a/autoload.php +++ b/autoload.php @@ -830,6 +830,7 @@ $wgAutoloadLocalClasses = [ 'MediaWiki\\Auth\\Throttler' => __DIR__ . '/includes/auth/Throttler.php', 'MediaWiki\\Auth\\UserDataAuthenticationRequest' => __DIR__ . '/includes/auth/UserDataAuthenticationRequest.php', 'MediaWiki\\Auth\\UsernameAuthenticationRequest' => __DIR__ . '/includes/auth/UsernameAuthenticationRequest.php', + 'MediaWiki\\Diff\\ComplexityException' => __DIR__ . '/includes/diff/ComplexityException.php', 'MediaWiki\\Diff\\WordAccumulator' => __DIR__ . '/includes/diff/WordAccumulator.php', 'MediaWiki\\Interwiki\\ClassicInterwikiLookup' => __DIR__ . '/includes/interwiki/ClassicInterwikiLookup.php', 'MediaWiki\\Interwiki\\InterwikiLookup' => __DIR__ . '/includes/interwiki/InterwikiLookup.php', diff --git a/includes/diff/ComplexityException.php b/includes/diff/ComplexityException.php new file mode 100644 index 0000000000..10ca964ac2 --- /dev/null +++ b/includes/diff/ComplexityException.php @@ -0,0 +1,30 @@ +setBailoutComplexity( $this->bailoutComplexity ); $this->edits = $eng->diff( $from_lines, $to_lines ); } diff --git a/includes/diff/DiffEngine.php b/includes/diff/DiffEngine.php index 1853b865a6..babd00b5d7 100644 --- a/includes/diff/DiffEngine.php +++ b/includes/diff/DiffEngine.php @@ -22,6 +22,7 @@ * @file * @ingroup DifferenceEngine */ +use MediaWiki\Diff\ComplexityException; /** * This diff implementation is mainly lifted from the LCS algorithm of the Eclipse project which @@ -51,6 +52,8 @@ class DiffEngine { private $tooLong; private $powLimit; + protected $bailoutComplexity = 0; + // State variables private $maxDifferences; private $lcsLengthCorrectedForHeuristic = false; @@ -71,6 +74,7 @@ class DiffEngine { * * @param string[] $from_lines * @param string[] $to_lines + * @throws ComplexityException * * @return DiffOp[] */ @@ -128,6 +132,14 @@ class DiffEngine { return $edits; } + /** + * Sets the complexity (in comparison operations) that can't be exceeded + * @param int $value + */ + public function setBailoutComplexity( $value ) { + $this->bailoutComplexity = $value; + } + /** * Adjust inserts/deletes of identical lines to join changes * as much as possible. @@ -265,6 +277,7 @@ class DiffEngine { /** * @param string[] $from * @param string[] $to + * @throws ComplexityException */ protected function diffInternal( array $from, array $to ) { // remember initial lengths @@ -323,6 +336,10 @@ class DiffEngine { $this->m = count( $this->from ); $this->n = count( $this->to ); + if ( $this->bailoutComplexity > 0 && $this->m * $this->n > $this->bailoutComplexity ) { + throw new ComplexityException(); + } + $this->removed = $this->m > 0 ? array_fill( 0, $this->m, true ) : []; $this->added = $this->n > 0 ? array_fill( 0, $this->n, true ) : []; diff --git a/includes/diff/WordLevelDiff.php b/includes/diff/WordLevelDiff.php index 12cf37671d..296e3b7493 100644 --- a/includes/diff/WordLevelDiff.php +++ b/includes/diff/WordLevelDiff.php @@ -23,6 +23,7 @@ * @defgroup DifferenceEngine DifferenceEngine */ +use MediaWiki\Diff\ComplexityException; use MediaWiki\Diff\WordAccumulator; /** @@ -31,7 +32,10 @@ use MediaWiki\Diff\WordAccumulator; * @ingroup DifferenceEngine */ class WordLevelDiff extends \Diff { - const MAX_LINE_LENGTH = 10000; + /** + * @inheritdoc + */ + protected $bailoutComplexity = 40000000; // Roughly 6K x 6K words changed /** * @param string[] $linesBefore @@ -42,7 +46,12 @@ class WordLevelDiff extends \Diff { list( $wordsBefore, $wordsBeforeStripped ) = $this->split( $linesBefore ); list( $wordsAfter, $wordsAfterStripped ) = $this->split( $linesAfter ); - parent::__construct( $wordsBeforeStripped, $wordsAfterStripped ); + try { + parent::__construct( $wordsBeforeStripped, $wordsAfterStripped ); + } catch ( ComplexityException $ex ) { + // Too hard to diff, just show whole paragraph(s) as changed + $this->edits = [ new DiffOpChange( $linesBefore, $linesAfter ) ]; + } $xi = $yi = 0; $editCount = count( $this->edits ); @@ -73,28 +82,20 @@ class WordLevelDiff extends \Diff { $stripped = []; $first = true; foreach ( $lines as $line ) { - # If the line is too long, just pretend the entire line is one big word - # This prevents resource exhaustion problems if ( $first ) { $first = false; } else { $words[] = "\n"; $stripped[] = "\n"; } - if ( strlen( $line ) > self::MAX_LINE_LENGTH ) { - $words[] = $line; - $stripped[] = $line; - } else { - $m = []; - if ( preg_match_all( '/ ( [^\S\n]+ | [0-9_A-Za-z\x80-\xff]+ | . ) (?: (?!< \n) [^\S\n])? /xs', - $line, $m ) - ) { - foreach ( $m[0] as $word ) { - $words[] = $word; - } - foreach ( $m[1] as $stripped_word ) { - $stripped[] = $stripped_word; - } + $m = []; + if ( preg_match_all( '/ ( [^\S\n]+ | [0-9_A-Za-z\x80-\xff]+ | . ) (?: (?!< \n) [^\S\n])? /xs', + $line, $m ) ) { + foreach ( $m[0] as $word ) { + $words[] = $word; + } + foreach ( $m[1] as $stripped_word ) { + $stripped[] = $stripped_word; } } }