From c924b2c3a1e4d8a28a3bcc762244e7c2c340e68a Mon Sep 17 00:00:00 2001 From: Tim Starling Date: Sat, 1 Nov 2008 09:39:40 +0000 Subject: [PATCH] New DiffHistoryBlob storage format and compression method, which maintains a good compression ratio in the case of vandalism by replacement with small text. The storage format allows edit wars to be dealt with as well, but an efficient algorithm for that may be more difficult. --- includes/HistoryBlob.php | 175 +++++++++++++++++++++++++++++++-------- 1 file changed, 139 insertions(+), 36 deletions(-) diff --git a/includes/HistoryBlob.php b/includes/HistoryBlob.php index 8cd2945f24..7489399770 100644 --- a/includes/HistoryBlob.php +++ b/includes/HistoryBlob.php @@ -57,9 +57,10 @@ class ConcatenatedGzipHistoryBlob implements HistoryBlob public function addItem( $text ) { $this->uncompress(); $hash = md5( $text ); - $this->mItems[$hash] = $text; - $this->mSize += strlen( $text ); - + if ( !isset( $this->mItems[$hash] ) ) { + $this->mItems[$hash] = $text; + $this->mSize += strlen( $text ); + } return $hash; } @@ -277,11 +278,21 @@ class DiffHistoryBlob implements HistoryBlob { /** Uncompressed item cache */ var $mItems = array(); + /** Total uncompressed size */ + var $mSize = 0; + /** - * Array of diffs, where $this->mDiffs[0] is the diff between - * $this->mDiffs[0] and $this->mDiffs[1] + * Array of diffs. If a diff D from A to B is notated D = B - A, and Z is + * an empty string: + * + * { item[map[i]] - item[map[i-1]] where i > 0 + * diff[i] = { + * { item[map[i]] - Z where i = 0 */ - var $mDiffs = array(); + var $mDiffs; + + /** The diff map, see above */ + var $mDiffMap; /** * The key for getText() @@ -298,7 +309,6 @@ class DiffHistoryBlob implements HistoryBlob { */ var $mFrozen = false; - /** * The maximum uncompressed size before the object becomes sad * Should be less than max_allowed_packet @@ -326,34 +336,11 @@ class DiffHistoryBlob implements HistoryBlob { $this->mItems[] = $text; $this->mSize += strlen( $text ); - $i = count( $this->mItems ) - 1; - if ( $i > 0 ) { - # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff - # "String is not zero-terminated" - wfSuppressWarnings(); - $this->mDiffs[] = xdiff_string_bdiff( $this->mItems[$i-1], $text ) . ''; - wfRestoreWarnings(); - } - return $i; + $this->mDiffs = null; // later + return count( $this->mItems ) - 1; } function getItem( $key ) { - if ( $key > count( $this->mDiffs ) + 1 ) { - return false; - } - $key = intval( $key ); - if ( $key == 0 ) { - return $this->mItems[0]; - } - - $last = count( $this->mItems ) - 1; - for ( $i = $last + 1; $i <= $key; $i++ ) { - # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff - # "String is not zero-terminated" - wfSuppressWarnings(); - $this->mItems[$i] = xdiff_string_bpatch( $this->mItems[$i - 1], $this->mDiffs[$i - 1] ) . ''; - wfRestoreWarnings(); - } return $this->mItems[$key]; } @@ -365,14 +352,123 @@ class DiffHistoryBlob implements HistoryBlob { return $this->getItem( $this->mDefaultKey ); } + function compress() { + if ( isset( $this->mDiffs ) ) { + // Already compressed + return; + } + if ( !count( $this->mItems ) ) { + // Empty + return; + } + + // Create two diff sequences: one for main text and one for small text + $sequences = array( + 'small' => array( + 'tail' => '', + 'diffs' => array(), + 'map' => array(), + ), + 'main' => array( + 'tail' => '', + 'diffs' => array(), + 'map' => array(), + ), + ); + $smallFactor = 0.5; + + for ( $i = 0; $i < count( $this->mItems ); $i++ ) { + $text = $this->mItems[$i]; + if ( $i == 0 ) { + $seqName = 'main'; + } else { + $mainTail = $sequences['main']['tail']; + if ( strlen( $text ) < strlen( $mainTail ) * $smallFactor ) { + $seqName = 'small'; + } else { + $seqName = 'main'; + } + } + $seq =& $sequences[$seqName]; + $tail = $seq['tail']; + $diff = $this->diff( $tail, $text ); + $seq['diffs'][] = $diff; + $seq['map'][] = $i; + $seq['tail'] = $text; + } + unset( $seq ); // unlink dangerous alias + + // Knit the sequences together + $tail = ''; + $this->mDiffs = array(); + $this->mDiffMap = array(); + foreach ( $sequences as $seq ) { + if ( !count( $seq['diffs'] ) ) { + continue; + } + if ( $tail === '' ) { + $this->mDiffs[] = $seq['diffs'][0]; + } else { + $head = $this->patch( '', $seq['diffs'][0] ); + $this->mDiffs[] = $this->diff( $tail, $head ); + } + $this->mDiffMap[] = $seq['map'][0]; + for ( $i = 1; $i < count( $seq['diffs'] ); $i++ ) { + $this->mDiffs[] = $seq['diffs'][$i]; + $this->mDiffMap[] = $seq['map'][$i]; + } + $tail = $seq['tail']; + } + } + + function diff( $t1, $t2 ) { + # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff + # "String is not zero-terminated" + wfSuppressWarnings(); + $diff = xdiff_string_bdiff( $t1, $t2 ) . ''; + wfRestoreWarnings(); + return $diff; + } + + function patch( $base, $diff ) { + wfSuppressWarnings(); + $text = xdiff_string_bpatch( $base, $diff ) . ''; + wfRestoreWarnings(); + return $text; + } + + function uncompress() { + if ( !$this->mDiffs ) { + return; + } + $tail = ''; + for ( $diffKey = 0; $diffKey < count( $this->mDiffs ); $diffKey++ ) { + $textKey = $this->mDiffMap[$diffKey]; + $text = $this->patch( $tail, $this->mDiffs[$diffKey] ); + $this->mItems[$textKey] = $text; + $tail = $text; + } + } + function __sleep() { - if ( !isset( $this->mItems[0] ) ) { + $this->compress(); + if ( !count( $this->mItems ) ) { // Empty object $info = false; } else { + // Take forward differences to improve the compression ratio for sequences + $map = ''; + $prev = 0; + foreach ( $this->mDiffMap as $i ) { + if ( $map !== '' ) { + $map .= ','; + } + $map .= $i - $prev; + $prev = $i; + } $info = array( - 'base' => $this->mItems[0], - 'diffs' => $this->mDiffs + 'diffs' => $this->mDiffs, + 'map' => $map ); } if ( isset( $this->mDefaultKey ) ) { @@ -396,8 +492,15 @@ class DiffHistoryBlob implements HistoryBlob { if ( isset( $info['default'] ) ) { $this->mDefaultKey = $info['default']; } - $this->mItems[0] = $info['base']; + $map = explode( ',', $info['map'] ); + $cur = 0; + $this->mDiffMap = array(); + foreach ( $map as $i ) { + $cur += $i; + $this->mDiffMap[] = $cur; + } $this->mDiffs = $info['diffs']; + $this->uncompress(); } /** -- 2.20.1