3 * Efficient concatenated text storage.
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
24 * Diff-based history compression
25 * Requires xdiff 1.5+ and zlib
27 class DiffHistoryBlob
implements HistoryBlob
{
28 /** @var array Uncompressed item cache */
31 /** @var int Total uncompressed size */
35 * @var array Array of diffs. If a diff D from A to B is notated D = B - A,
36 * and Z is an empty string:
38 * { item[map[i]] - item[map[i-1]] where i > 0
40 * { item[map[i]] - Z where i = 0
44 /** @var array The diff map, see above */
47 /** @var int The key for getText()
51 /** @var string Compressed storage */
54 /** @var bool True if the object is locked against further writes */
55 public $mFrozen = false;
58 * @var int The maximum uncompressed size before the object becomes sad
59 * Should be less than max_allowed_packet
61 public $mMaxSize = 10000000;
63 /** @var int The maximum number of text items before the object becomes sad */
64 public $mMaxCount = 100;
66 /** Constants from xdiff.h */
67 const XDL_BDOP_INS
= 1;
68 const XDL_BDOP_CPY
= 2;
69 const XDL_BDOP_INSB
= 3;
71 function __construct() {
72 if ( !function_exists( 'gzdeflate' ) ) {
73 throw new MWException( "Need zlib support to read or write DiffHistoryBlob\n" );
82 function addItem( $text ) {
83 if ( $this->mFrozen
) {
84 throw new MWException( __METHOD__
. ": Cannot add more items after sleep/wakeup" );
87 $this->mItems
[] = $text;
88 $this->mSize +
= strlen( $text );
89 $this->mDiffs
= null; // later
90 return count( $this->mItems
) - 1;
97 function getItem( $key ) {
98 return $this->mItems
[$key];
102 * @param string $text
104 function setText( $text ) {
105 $this->mDefaultKey
= $this->addItem( $text );
112 return $this->getItem( $this->mDefaultKey
);
116 * @throws MWException
118 function compress() {
119 if ( !function_exists( 'xdiff_string_rabdiff' ) ) {
120 throw new MWException( "Need xdiff 1.5+ support to write DiffHistoryBlob\n" );
122 if ( isset( $this->mDiffs
) ) {
123 // Already compressed
126 if ( $this->mItems
=== [] ) {
130 // Create two diff sequences: one for main text and one for small text
145 $mItemsCount = count( $this->mItems
);
146 for ( $i = 0; $i < $mItemsCount; $i++
) {
147 $text = $this->mItems
[$i];
151 $mainTail = $sequences['main']['tail'];
152 if ( strlen( $text ) < strlen( $mainTail ) * $smallFactor ) {
158 $seq =& $sequences[$seqName];
159 $tail = $seq['tail'];
160 $diff = $this->diff( $tail, $text );
161 $seq['diffs'][] = $diff;
163 $seq['tail'] = $text;
165 unset( $seq ); // unlink dangerous alias
167 // Knit the sequences together
170 $this->mDiffMap
= [];
171 foreach ( $sequences as $seq ) {
172 if ( $seq['diffs'] === [] ) {
175 if ( $tail === '' ) {
176 $this->mDiffs
[] = $seq['diffs'][0];
178 $head = $this->patch( '', $seq['diffs'][0] );
179 $this->mDiffs
[] = $this->diff( $tail, $head );
181 $this->mDiffMap
[] = $seq['map'][0];
182 $diffsCount = count( $seq['diffs'] );
183 for ( $i = 1; $i < $diffsCount; $i++
) {
184 $this->mDiffs
[] = $seq['diffs'][$i];
185 $this->mDiffMap
[] = $seq['map'][$i];
187 $tail = $seq['tail'];
196 function diff( $t1, $t2 ) {
197 # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff
198 # "String is not zero-terminated"
199 Wikimedia\
suppressWarnings();
200 $diff = xdiff_string_rabdiff( $t1, $t2 ) . '';
201 Wikimedia\restoreWarnings
();
206 * @param string $base
207 * @param string $diff
208 * @return bool|string
210 function patch( $base, $diff ) {
211 if ( function_exists( 'xdiff_string_bpatch' ) ) {
212 Wikimedia\
suppressWarnings();
213 $text = xdiff_string_bpatch( $base, $diff ) . '';
214 Wikimedia\restoreWarnings
();
218 # Pure PHP implementation
220 $header = unpack( 'Vofp/Vcsize', substr( $diff, 0, 8 ) );
222 # Check the checksum if hash extension is available
223 $ofp = $this->xdiffAdler32( $base );
224 if ( $ofp !== false && $ofp !== substr( $diff, 0, 4 ) ) {
225 wfDebug( __METHOD__
. ": incorrect base checksum\n" );
228 if ( $header['csize'] != strlen( $base ) ) {
229 wfDebug( __METHOD__
. ": incorrect base length\n" );
235 while ( $p < strlen( $diff ) ) {
236 $x = unpack( 'Cop', substr( $diff, $p, 1 ) );
240 case self
::XDL_BDOP_INS
:
241 $x = unpack( 'Csize', substr( $diff, $p, 1 ) );
243 $out .= substr( $diff, $p, $x['size'] );
246 case self
::XDL_BDOP_INSB
:
247 $x = unpack( 'Vcsize', substr( $diff, $p, 4 ) );
249 $out .= substr( $diff, $p, $x['csize'] );
252 case self
::XDL_BDOP_CPY
:
253 $x = unpack( 'Voff/Vcsize', substr( $diff, $p, 8 ) );
255 $out .= substr( $base, $x['off'], $x['csize'] );
258 wfDebug( __METHOD__
. ": invalid op\n" );
266 * Compute a binary "Adler-32" checksum as defined by LibXDiff, i.e. with
267 * the bytes backwards and initialised with 0 instead of 1. See T36428.
270 * @return string|bool False if the hash extension is not available
272 function xdiffAdler32( $s ) {
273 if ( !function_exists( 'hash' ) ) {
278 if ( $init === null ) {
279 $init = str_repeat( "\xf0", 205 ) . "\xee" . str_repeat( "\xf0", 67 ) . "\x02";
282 // The real Adler-32 checksum of $init is zero, so it initialises the
283 // state to zero, as it is at the start of LibXDiff's checksum
284 // algorithm. Appending the subject string then simulates LibXDiff.
285 return strrev( hash( 'adler32', $init . $s, true ) );
288 function uncompress() {
289 if ( !$this->mDiffs
) {
293 $mDiffsCount = count( $this->mDiffs
);
294 for ( $diffKey = 0; $diffKey < $mDiffsCount; $diffKey++
) {
295 $textKey = $this->mDiffMap
[$diffKey];
296 $text = $this->patch( $tail, $this->mDiffs
[$diffKey] );
297 $this->mItems
[$textKey] = $text;
307 if ( $this->mItems
=== [] ) {
310 // Take forward differences to improve the compression ratio for sequences
313 foreach ( $this->mDiffMap
as $i ) {
321 'diffs' => $this->mDiffs
,
325 if ( isset( $this->mDefaultKey
) ) {
326 $info['default'] = $this->mDefaultKey
;
328 $this->mCompressed
= gzdeflate( serialize( $info ) );
329 return [ 'mCompressed' ];
332 function __wakeup() {
333 // addItem() doesn't work if mItems is partially filled from mDiffs
334 $this->mFrozen
= true;
335 $info = unserialize( gzinflate( $this->mCompressed
) );
336 unset( $this->mCompressed
);
343 if ( isset( $info['default'] ) ) {
344 $this->mDefaultKey
= $info['default'];
346 $this->mDiffs
= $info['diffs'];
347 if ( isset( $info['base'] ) ) {
349 $this->mDiffMap
= range( 0, count( $this->mDiffs
) - 1 );
350 array_unshift( $this->mDiffs
,
351 pack( 'VVCV', 0, 0, self
::XDL_BDOP_INSB
, strlen( $info['base'] ) ) .
355 $map = explode( ',', $info['map'] );
357 $this->mDiffMap
= [];
358 foreach ( $map as $i ) {
360 $this->mDiffMap
[] = $cur;
367 * Helper function for compression jobs
368 * Returns true until the object is "full" and ready to be committed
373 return $this->mSize
< $this->mMaxSize
374 && count( $this->mItems
) < $this->mMaxCount
;