From 30ea5f2d8bd31e6709a1bb772b0c65faf125c39b Mon Sep 17 00:00:00 2001 From: Tim Starling Date: Thu, 2 Oct 2008 06:32:14 +0000 Subject: [PATCH] * Concept for diff-based compression using the new xdiff beta. Achieves massively better compression ratio compared to CGZ for articles which are larger than the deflate 32 KB sliding window. Works within the HistoryBlob architecture. * Fixed documentation in HistoryBlob.php, removed "todo document" for methods that are adequately documented in the interface. * Added testCompression.php for testing concatenated object compression ratio --- includes/AutoLoader.php | 1 + includes/HistoryBlob.php | 220 +++++++++++++++++------- maintenance/storage/testCompression.php | 70 ++++++++ 3 files changed, 230 insertions(+), 61 deletions(-) create mode 100644 maintenance/storage/testCompression.php diff --git a/includes/AutoLoader.php b/includes/AutoLoader.php index c1f9a2ef2f..77baf6089c 100644 --- a/includes/AutoLoader.php +++ b/includes/AutoLoader.php @@ -34,6 +34,7 @@ $wgAutoloadLocalClasses = array( 'Credits' => 'includes/Credits.php', 'DBABagOStuff' => 'includes/BagOStuff.php', 'DependencyWrapper' => 'includes/CacheDependency.php', + 'DiffHistoryBlob' => 'includes/HistoryBlob.php', 'DjVuImage' => 'includes/DjVuImage.php', 'DoubleReplacer' => 'includes/StringUtils.php', 'DoubleRedirectJob' => 'includes/DoubleRedirectJob.php', diff --git a/includes/HistoryBlob.php b/includes/HistoryBlob.php index 3772926dfd..2281e7b5bb 100644 --- a/includes/HistoryBlob.php +++ b/includes/HistoryBlob.php @@ -1,41 +1,33 @@ uncompress(); - $this->mItems['meta'] = $metaData; - } - - /** @todo document */ - public function getMeta() { - $this->uncompress(); - return $this->mItems['meta']; - } - - /** @todo document */ public function addItem( $text ) { $this->uncompress(); $hash = md5( $text ); $this->mItems[$hash] = $text; $this->mSize += strlen( $text ); - $stub = new HistoryBlobStub( $hash ); - return $stub; + return $hash; } - /** @todo document */ 
public function getItem( $hash ) { $this->uncompress(); if ( array_key_exists( $hash, $this->mItems ) ) { @@ -97,29 +70,28 @@ class ConcatenatedGzipHistoryBlob implements HistoryBlob } } - /** @todo document */ public function setText( $text ) { $this->uncompress(); $stub = $this->addItem( $text ); $this->mDefaultHash = $stub->mHash; } - /** @todo document */ public function getText() { $this->uncompress(); return $this->getItem( $this->mDefaultHash ); } - # HistoryBlob implemented. - - - /** @todo document */ + /** + * Remove an item + */ public function removeItem( $hash ) { $this->mSize -= strlen( $this->mItems[$hash] ); unset( $this->mItems[$hash] ); } - /** @todo document */ + /** + * Compress the bulk data in the object + */ public function compress() { if ( !$this->mCompressed ) { $this->mItems = gzdeflate( serialize( $this->mItems ) ); @@ -127,7 +99,9 @@ class ConcatenatedGzipHistoryBlob implements HistoryBlob } } - /** @todo document */ + /** + * Uncompress bulk data + */ public function uncompress() { if ( $this->mCompressed ) { $this->mItems = unserialize( gzinflate( $this->mItems ) ); @@ -136,19 +110,18 @@ class ConcatenatedGzipHistoryBlob implements HistoryBlob } - /** @todo document */ function __sleep() { $this->compress(); return array( 'mVersion', 'mCompressed', 'mItems', 'mDefaultHash' ); } - /** @todo document */ function __wakeup() { $this->uncompress(); } /** - * Determines if this object is happy + * Helper function for compression jobs + * Returns true until the object is "full" and ready to be committed */ public function isHappy( $maxFactor, $factorThreshold ) { if ( count( $this->mItems ) == 0 ) { @@ -184,12 +157,15 @@ $wgBlobCache = array(); /** - * @todo document (needs one-sentence top-level class description + some function descriptions). + * Pointer object for an item within a CGZ blob stored in the text table. 
*/ class HistoryBlobStub { var $mOldId, $mHash, $mRef; - /** @todo document */ + /** + * @param string $hash The content hash of the text + * @param integer $oldid The old_id for the CGZ object + */ function HistoryBlobStub( $hash = '', $oldid = 0 ) { $this->mHash = $hash; } @@ -216,7 +192,6 @@ class HistoryBlobStub { return $this->mRef; } - /** @todo document */ function getText() { $fname = 'HistoryBlobStub::getText'; global $wgBlobCache; @@ -264,7 +239,9 @@ class HistoryBlobStub { return $obj->getItem( $this->mHash ); } - /** @todo document */ + /** + * Get the content hash + */ function getHash() { return $this->mHash; } @@ -282,7 +259,9 @@ class HistoryBlobStub { class HistoryBlobCurStub { var $mCurId; - /** @todo document */ + /** + * @param integer $curid The cur_id pointed to + */ function HistoryBlobCurStub( $curid = 0 ) { $this->mCurId = $curid; } @@ -295,7 +274,6 @@ class HistoryBlobCurStub { $this->mCurId = $id; } - /** @todo document */ function getText() { $dbr = wfGetDB( DB_SLAVE ); $row = $dbr->selectRow( 'cur', array( 'cur_text' ), array( 'cur_id' => $this->mCurId ) ); @@ -305,3 +283,123 @@ class HistoryBlobCurStub { return $row->cur_text; } } + +/** + * Diff-based history compression + * Requires xdiff 1.5+ and zlib + */ +class DiffHistoryBlob implements HistoryBlob { + /** Uncompressed item cache */ + var $mItems = array(); + + /** + * Array of diffs, where $this->mDiffs[0] is the diff between + * $this->mItems[0] and $this->mItems[1] + */ + var $mDiffs = array(); + + /** + * The key for getText() + */ + var $mDefaultKey; + + /** + * Compressed storage + */ + var $mCompressed; + + /** + * True if the object is locked against further writes + */ + var $mFrozen = false; + + + function __construct() { + if ( !function_exists( 'xdiff_string_bdiff' ) ){ + throw new MWException( "Need xdiff 1.5+ support to read or write DiffHistoryBlob\n" ); + } + if ( !function_exists( 'gzdeflate' ) ) { + throw new MWException( "Need zlib support to read or write 
DiffHistoryBlob\n" ); + } + } + + function addItem( $text ) { + if ( $this->mFrozen ) { + throw new MWException( __METHOD__.": Cannot add more items after sleep/wakeup" ); + } + + $this->mItems[] = $text; + $i = count( $this->mItems ) - 1; + if ( $i > 0 ) { + # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff + # "String is not zero-terminated" + wfSuppressWarnings(); + $this->mDiffs[] = xdiff_string_bdiff( $this->mItems[$i-1], $text ) . ''; + wfRestoreWarnings(); + } + return $i; + } + + function getItem( $key ) { + if ( $key > count( $this->mDiffs ) + 1 ) { + return false; + } + $key = intval( $key ); + if ( $key == 0 ) { + return $this->mItems[0]; + } + + $last = count( $this->mItems ) - 1; + for ( $i = $last + 1; $i <= $key; $i++ ) { + # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff + # "String is not zero-terminated" + wfSuppressWarnings(); + $this->mItems[$i] = xdiff_string_bpatch( $this->mItems[$i - 1], $this->mDiffs[$i - 1] ) . 
''; + wfRestoreWarnings(); + } + return $this->mItems[$key]; + } + + function setText( $text ) { + $this->mDefaultKey = $this->addItem( $text ); + } + + function getText() { + return $this->getItem( $this->mDefaultKey ); + } + + function __sleep() { + if ( !isset( $this->mItems[0] ) ) { + // Empty object + $info = false; + } else { + $info = array( + 'base' => $this->mItems[0], + 'diffs' => $this->mDiffs + ); + } + if ( isset( $this->mDefaultKey ) ) { + $info['default'] = $this->mDefaultKey; + } + $this->mCompressed = gzdeflate( serialize( $info ) ); + return array( 'mCompressed' ); + } + + function __wakeup() { + // addItem() doesn't work if mItems is partially filled from mDiffs + $this->mFrozen = true; + $info = unserialize( gzinflate( $this->mCompressed ) ); + unset( $this->mCompressed ); + + if ( !$info ) { + // Empty object + return; + } + + if ( isset( $info['default'] ) ) { + $this->mDefaultKey = $info['default']; + } + $this->mItems[0] = $info['base']; + $this->mDiffs = $info['diffs']; + } +} diff --git a/maintenance/storage/testCompression.php b/maintenance/storage/testCompression.php new file mode 100644 index 0000000000..eaf7e35fa7 --- /dev/null +++ b/maintenance/storage/testCompression.php @@ -0,0 +1,70 @@ +] [--start=] [--limit=] \n"; + exit( 1 ); +} + +$title = Title::newFromText( $args[0] ); +if ( isset( $options['start'] ) ) { + $start = wfTimestamp( TS_MW, strtotime( $options['start'] ) ); + echo "Starting from " . $wgLang->timeanddate( $start ) . "\n"; +} else { + $start = '19700101000000'; +} +$limit = isset( $options['limit'] ) ? $options['limit'] : 10; +$type = isset( $options['type'] ) ? $options['type'] : 'ConcatenatedGzipHistoryBlob'; + + +$dbr = wfGetDB( DB_SLAVE ); +$res = $dbr->select( + array( 'page', 'revision', 'text' ), + '*', + array( + 'page_namespace' => $title->getNamespace(), + 'page_title' => $title->getDBkey(), + 'page_id=rev_page', + 'rev_timestamp > ' . 
$dbr->addQuotes( $dbr->timestamp( $start ) ), + 'rev_text_id=old_id' + ), __FILE__, array( 'LIMIT' => $limit ) +); + +$blob = new $type; +$hashes = array(); +$keys = array(); +$uncompressedSize = 0; +$t = -microtime( true ); +foreach ( $res as $row ) { + $revision = new Revision( $row ); + $text = $revision->getText(); + $uncompressedSize += strlen( $text ); + $hashes[$row->rev_id] = md5( $text ); + $keys[$row->rev_id] = $blob->addItem( $text ); +} + +$serialized = serialize( $blob ); +$t += microtime( true ); + +printf( "Compression ratio for %d revisions: %5.2f, %s -> %s\n", + $res->numRows(), + $uncompressedSize / strlen( $serialized ), + $wgLang->formatSize( $uncompressedSize ), + $wgLang->formatSize( strlen( $serialized ) ) +); +printf( "Compression time: %5.2f ms\n", $t * 1000 ); + +$t = -microtime( true ); +$blob = unserialize( $serialized ); +foreach ( $keys as $id => $key ) { + $text = $blob->getItem( $key ); + if ( md5( $text ) != $hashes[$id] ) { + echo "Content hash mismatch for rev_id $id\n"; + #var_dump( $text ); + } +} +$t += microtime( true ); +printf( "Decompression time: %5.2f ms\n", $t * 1000 ); + -- 2.20.1