From e48a9beb8e036cfb9572e311a7f77b91712e9a2d Mon Sep 17 00:00:00 2001 From: Tim Starling Date: Fri, 17 Oct 2008 09:11:43 +0000 Subject: [PATCH] Revert revert r41578 of r41531 and fix compressOld.php. --- includes/AutoLoader.php | 1 + includes/HistoryBlob.php | 223 +++++++++++++++++------- maintenance/storage/compressOld.inc | 6 +- maintenance/storage/testCompression.php | 70 ++++++++ 4 files changed, 232 insertions(+), 68 deletions(-) create mode 100644 maintenance/storage/testCompression.php diff --git a/includes/AutoLoader.php b/includes/AutoLoader.php index a4b96008be..d0f417f6ae 100644 --- a/includes/AutoLoader.php +++ b/includes/AutoLoader.php @@ -35,6 +35,7 @@ $wgAutoloadLocalClasses = array( 'Credits' => 'includes/Credits.php', 'DBABagOStuff' => 'includes/BagOStuff.php', 'DependencyWrapper' => 'includes/CacheDependency.php', + 'DiffHistoryBlob' => 'includes/HistoryBlob.php', 'DjVuImage' => 'includes/DjVuImage.php', 'DoubleReplacer' => 'includes/StringUtils.php', 'DoubleRedirectJob' => 'includes/DoubleRedirectJob.php', diff --git a/includes/HistoryBlob.php b/includes/HistoryBlob.php index 3772926dfd..b051adf143 100644 --- a/includes/HistoryBlob.php +++ b/includes/HistoryBlob.php @@ -1,41 +1,33 @@ uncompress(); - $this->mItems['meta'] = $metaData; - } - - /** @todo document */ - public function getMeta() { - $this->uncompress(); - return $this->mItems['meta']; - } - - /** @todo document */ public function addItem( $text ) { $this->uncompress(); $hash = md5( $text ); $this->mItems[$hash] = $text; $this->mSize += strlen( $text ); - $stub = new HistoryBlobStub( $hash ); - return $stub; + return $hash; } - /** @todo document */ public function getItem( $hash ) { $this->uncompress(); if ( array_key_exists( $hash, $this->mItems ) ) { @@ -97,29 +70,27 @@ class ConcatenatedGzipHistoryBlob implements HistoryBlob } } - /** @todo document */ public function setText( $text ) { $this->uncompress(); - $stub = $this->addItem( $text ); - $this->mDefaultHash = $stub->mHash; + 
$this->mDefaultHash = $this->addItem( $text ); } - /** @todo document */ public function getText() { $this->uncompress(); return $this->getItem( $this->mDefaultHash ); } - # HistoryBlob implemented. - - - /** @todo document */ + /** + * Remove an item + */ public function removeItem( $hash ) { $this->mSize -= strlen( $this->mItems[$hash] ); unset( $this->mItems[$hash] ); } - /** @todo document */ + /** + * Compress the bulk data in the object + */ public function compress() { if ( !$this->mCompressed ) { $this->mItems = gzdeflate( serialize( $this->mItems ) ); @@ -127,7 +98,9 @@ class ConcatenatedGzipHistoryBlob implements HistoryBlob } } - /** @todo document */ + /** + * Uncompress bulk data + */ public function uncompress() { if ( $this->mCompressed ) { $this->mItems = unserialize( gzinflate( $this->mItems ) ); @@ -136,19 +109,18 @@ class ConcatenatedGzipHistoryBlob implements HistoryBlob } - /** @todo document */ function __sleep() { $this->compress(); return array( 'mVersion', 'mCompressed', 'mItems', 'mDefaultHash' ); } - /** @todo document */ function __wakeup() { $this->uncompress(); } /** - * Determines if this object is happy + * Helper function for compression jobs + * Returns true until the object is "full" and ready to be committed */ public function isHappy( $maxFactor, $factorThreshold ) { if ( count( $this->mItems ) == 0 ) { @@ -184,12 +156,15 @@ $wgBlobCache = array(); /** - * @todo document (needs one-sentence top-level class description + some function descriptions). + * Pointer object for an item within a CGZ blob stored in the text table. 
*/ class HistoryBlobStub { var $mOldId, $mHash, $mRef; - /** @todo document */ + /** + * @param string $hash The content hash of the text + * @param integer $oldid The old_id for the CGZ object (currently ignored by the constructor) + */ function HistoryBlobStub( $hash = '', $oldid = 0 ) { $this->mHash = $hash; } @@ -216,7 +191,6 @@ class HistoryBlobStub { return $this->mRef; } - /** @todo document */ function getText() { $fname = 'HistoryBlobStub::getText'; global $wgBlobCache; @@ -264,7 +238,9 @@ class HistoryBlobStub { return $obj->getItem( $this->mHash ); } - /** @todo document */ + /** + * Get the content hash + */ function getHash() { return $this->mHash; } @@ -282,7 +258,9 @@ class HistoryBlobStub { class HistoryBlobCurStub { var $mCurId; - /** @todo document */ + /** + * @param integer $curid The cur_id pointed to + */ function HistoryBlobCurStub( $curid = 0 ) { $this->mCurId = $curid; } @@ -295,7 +273,6 @@ class HistoryBlobCurStub { $this->mCurId = $id; } - /** @todo document */ function getText() { $dbr = wfGetDB( DB_SLAVE ); $row = $dbr->selectRow( 'cur', array( 'cur_text' ), array( 'cur_id' => $this->mCurId ) ); @@ -305,3 +282,123 @@ class HistoryBlobCurStub { return $row->cur_text; } } + +/** + * Diff-based history compression + * Requires xdiff 1.5+ and zlib + */ +class DiffHistoryBlob implements HistoryBlob { + /** Uncompressed item cache */ + var $mItems = array(); + + /** + * Array of diffs, where $this->mDiffs[0] is the diff between + * $this->mItems[0] and $this->mItems[1] + */ + var $mDiffs = array(); + + /** + * The key for getText() + */ + var $mDefaultKey; + + /** + * Compressed storage + */ + var $mCompressed; + + /** + * True if the object is locked against further writes + */ + var $mFrozen = false; + + + function __construct() { + if ( !function_exists( 'xdiff_string_bdiff' ) ){ + throw new MWException( "Need xdiff 1.5+ support to read or write DiffHistoryBlob\n" ); + } + if ( !function_exists( 'gzdeflate' ) ) { + throw new MWException( "Need zlib support to read or write 
DiffHistoryBlob\n" ); + } + } + + function addItem( $text ) { + if ( $this->mFrozen ) { + throw new MWException( __METHOD__.": Cannot add more items after sleep/wakeup" ); + } + + $this->mItems[] = $text; + $i = count( $this->mItems ) - 1; + if ( $i > 0 ) { + # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff + # "String is not zero-terminated" + wfSuppressWarnings(); + $this->mDiffs[] = xdiff_string_bdiff( $this->mItems[$i-1], $text ) . ''; + wfRestoreWarnings(); + } + return $i; + } + + function getItem( $key ) { + if ( !isset( $this->mItems[0] ) || $key < 0 || $key > count( $this->mDiffs ) ) { // valid keys are 0 .. count( mDiffs ); an empty blob has none + return false; + } + $key = intval( $key ); + if ( $key == 0 ) { + return $this->mItems[0]; + } + + $last = count( $this->mItems ) - 1; + for ( $i = $last + 1; $i <= $key; $i++ ) { + # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff + # "String is not zero-terminated" + wfSuppressWarnings(); + $this->mItems[$i] = xdiff_string_bpatch( $this->mItems[$i - 1], $this->mDiffs[$i - 1] ) . 
''; + wfRestoreWarnings(); + } + return $this->mItems[$key]; + } + + function setText( $text ) { + $this->mDefaultKey = $this->addItem( $text ); + } + + function getText() { + return $this->getItem( $this->mDefaultKey ); + } + + function __sleep() { + if ( !isset( $this->mItems[0] ) ) { + // Empty object + $info = false; + } else { + $info = array( + 'base' => $this->mItems[0], + 'diffs' => $this->mDiffs + ); + } + if ( isset( $this->mDefaultKey ) ) { + $info['default'] = $this->mDefaultKey; + } + $this->mCompressed = gzdeflate( serialize( $info ) ); + return array( 'mCompressed' ); + } + + function __wakeup() { + // addItem() doesn't work if mItems is partially filled from mDiffs + $this->mFrozen = true; + $info = unserialize( gzinflate( $this->mCompressed ) ); + unset( $this->mCompressed ); + + if ( !$info ) { + // Empty object + return; + } + + if ( isset( $info['default'] ) ) { + $this->mDefaultKey = $info['default']; + } + $this->mItems[0] = $info['base']; + $this->mDiffs = $info['diffs']; + } +} diff --git a/maintenance/storage/compressOld.inc b/maintenance/storage/compressOld.inc index 52b9c40b2e..d812a95baf 100644 --- a/maintenance/storage/compressOld.inc +++ b/maintenance/storage/compressOld.inc @@ -4,10 +4,6 @@ * @ingroup Maintenance ExternalStorage */ -/** */ -require_once( 'Revision.php' ); -require_once( 'ExternalStoreDB.php' ); - /** @todo document */ function compressOldPages( $start = 0, $extdb = '' ) { $fname = 'compressOldPages'; @@ -229,7 +225,7 @@ function compressWithConcat( $startId, $maxChunkSize, $maxChunkFactor, $factorTh $stub = false; print 'x'; } else { - $stub = $chunk->addItem( $text ); + $stub = new HistoryBlobStub( $chunk->addItem( $text ) ); $stub->setLocation( $primaryOldid ); $stub->setReferrer( $oldid ); print '.'; diff --git a/maintenance/storage/testCompression.php b/maintenance/storage/testCompression.php new file mode 100644 index 0000000000..eaf7e35fa7 --- /dev/null +++ b/maintenance/storage/testCompression.php @@ -0,0 +1,70 
@@ +] [--start=] [--limit=] \n"; + exit( 1 ); +} + +$title = Title::newFromText( $args[0] ); +if ( isset( $options['start'] ) ) { + $start = wfTimestamp( TS_MW, strtotime( $options['start'] ) ); + echo "Starting from " . $wgLang->timeanddate( $start ) . "\n"; +} else { + $start = '19700101000000'; +} +$limit = isset( $options['limit'] ) ? $options['limit'] : 10; +$type = isset( $options['type'] ) ? $options['type'] : 'ConcatenatedGzipHistoryBlob'; + + +$dbr = wfGetDB( DB_SLAVE ); +$res = $dbr->select( + array( 'page', 'revision', 'text' ), + '*', + array( + 'page_namespace' => $title->getNamespace(), + 'page_title' => $title->getDBkey(), + 'page_id=rev_page', + 'rev_timestamp > ' . $dbr->addQuotes( $dbr->timestamp( $start ) ), + 'rev_text_id=old_id' + ), __FILE__, array( 'LIMIT' => $limit ) +); + +$blob = new $type; +$hashes = array(); +$keys = array(); +$uncompressedSize = 0; +$t = -microtime( true ); +foreach ( $res as $row ) { + $revision = new Revision( $row ); + $text = $revision->getText(); + $uncompressedSize += strlen( $text ); + $hashes[$row->rev_id] = md5( $text ); + $keys[$row->rev_id] = $blob->addItem( $text ); +} + +$serialized = serialize( $blob ); +$t += microtime( true ); + +printf( "Compression ratio for %d revisions: %5.2f, %s -> %s\n", + $res->numRows(), + $uncompressedSize / strlen( $serialized ), + $wgLang->formatSize( $uncompressedSize ), + $wgLang->formatSize( strlen( $serialized ) ) +); +printf( "Compression time: %5.2f ms\n", $t * 1000 ); + +$t = -microtime( true ); +$blob = unserialize( $serialized ); +foreach ( $keys as $id => $key ) { + $text = $blob->getItem( $key ); + if ( md5( $text ) != $hashes[$id] ) { + echo "Content hash mismatch for rev_id $id\n"; + #var_dump( $text ); + } +} +$t += microtime( true ); +printf( "Decompression time: %5.2f ms\n", $t * 1000 ); + -- 2.20.1