<?php
/**
- * Pure virtual parent
- * @todo document (needs a one-sentence top-level class description, that answers the question: "what is a HistoryBlob?")
+ * Base class for general text storage via the "object" flag in old_flags, or
+ * two-part external storage URLs. Used to represent efficient concatenated
+ * storage, and migration-related pointer objects.
*/
interface HistoryBlob
{
- /**
- * setMeta and getMeta currently aren't used for anything, I just thought
- * they might be useful in the future.
- * @param $meta String: a single string.
- */
- public function setMeta( $meta );
-
- /**
- * setMeta and getMeta currently aren't used for anything, I just thought
- * they might be useful in the future.
- * Gets the meta-value
- */
- public function getMeta();
-
/**
* Adds an item of text, returns a stub object which points to the item.
* You must call setLocation() on the stub object before storing it to the
* database
+ * Returns the key for getItem()
*/
public function addItem( $text );
/**
- * Get item by hash
+ * Get item by key, or false if the key is not present
*/
- public function getItem( $hash );
+ public function getItem( $key );
- # Set the "default text"
- # This concept is an odd property of the current DB schema, whereby each text item has a revision
- # associated with it. The default text is the text of the associated revision. There may, however,
- # be other revisions in the same object
+ /**
+ * Set the "default text"
+ * This concept is an odd property of the current DB schema, whereby each text item has a revision
+ * associated with it. The default text is the text of the associated revision. There may, however,
+ * be other revisions in the same object.
+ *
+ * Default text is not required for two-part external storage URLs.
+ */
public function setText( $text );
/**
}
/**
- * The real object
- * @todo document (needs one-sentence top-level class description + function descriptions).
+ * Concatenated gzip (CGZ) storage
+ * Improves compression ratio by concatenating like objects before gzipping
*/
class ConcatenatedGzipHistoryBlob implements HistoryBlob
{
}
}
- #
- # HistoryBlob implementation:
- #
-
- /** @todo document */
- public function setMeta( $metaData ) {
- $this->uncompress();
- $this->mItems['meta'] = $metaData;
- }
-
- /** @todo document */
- public function getMeta() {
- $this->uncompress();
- return $this->mItems['meta'];
- }
-
- /** @todo document */
public function addItem( $text ) {
$this->uncompress();
$hash = md5( $text );
$this->mItems[$hash] = $text;
$this->mSize += strlen( $text );
- $stub = new HistoryBlobStub( $hash );
- return $stub;
+ return $hash;
}
- /** @todo document */
public function getItem( $hash ) {
$this->uncompress();
if ( array_key_exists( $hash, $this->mItems ) ) {
}
}
- /** @todo document */
public function setText( $text ) {
$this->uncompress();
- $stub = $this->addItem( $text );
- $this->mDefaultHash = $stub->mHash;
+ $this->mDefaultHash = $this->addItem( $text );
}
- /** @todo document */
public function getText() {
$this->uncompress();
return $this->getItem( $this->mDefaultHash );
}
- # HistoryBlob implemented.
-
-
- /** @todo document */
+ /**
+ * Remove an item
+ */
public function removeItem( $hash ) {
$this->mSize -= strlen( $this->mItems[$hash] );
unset( $this->mItems[$hash] );
}
- /** @todo document */
+ /**
+ * Compress the bulk data in the object
+ */
public function compress() {
if ( !$this->mCompressed ) {
$this->mItems = gzdeflate( serialize( $this->mItems ) );
}
}
- /** @todo document */
+ /**
+ * Uncompress bulk data
+ */
public function uncompress() {
if ( $this->mCompressed ) {
$this->mItems = unserialize( gzinflate( $this->mItems ) );
}
- /** @todo document */
function __sleep() {
$this->compress();
return array( 'mVersion', 'mCompressed', 'mItems', 'mDefaultHash' );
}
- /** @todo document */
function __wakeup() {
$this->uncompress();
}
/**
- * Determines if this object is happy
+ * Helper function for compression jobs
+ * Returns true until the object is "full" and ready to be committed
*/
public function isHappy( $maxFactor, $factorThreshold ) {
if ( count( $this->mItems ) == 0 ) {
/**
- * @todo document (needs one-sentence top-level class description + some function descriptions).
+ * Pointer object for an item within a CGZ blob stored in the text table.
*/
class HistoryBlobStub {
var $mOldId, $mHash, $mRef;
- /** @todo document */
+ /**
+ * @param string $hash The content hash of the text
+ * @param integer $oldid The old_id for the CGZ object
+ */
function HistoryBlobStub( $hash = '', $oldid = 0 ) {
$this->mHash = $hash;
}
return $this->mRef;
}
- /** @todo document */
function getText() {
$fname = 'HistoryBlobStub::getText';
global $wgBlobCache;
return $obj->getItem( $this->mHash );
}
- /** @todo document */
+ /**
+ * Get the content hash
+ */
function getHash() {
return $this->mHash;
}
class HistoryBlobCurStub {
var $mCurId;
- /** @todo document */
+ /**
+ * @param integer $curid The cur_id pointed to
+ */
function HistoryBlobCurStub( $curid = 0 ) {
$this->mCurId = $curid;
}
$this->mCurId = $id;
}
- /** @todo document */
function getText() {
$dbr = wfGetDB( DB_SLAVE );
$row = $dbr->selectRow( 'cur', array( 'cur_text' ), array( 'cur_id' => $this->mCurId ) );
return $row->cur_text;
}
}
+
+/**
+ * Diff-based history compression
+ * Requires xdiff 1.5+ and zlib
+ *
+ * The first item added is kept in full; every later item is kept as a binary
+ * diff against the item added immediately before it. Item keys are the
+ * zero-based insertion positions returned by addItem().
+ */
+class DiffHistoryBlob implements HistoryBlob {
+	/** Uncompressed item cache */
+	var $mItems = array();
+
+	/**
+	 * Array of diffs, where $this->mDiffs[0] is the diff between
+	 * $this->mItems[0] and $this->mItems[1]
+	 */
+	var $mDiffs = array();
+
+	/**
+	 * The key for getText()
+	 */
+	var $mDefaultKey;
+
+	/**
+	 * Compressed storage
+	 */
+	var $mCompressed;
+
+	/**
+	 * True if the object is locked against further writes
+	 */
+	var $mFrozen = false;
+
+
+	/**
+	 * Check that the functions this class relies on are available.
+	 * Throws MWException if xdiff_string_bdiff() or gzdeflate() is missing.
+	 */
+	function __construct() {
+		if ( !function_exists( 'xdiff_string_bdiff' ) ){
+			throw new MWException( "Need xdiff 1.5+ support to read or write DiffHistoryBlob\n" );
+		}
+		if ( !function_exists( 'gzdeflate' ) ) {
+			throw new MWException( "Need zlib support to read or write DiffHistoryBlob\n" );
+		}
+	}
+
+	/**
+	 * Append an item of text.
+	 * Throws MWException if the object has been through __wakeup().
+	 *
+	 * @param $text String: the text to store
+	 * @return Integer: the key to pass to getItem()
+	 */
+	function addItem( $text ) {
+		if ( $this->mFrozen ) {
+			throw new MWException( __METHOD__.": Cannot add more items after sleep/wakeup" );
+		}
+
+		$this->mItems[] = $text;
+		$i = count( $this->mItems ) - 1;
+		if ( $i > 0 ) {
+			# Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff
+			# "String is not zero-terminated"
+			wfSuppressWarnings();
+			$this->mDiffs[] = xdiff_string_bdiff( $this->mItems[$i-1], $text ) . '';
+			wfRestoreWarnings();
+		}
+		return $i;
+	}
+
+	/**
+	 * Get an item by key, rebuilding it from the stored diffs if it is not
+	 * already in the uncompressed item cache.
+	 *
+	 * @param $key Integer: a key previously returned by addItem()
+	 * @return String, or false if the key is not present
+	 */
+	function getItem( $key ) {
+		// Normalise before range-checking so that the comparison and the
+		// array lookups below agree on the value
+		$key = intval( $key );
+
+		// There is one more item than there are diffs, so the valid keys are
+		// 0 .. count( mDiffs ). An empty object has no valid keys at all.
+		if ( $key < 0 || count( $this->mItems ) == 0 || $key > count( $this->mDiffs ) ) {
+			return false;
+		}
+		if ( $key == 0 ) {
+			return $this->mItems[0];
+		}
+
+		// mItems is filled contiguously from index 0, so patch forward from
+		// the last cached item until the requested one has been rebuilt
+		$last = count( $this->mItems ) - 1;
+		for ( $i = $last + 1; $i <= $key; $i++ ) {
+			# Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff
+			# "String is not zero-terminated"
+			wfSuppressWarnings();
+			$this->mItems[$i] = xdiff_string_bpatch( $this->mItems[$i - 1], $this->mDiffs[$i - 1] ) . '';
+			wfRestoreWarnings();
+		}
+		return $this->mItems[$key];
+	}
+
+	/**
+	 * Add an item and remember its key as the default for getText().
+	 *
+	 * @param $text String
+	 */
+	function setText( $text ) {
+		$this->mDefaultKey = $this->addItem( $text );
+	}
+
+	/**
+	 * Get the item stored by setText().
+	 *
+	 * @return String, or false if the default key is not present
+	 */
+	function getText() {
+		return $this->getItem( $this->mDefaultKey );
+	}
+
+	/**
+	 * Serialization hook: pack the base text, the diffs and the default key
+	 * into $mCompressed, which is the only member that gets serialized.
+	 */
+	function __sleep() {
+		if ( !isset( $this->mItems[0] ) ) {
+			// Empty object
+			$info = false;
+		} else {
+			$info = array(
+				'base' => $this->mItems[0],
+				'diffs' => $this->mDiffs
+			);
+			// Only attach the default key to a real array; writing an
+			// offset into false would rely on implicit array conversion
+			if ( isset( $this->mDefaultKey ) ) {
+				$info['default'] = $this->mDefaultKey;
+			}
+		}
+		$this->mCompressed = gzdeflate( serialize( $info ) );
+		return array( 'mCompressed' );
+	}
+
+	/**
+	 * Unserialization hook: restore the base text, diffs and default key
+	 * from $mCompressed and freeze the object against further addItem() calls.
+	 */
+	function __wakeup() {
+		// addItem() doesn't work if mItems is partially filled from mDiffs
+		$this->mFrozen = true;
+		$info = unserialize( gzinflate( $this->mCompressed ) );
+		unset( $this->mCompressed );
+
+		if ( !$info ) {
+			// Empty object
+			return;
+		}
+
+		if ( isset( $info['default'] ) ) {
+			$this->mDefaultKey = $info['default'];
+		}
+		$this->mItems[0] = $info['base'];
+		$this->mDiffs = $info['diffs'];
+	}
+}
--- /dev/null
+<?php
+
+/**
+ * Measure the compression ratio and the (de)compression time of a HistoryBlob
+ * implementation, using the stored revisions of a single page as test data.
+ */
+
+$optionsWithArgs = array( 'start', 'limit', 'type' );
+require( dirname(__FILE__).'/../commandLine.inc' );
+
+if ( !isset( $args[0] ) ) {
+	echo "Usage: php testCompression.php [--type=<type>] [--start=<start-date>] [--limit=<num-revs>] <page-title>\n";
+	exit( 1 );
+}
+
+$title = Title::newFromText( $args[0] );
+if ( !$title ) {
+	// No Title object was produced; stop before calling methods on it
+	echo "Invalid page title: {$args[0]}\n";
+	exit( 1 );
+}
+
+if ( isset( $options['start'] ) ) {
+	$startUnix = strtotime( $options['start'] );
+	if ( $startUnix === false ) {
+		echo "Unable to parse start date: {$options['start']}\n";
+		exit( 1 );
+	}
+	$start = wfTimestamp( TS_MW, $startUnix );
+	echo "Starting from " . $wgLang->timeanddate( $start ) . "\n";
+} else {
+	$start = '19700101000000';
+}
+$limit = isset( $options['limit'] ) ? $options['limit'] : 10;
+$type = isset( $options['type'] ) ? $options['type'] : 'ConcatenatedGzipHistoryBlob';
+if ( !class_exists( $type ) ) {
+	// Fail with a readable message rather than a fatal error at "new $type"
+	echo "Unknown blob class: $type\n";
+	exit( 1 );
+}
+
+
+$dbr = wfGetDB( DB_SLAVE );
+$res = $dbr->select(
+	array( 'page', 'revision', 'text' ),
+	'*',
+	array(
+		'page_namespace' => $title->getNamespace(),
+		'page_title' => $title->getDBkey(),
+		'page_id=rev_page',
+		'rev_timestamp > ' . $dbr->addQuotes( $dbr->timestamp( $start ) ),
+		'rev_text_id=old_id'
+	), __FILE__, array( 'LIMIT' => $limit )
+);
+
+// Compression pass: add every revision text to the blob and serialize it.
+// $t starts at minus the current time so that adding the end time yields
+// the elapsed time.
+$blob = new $type;
+$hashes = array();
+$keys = array();
+$uncompressedSize = 0;
+$t = -microtime( true );
+foreach ( $res as $row ) {
+	$revision = new Revision( $row );
+	$text = $revision->getText();
+	$uncompressedSize += strlen( $text );
+	$hashes[$row->rev_id] = md5( $text );
+	$keys[$row->rev_id] = $blob->addItem( $text );
+}
+
+$serialized = serialize( $blob );
+$t += microtime( true );
+
+printf( "Compression ratio for %d revisions: %5.2f, %s -> %s\n",
+	$res->numRows(),
+	$uncompressedSize / strlen( $serialized ),
+	$wgLang->formatSize( $uncompressedSize ),
+	$wgLang->formatSize( strlen( $serialized ) )
+);
+printf( "Compression time: %5.2f ms\n", $t * 1000 );
+
+// Decompression pass: unserialize the blob and check that every item comes
+// back with the same content hash it had when it was added
+$t = -microtime( true );
+$blob = unserialize( $serialized );
+foreach ( $keys as $id => $key ) {
+	$text = $blob->getItem( $key );
+	if ( md5( $text ) != $hashes[$id] ) {
+		echo "Content hash mismatch for rev_id $id\n";
+		#var_dump( $text );
+	}
+}
+$t += microtime( true );
+printf( "Decompression time: %5.2f ms\n", $t * 1000 );
+