# as pages are saved if $wgCompressRevisions is set.
$text = gzinflate( $text );
}
-
+
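+ # Text stored with the 'object' flag is a serialized PHP object whose getText()
+ # method returns the revision text; in this patch that object is either a
+ # ConcatenatedGzipHistoryBlob or a HistoryBlobStub pointing into one.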
+ if( in_array( 'object', $flags ) ) {
+ # Generic compressed storage
+ $obj = unserialize( $text );
+
+ # Bugger, corrupted my test database by double-serializing
+ if ( !is_object( $obj ) ) {
+ $obj = unserialize( $obj );
+ }
+
+ $text = $obj->getText();
+ }
+
global $wgLegacyEncoding;
if( $wgLegacyEncoding && !in_array( 'utf-8', $flags ) ) {
# Old revisions kept around in a legacy encoding?
global $wgInputEncoding, $wgContLang;
$text = $wgContLang->iconv( $wgLegacyEncoding, $wgInputEncoding, $text );
}
-
- if( in_array( 'link', $flags ) ) {
- # Handle link type
- $text = Article::followLink( $text );
- }
return $text;
}
return implode( ',', $flags );
}
- /**
- * Returns the text associated with a "link" type old table row
- * @static
- * @param mixed $link
- * @return string $text|false
- */
- function followLink( $link ) {
- # Split the link into fields and values
- $lines = explode( '\n', $link );
- $hash = '';
- $locations = array();
- foreach ( $lines as $line ) {
- # Comments
- if ( $line{0} == '#' ) {
- continue;
- }
- # Field/value pairs
- if ( preg_match( '/^(.*?)\s*:\s*(.*)$/', $line, $matches ) ) {
- $field = strtolower($matches[1]);
- $value = $matches[2];
- if ( $field == 'hash' ) {
- $hash = $value;
- } elseif ( $field == 'location' ) {
- $locations[] = $value;
- }
- }
- }
-
- if ( $hash === '' ) {
- return false;
- }
-
- # Look in each specified location for the text
- $text = false;
- foreach ( $locations as $location ) {
- $text = Article::fetchFromLocation( $location, $hash );
- if ( $text !== false ) {
- break;
- }
- }
-
- return $text;
- }
-
- /**
- * @static
- * @param $location
- * @param $hash
- */
- function fetchFromLocation( $location, $hash ) {
- global $wgLoadBalancer;
- $fname = 'fetchFromLocation';
- wfProfileIn( $fname );
-
- $p = strpos( $location, ':' );
- if ( $p === false ) {
- wfProfileOut( $fname );
- return false;
- }
-
- $type = substr( $location, 0, $p );
- $text = false;
- switch ( $type ) {
- case 'mysql':
- # MySQL locations are specified by mysql://<machineID>/<dbname>/<tblname>/<index>
- # Machine ID 0 is the current connection
- if ( preg_match( '/^mysql:\/\/(\d+)\/([A-Za-z_]+)\/([A-Za-z_]+)\/([A-Za-z_]+)$/',
- $location, $matches ) ) {
- $machineID = $matches[1];
- $dbName = $matches[2];
- $tblName = $matches[3];
- $index = $matches[4];
- if ( $machineID == 0 ) {
- # Current connection
- $db =& $this->getDB();
- } else {
- # Alternate connection
- $db =& $wgLoadBalancer->getConnection( $machineID );
-
- if ( array_key_exists( $machineId, $wgKnownMysqlServers ) ) {
- # Try to open, return false on failure
- $params = $wgKnownDBServers[$machineId];
- $db = Database::newFromParams( $params['server'], $params['user'], $params['password'],
- $dbName, 1, DBO_IGNORE );
- }
- }
- if ( $db->isOpen() ) {
- $index = $db->strencode( $index );
- $res = $db->query( "SELECT blob_data FROM $dbName.$tblName " .
- "WHERE blob_index='$index' " . $this->getSelectOptions(), $fname );
- $row = $db->fetchObject( $res );
- $text = $row->text_data;
- }
- }
- break;
- case 'file':
- # File locations are of the form file://<filename>, relative to the current directory
- if ( preg_match( '/^file:\/\/(.*)$', $location, $matches ) )
- $filename = strstr( $location, 'file://' );
- $text = @file_get_contents( $matches[1] );
- }
- if ( $text !== false ) {
- # Got text, now we need to interpret it
- # The first line contains information about how to do this
- $p = strpos( $text, '\n' );
- $type = substr( $text, 0, $p );
- $text = substr( $text, $p + 1 );
- switch ( $type ) {
- case 'plain':
- break;
- case 'gzip':
- $text = gzinflate( $text );
- break;
- case 'object':
- $object = unserialize( $text );
- $text = $object->getItem( $hash );
- break;
- default:
- $text = false;
- }
- }
- wfProfileOut( $fname );
- return $text;
- }
-
/**
* Note that getContent/loadContent may follow redirects if
* not told otherwise, and so may cause a change to mTitle.
*/
class HistoryBlob
{
- function setMeta() {}
+ # setMeta and getMeta currently aren't used for anything; I just thought they might be useful in the future
+ # The meta value is a single string
+ function setMeta( $meta ) {}
+
+ # Gets the meta-value
function getMeta() {}
+
+ # Adds an item of text; returns a stub object which points to the item
+ # You must call setLocation() on the stub object before storing it to the database
function addItem() {}
- function getItem() {}
+
+ # Get item by hash
+ function getItem( $hash ) {}
+
+ # Set the "default text"
+ # This concept is an odd property of the current DB schema, whereby each text item has a revision
+ # associated with it. The default text is the text of the associated revision. There may, however,
+ # be other revisions in the same object
+ function setText() {}
+
+ # Get default text. This is called from Article::getRevisionText()
+ function getText() {}
}
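+
+# Rough usage sketch (not part of the patch itself): how a compression job drives
+# this interface, following the compressWithConcat() job below. Variable names here
+# are illustrative only.
+#
+#   $blob = new ConcatenatedGzipHistoryBlob();
+#   $blob->setText( $firstRevText );              # becomes the "default text"
+#   $stub = $blob->addItem( $laterRevText );      # returns a HistoryBlobStub
+#   $stub->setLocation( $primaryOldId );          # old_id of the row holding $blob
+#   # serialize( $blob ) and serialize( $stub ) are then written to old_text
+#   # with old_flags = 'object'
+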
/**
* The real object
* @package MediaWiki
*/
-class ConcatenatedGzipHistoryBlob
+class ConcatenatedGzipHistoryBlob extends HistoryBlob
{
- /* private */ var $mVersion = 0, $mCompressed = false, $mItems = array();
+ /* private */ var $mVersion = 0, $mCompressed = false, $mItems = array(), $mDefaultHash = '';
+ /* private */ var $mFast = 0, $mSize = 0;
- function HistoryBlob() {
+ function ConcatenatedGzipHistoryBlob() {
if ( !function_exists( 'gzdeflate' ) ) {
die( "Need zlib support to read or write this kind of history object (ConcatenatedGzipHistoryBlob)\n" );
}
function addItem( $text ) {
$this->uncompress();
- $this->mItems[md5($text)] = $text;
+ $hash = md5( $text );
+ $this->mItems[$hash] = $text;
+ $this->mSize += strlen( $text );
+
+ $stub = new HistoryBlobStub( $hash );
+ return $stub;
}
function getItem( $hash ) {
- $this->compress();
- return $this->mItems[$hash];
+ $this->uncompress();
+ if ( array_key_exists( $hash, $this->mItems ) ) {
+ return $this->mItems[$hash];
+ } else {
+ return false;
+ }
}
+ function removeItem( $hash ) {
+ $this->mSize -= strlen( $this->mItems[$hash] );
+ unset( $this->mItems[$hash] );
+ }
+
function compress() {
if ( !$this->mCompressed ) {
$this->mItems = gzdeflate( serialize( $this->mItems ) );
function uncompress() {
if ( $this->mCompressed ) {
$this->mItems = unserialize( gzinflate( $this->mItems ) );
+ $this->mCompressed = false;
}
}
+ function getText() {
+ $this->uncompress();
+ return $this->getItem( $this->mDefaultHash );
+ }
+
+ function setText( $text ) {
+ $this->uncompress();
+ $stub = $this->addItem( $text );
+ $this->mDefaultHash = $stub->mHash;
+ }
+
function __sleep() {
- compress();
+ $this->compress();
+ return array( 'mVersion', 'mCompressed', 'mItems', 'mDefaultHash' );
}
function __wakeup() {
- uncompress();
+ $this->uncompress();
+ }
+
+ # Determines whether this object is "happy": that is, whether it can accept more
+ # revisions before the compressed chunk grows too large relative to the average
+ # uncompressed revision size
+ function isHappy( $maxFactor, $factorThreshold ) {
+ if ( count( $this->mItems ) == 0 ) {
+ return true;
+ }
+ if ( !$this->mFast ) {
+ $this->uncompress();
+ $record = serialize( $this->mItems );
+ $size = strlen( $record );
+ $avgUncompressed = $size / count( $this->mItems );
+ $compressed = strlen( gzdeflate( $record ) );
+
+ if ( $compressed < $factorThreshold * 1024 ) {
+ return true;
+ } else {
+ return $compressed < $avgUncompressed * $maxFactor;
+ }
+ } else {
+ return count( $this->mItems ) <= 10;
+ }
+ }
+}
+
+class HistoryBlobStub
+{
+ var $mOldId, $mHash;
+
+ function HistoryBlobStub( $hash = '', $oldid = 0 ) {
+ $this->mHash = $hash;
+ }
+
+ # Sets the location (old_id) of the main object to which this object points
+ function setLocation( $id ) {
+ $this->mOldId = $id;
+ }
+
+ function getText() {
+ $dbr =& wfGetDB( DB_SLAVE );
+ $row = $dbr->selectRow( 'old', array( 'old_flags', 'old_text' ), array( 'old_id' => $this->mOldId ) );
+ if ( !$row || $row->old_flags != 'object' ) {
+ return false;
+ }
+ $obj = unserialize( $row->old_text );
+ if ( !is_object( $obj ) ) {
+ $obj = unserialize( $obj );
+ }
+ return $obj->getItem( $this->mHash );
+ }
+
+ function getHash() {
+ return $this->mHash;
}
}
?>
require_once( 'ParserCache.php' );
require_once( 'WebRequest.php' );
require_once( 'LoadBalancer.php' );
+require_once( 'HistoryBlob.php' );
$wgRequest = new WebRequest();
return true;
}
+define( 'LS_INDIVIDUAL', 0 );
+define( 'LS_CHUNKED', 1 );
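+# LS_INDIVIDUAL re-reads each revision's text with a separate SELECT ... FOR UPDATE;
+# LS_CHUNKED fetches old_flags and old_text together with the initial revision list.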
+
+function compressWithConcat( $startId, $maxChunkSize, $maxChunkFactor, $factorThreshold, $beginDate, $endDate )
+{
+ $fname = 'compressWithConcat';
+ $loadStyle = LS_CHUNKED;
+
+ $dbw =& wfGetDB( DB_MASTER );
+
+ # First get a list of all pages
+ $pageRes = $dbw->select( 'cur', array('cur_namespace', 'cur_title'), false, $fname );
+
+ # For each of those, get a list of revisions which fit the criteria
+ $conds = array();
+ if ( $beginDate ) {
+ $conds[] = "old_timestamp>'" . $beginDate . "'";
+ }
+ if ( $endDate ) {
+ $conds[] = "old_timestamp<'" . $endDate . "'";
+ }
+ if ( $startId ) {
+ $conds[] = 'old_id>=' . $startId;
+ }
+ if ( $loadStyle == LS_CHUNKED ) {
+ $fields = array( 'old_id', 'old_flags', 'old_text' );
+ $revLoadOptions = 'FOR UPDATE';
+ } else {
+ $fields = array( 'old_id' );
+ $revLoadOptions = array();
+ }
+
+ while ( $pageRow = $dbw->fetchObject( $pageRes ) ) {
+ # Display progress
+ $titleObj = Title::makeTitle( $pageRow->cur_namespace, $pageRow->cur_title );
+ print $titleObj->getPrefixedDBkey() . " ";
+
+ # Load revisions
+ $revRes = $dbw->select( 'old', $fields,
+ array( 'old_namespace' => $pageRow->cur_namespace, 'old_title' => $pageRow->cur_title ) + $conds,
+ $fname,
+ $revLoadOptions
+ );
+ $revs = array();
+ while ( $revRow = $dbw->fetchObject( $revRes ) ) {
+ $revs[] = $revRow;
+ }
+
+ if ( count( $revs ) < 2 ) {
+ # No revisions matching, no further processing
+ print "\n";
+ continue;
+ }
+
+ # For each chunk
+ $i = 0;
+ while ( $i < count( $revs ) ) {
+ if ( $i < count( $revs ) - $maxChunkSize ) {
+ $thisChunkSize = $maxChunkSize;
+ } else {
+ $thisChunkSize = count( $revs ) - $i;
+ }
+
+ $chunk = new ConcatenatedGzipHistoryBlob();
+ $stubs = array();
+ $dbw->begin();
+ $usedChunk = false;
+ $primaryOldid = $revs[$i]->old_id;
+
+ # Get the text of each revision and add it to the object
+ for ( $j = 0; $j < $thisChunkSize && $chunk->isHappy( $maxChunkFactor, $factorThreshold ); $j++ ) {
+ $oldid = $revs[$i + $j]->old_id;
+
+ # Get text
+ if ( $loadStyle == LS_INDIVIDUAL ) {
+ $textRow = $dbw->selectRow( 'old',
+ array( 'old_flags', 'old_text' ),
+ array( 'old_id' => $oldid ),
+ $fname,
+ 'FOR UPDATE'
+ );
+ $text = Article::getRevisionText( $textRow );
+ } else {
+ $text = Article::getRevisionText( $revs[$i + $j] );
+ }
+
+ if ( $text === false ) {
+ print "\nError, unable to get text in old_id $oldid\n";
+ #$dbw->delete( 'old', array( 'old_id' => $oldid ) );
+ }
+
+ if ( $j == 0 ) {
+ $chunk->setText( $text );
+ print '.';
+ } else {
+ # Don't make a stub if it's going to be longer than the article
+ # Stubs are typically about 100 bytes
+ if ( strlen( $text ) < 120 ) {
+ $stub = false;
+ print 'x';
+ } else {
+ $stub = $chunk->addItem( $text );
+ $stub->setLocation( $primaryOldid );
+ $hash = $stub->getHash();
+ $stub = serialize( $stub );
+ print '.';
+ $usedChunk = true;
+ }
+ $stubs[$j] = $stub;
+ }
+ }
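+ # isHappy() can end the loop early, so $j is the number of revisions actually
+ # added to this chunk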
+ $thisChunkSize = $j;
+
+ # If we couldn't actually use any stubs because the pages were too small, do nothing
+ if ( $usedChunk ) {
+ # Store the main object
+ $dbw->update( 'old',
+ array( /* SET */
+ 'old_text' => serialize( $chunk ),
+ 'old_flags' => 'object',
+ ), array( /* WHERE */
+ 'old_id' => $primaryOldid
+ )
+ );
+
+ # Store the stub objects
+ for ( $j = 1; $j < $thisChunkSize; $j++ ) {
+ # Skip if not compressing
+ if ( $stubs[$j] !== false ) {
+ $dbw->update( 'old',
+ array( /* SET */
+ 'old_text' => $stubs[$j],
+ 'old_flags' => 'object',
+ ), array( /* WHERE */
+ 'old_id' => $revs[$i + $j]->old_id
+ )
+ );
+ }
+ }
+ }
+ # Done, next
+ print "/";
+ $dbw->commit();
+ $i += $thisChunkSize;
+ }
+ print "\n";
+ }
+ return true;
+}
?>
*/
/** */
+
+/**
+ * Usage:
+ *
+ * Non-wikimedia
+ * php compressOld.php [-t <type>] [-c <chunk-size>] [-b <begin-date>] [-e <end-date>] [-s <start-id>]
+ *
+ * Wikimedia
+ * php compressOld.php <database> [-t <type>] [-c <chunk-size>] [-b <begin-date>] [-e <end-date>] [-s <start-id>]
+ * [-f <max-factor>] [-h <factor-threshold>]
+ *
+ * <type> is either:
+ * gzip: compress revisions independently
+ * concat: concatenate revisions and compress in chunks (default)
+ *
+ * <start-id> is the old_id to start from
+ *
+ * The following options apply only to the concat type:
+ * <begin-date> is the earliest date to check for uncompressed revisions
+ * <end-date> is the latest revision date to compress
+ * <chunk-size> is the maximum number of revisions in a concat chunk
+ * <max-factor> is the maximum ratio of compressed chunk bytes to uncompressed avg. revision bytes
+ * <factor-threshold> is the minimum compressed chunk size, in KB, above which <max-factor> cuts in
+ *
+ */
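+
+# Example invocation (illustrative values only; the date format is assumed to be the
+# 14-digit timestamp form used in old_timestamp):
+#
+#   php compressOld.php -t concat -c 20 -f 3 -h 100 -b 20040101000000 -e 20040901000000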
+
+$optionsWithArgs = array( 't', 'c', 's', 'f', 'h', 'b', 'e' );
require_once( "commandLine.inc" );
require_once( "compressOld.inc" );
die();
}
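+
+# Option defaults: t = type, c = chunk size, s = start old_id, f = max chunk factor,
+# h = factor threshold (KB), b/e = begin/end dates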
+$defaults = array(
+ 't' => 'concat',
+ 'c' => 20,
+ 's' => 0,
+ 'f' => 3,
+ 'h' => 100,
+ 'b' => '',
+ 'e' => '',
+);
+
+$args = $args + $defaults;
+
+if ( $args['t'] != 'concat' && $args['t'] != 'gzip' ) {
+ print "Type \"{$args['t']}\" not supported\n";
+ exit();
+}
+
print "Depending on the size of your database this may take a while!\n";
print "If you abort the script while it's running it shouldn't harm anything,\n";
print "but if you haven't backed up your data, you SHOULD abort now!\n\n";
print "Press control-c to abort first (will proceed automatically in 5 seconds)\n";
-sleep(5);
+#sleep(5);
+
+$success = true;
+if ( $args['t'] == 'concat' ) {
+ $success = compressWithConcat( $args['s'], $args['c'], $args['f'], $args['h'], $args['b'], $args['e'] );
+} else {
+ compressOldPages( $args['s'] );
+}
-$n = 0;
-if( !empty( $argv[1] ) ) {
- $n = intval( $argv[1] );
+if ( $success ) {
+ print "Done.\n";
}
-compressOldPages( $n );
-print "Done.\n";
exit();
?>