From: Tim Starling
Date: Sat, 30 Oct 2004 14:39:40 +0000 (+0000)
Subject: Backporting concatenated gzip history compression from SCHEMA_WORK. Also made a few...
X-Git-Tag: 1.5.0alpha1~1413
X-Git-Url: https://git.cyclocoop.org/%242?a=commitdiff_plain;h=7d1442e76f6cf626e746764c9e1737030db6c277;p=lhc%2Fweb%2Fwiklou.git

Backporting concatenated gzip history compression from SCHEMA_WORK. Also made
a few tweaks to the compression script, mainly to make it faster for the
purposes of a testing sequence I'm currently running on it. Will report to
wikitech-l once testing is done.
---

diff --git a/includes/Article.php b/includes/Article.php
index e319524b33..0f18585d0e 100644
--- a/includes/Article.php
+++ b/includes/Article.php
@@ -91,7 +91,19 @@ class Article {
             # as pages are saved if $wgCompressRevisions is set.
             $text = gzinflate( $text );
         }
-
+
+        if( in_array( 'object', $flags ) ) {
+            # Generic compressed storage
+            $obj = unserialize( $text );
+
+            # Bugger, corrupted my test database by double-serializing
+            if ( !is_object( $obj ) ) {
+                $obj = unserialize( $obj );
+            }
+
+            $text = $obj->getText();
+        }
+
         global $wgLegacyEncoding;
         if( $wgLegacyEncoding && !in_array( 'utf-8', $flags ) ) {
             # Old revisions kept around in a legacy encoding?
@@ -99,11 +111,6 @@ class Article {
             global $wgInputEncoding, $wgContLang;
             $text = $wgContLang->iconv( $wgLegacyEncoding, $wgInputEncoding, $text );
         }
-
-        if( in_array( 'link', $flags ) ) {
-            # Handle link type
-            $text = Article::followLink( $text );
-        }
         return $text;
     }
 
@@ -137,131 +144,6 @@ class Article {
         return implode( ',', $flags );
     }
 
-    /**
-     * Returns the text associated with a "link" type old table row
-     * @static
-     * @param mixed $link
-     * @return string $text|false
-     */
-    function followLink( $link ) {
-        # Split the link into fields and values
-        $lines = explode( '\n', $link );
-        $hash = '';
-        $locations = array();
-        foreach ( $lines as $line ) {
-            # Comments
-            if ( $line{0} == '#' ) {
-                continue;
-            }
-            # Field/value pairs
-            if ( preg_match( '/^(.*?)\s*:\s*(.*)$/', $line, $matches ) ) {
-                $field = strtolower($matches[1]);
-                $value = $matches[2];
-                if ( $field == 'hash' ) {
-                    $hash = $value;
-                } elseif ( $field == 'location' ) {
-                    $locations[] = $value;
-                }
-            }
-        }
-
-        if ( $hash === '' ) {
-            return false;
-        }
-
-        # Look in each specified location for the text
-        $text = false;
-        foreach ( $locations as $location ) {
-            $text = Article::fetchFromLocation( $location, $hash );
-            if ( $text !== false ) {
-                break;
-            }
-        }
-
-        return $text;
-    }
-
-    /**
-     * @static
-     * @param $location
-     * @param $hash
-     */
-    function fetchFromLocation( $location, $hash ) {
-        global $wgLoadBalancer;
-        $fname = 'fetchFromLocation';
-        wfProfileIn( $fname );
-
-        $p = strpos( $location, ':' );
-        if ( $p === false ) {
-            wfProfileOut( $fname );
-            return false;
-        }
-
-        $type = substr( $location, 0, $p );
-        $text = false;
-        switch ( $type ) {
-            case 'mysql':
-                # MySQL locations are specified by mysql://<machine id>/<database>/<table>/<index>
-                # Machine ID 0 is the current connection
-                if ( preg_match( '/^mysql:\/\/(\d+)\/([A-Za-z_]+)\/([A-Za-z_]+)\/([A-Za-z_]+)$/',
-                  $location, $matches ) ) {
-                    $machineID = $matches[1];
-                    $dbName = $matches[2];
-                    $tblName = $matches[3];
-                    $index = $matches[4];
-                    if ( $machineID == 0 ) {
-                        # Current connection
-                        $db =& $this->getDB();
-                    } else {
-                        # Alternate connection
-                        $db =& $wgLoadBalancer->getConnection( $machineID );
-
-                        if ( array_key_exists( $machineId, $wgKnownMysqlServers ) ) {
-                            # Try to open, return false on failure
-                            $params = $wgKnownDBServers[$machineId];
-                            $db = Database::newFromParams( $params['server'], $params['user'], $params['password'],
-                                $dbName, 1, DBO_IGNORE );
-                        }
-                    }
-                    if ( $db->isOpen() ) {
-                        $index = $db->strencode( $index );
-                        $res = $db->query( "SELECT blob_data FROM $dbName.$tblName " .
-                            "WHERE blob_index='$index' " . $this->getSelectOptions(), $fname );
-                        $row = $db->fetchObject( $res );
-                        $text = $row->text_data;
-                    }
-                }
-                break;
-            case 'file':
-                # File locations are of the form file://<filename>, relative to the current directory
-                if ( preg_match( '/^file:\/\/(.*)$', $location, $matches ) )
-                    $filename = strstr( $location, 'file://' );
-                    $text = @file_get_contents( $matches[1] );
-        }
-        if ( $text !== false ) {
-            # Got text, now we need to interpret it
-            # The first line contains information about how to do this
-            $p = strpos( $text, '\n' );
-            $type = substr( $text, 0, $p );
-            $text = substr( $text, $p + 1 );
-            switch ( $type ) {
-                case 'plain':
-                    break;
-                case 'gzip':
-                    $text = gzinflate( $text );
-                    break;
-                case 'object':
-                    $object = unserialize( $text );
-                    $text = $object->getItem( $hash );
-                    break;
-                default:
-                    $text = false;
-            }
-        }
-        wfProfileOut( $fname );
-        return $text;
-    }
-
     /**
      * Note that getContent/loadContent may follow redirects if
      * not told otherwise, and so may cause a change to mTitle.
diff --git a/includes/HistoryBlob.php b/includes/HistoryBlob.php
index 0c44b34415..ebc47145d4 100644
--- a/includes/HistoryBlob.php
+++ b/includes/HistoryBlob.php
@@ -10,21 +10,40 @@
  */
 class HistoryBlob
 {
-    function setMeta() {}
+    # setMeta and getMeta currently aren't used for anything, I just thought they might be useful in the future
+    # The meta value is a single string
+    function setMeta( $meta ) {}
+
+    # Gets the meta-value
     function getMeta() {}
+
+    # Adds an item of text, returns a stub object which points to the item
+    # You must call setLocation() on the stub object before storing it to the database
     function addItem() {}
-    function getItem() {}
+
+    # Get item by hash
+    function getItem( $hash ) {}
+
+    # Set the "default text"
+    # This concept is an odd property of the current DB schema, whereby each text item has a revision
+    # associated with it. The default text is the text of the associated revision. There may, however,
+    # be other revisions in the same object
+    function setText() {}
+
+    # Get default text. This is called from Article::getRevisionText()
+    function getText() {}
 }
 
 /**
  * The real object
  * @package MediaWiki
  */
-class ConcatenatedGzipHistoryBlob
+class ConcatenatedGzipHistoryBlob extends HistoryBlob
 {
-    /* private */ var $mVersion = 0, $mCompressed = false, $mItems = array();
+    /* private */ var $mVersion = 0, $mCompressed = false, $mItems = array(), $mDefaultHash = '';
+    /* private */ var $mFast = 0, $mSize = 0;
 
-    function HistoryBlob() {
+    function ConcatenatedGzipHistoryBlob() {
         if ( !function_exists( 'gzdeflate' ) ) {
             die( "Need zlib support to read or write this kind of history object (ConcatenatedGzipHistoryBlob)\n" );
         }
@@ -42,14 +61,28 @@ class ConcatenatedGzipHistoryBlob
 
     function addItem( $text ) {
         $this->uncompress();
-        $this->mItems[md5($text)] = $text;
+        $hash = md5( $text );
+        $this->mItems[$hash] = $text;
+        $this->mSize += strlen( $text );
+
+        $stub = new HistoryBlobStub( $hash );
+        return $stub;
     }
 
     function getItem( $hash ) {
-        $this->compress();
-        return $this->mItems[$hash];
+        $this->uncompress();
+        if ( array_key_exists( $hash, $this->mItems ) ) {
+            return $this->mItems[$hash];
+        } else {
+            return false;
+        }
     }
 
+    function removeItem( $hash ) {
+        $this->mSize -= strlen( $this->mItems[$hash] );
+        unset( $this->mItems[$hash] );
+    }
+
     function compress() {
         if ( !$this->mCompressed ) {
             $this->mItems = gzdeflate( serialize( $this->mItems ) );
@@ -60,15 +93,81 @@ class ConcatenatedGzipHistoryBlob
     function uncompress() {
         if ( $this->mCompressed ) {
             $this->mItems = unserialize( gzinflate( $this->mItems ) );
+            $this->mCompressed = false;
         }
     }
 
+    function getText() {
+        $this->uncompress();
+        return $this->getItem( $this->mDefaultHash );
+    }
+
+    function setText( $text ) {
+        $this->uncompress();
+        $stub = $this->addItem( $text );
+        $this->mDefaultHash = $stub->mHash;
+    }
+
     function __sleep() {
-        compress();
+        $this->compress();
+        return array( 'mVersion', 'mCompressed', 'mItems', 'mDefaultHash' );
     }
 
     function __wakeup() {
-        uncompress();
+        $this->uncompress();
+    }
+
+    # Determines if this object is happy
+    function isHappy( $maxFactor, $factorThreshold ) {
+        if ( count( $this->mItems ) == 0 ) {
+            return true;
+        }
+        if ( $this->mFast ) {
+            $this->uncompress();
+            $record = serialize( $this->mItems );
+            $size = strlen( $record );
+            $avgUncompressed = $size / count( $this->mItems );
+            $compressed = strlen( gzdeflate( $record ) );
+
+            if ( $compressed < $factorThreshold * 1024 ) {
+                return true;
+            } else {
+                return $avgUncompressed * $maxFactor < $compressed;
+            }
+        } else {
+            return count( $this->mItems ) <= 10;
+        }
+    }
+}
+
+class HistoryBlobStub
+{
+    var $mOldId, $mHash;
+
+    function HistoryBlobStub( $hash = '', $oldid = 0 ) {
+        $this->mHash = $hash;
+    }
+
+    # Sets the location (old_id) of the main object to which this object points
+    function setLocation( $id ) {
+        $this->mOldId = $id;
+    }
+
+    function getText() {
+        $dbr =& wfGetDB( DB_SLAVE );
+        $row = $dbr->selectRow( 'old', array( 'old_flags', 'old_text' ), array( 'old_id' => $this->mOldId ) );
+        if ( !$row || $row->old_flags != 'object' ) {
+            return false;
+        }
+        $obj = unserialize( $row->old_text );
+        if ( !is_object( $obj ) ) {
+            $obj = unserialize( $obj );
+        }
+        return $obj->getItem( $this->mHash );
+    }
+
+    function getHash() {
+        return $this->mHash;
     }
 }
 ?>
diff --git a/includes/Setup.php b/includes/Setup.php
index ae39a584f7..c7e62cb263 100644
--- a/includes/Setup.php
+++ b/includes/Setup.php
@@ -74,6 +74,7 @@ require_once( 'Parser.php' );
 require_once( 'ParserCache.php' );
 require_once( 'WebRequest.php' );
 require_once( 'LoadBalancer.php' );
+require_once( 'HistoryBlob.php' );
 
 $wgRequest = new WebRequest();
 
diff --git a/maintenance/compressOld.inc b/maintenance/compressOld.inc
index c88396b823..d5159baf29 100644
--- a/maintenance/compressOld.inc
+++ b/maintenance/compressOld.inc
@@ -51,4 +51,153 @@ function compressPage( $row ) {
     return true;
 }
 
+define( 'LS_INDIVIDUAL', 0 );
+define( 'LS_CHUNKED', 1 );
+
+function compressWithConcat( $startId, $maxChunkSize, $maxChunkFactor, $factorThreshold, $beginDate, $endDate )
+{
+    $fname = 'compressWithConcat';
+    $loadStyle = LS_CHUNKED;
+
+    $dbw =& wfGetDB( DB_MASTER );
+
+    # First get a list of all pages
+    $pageRes = $dbw->select( 'cur', array('cur_namespace', 'cur_title'), false, $fname );
+
+    # For each of those, get a list of revisions which fit the criteria
+    $conds = array();
+    if ( $beginDate ) {
+        $conds[] = "old_timestamp>'" . $beginDate . "'";
+    }
+    if ( $endDate ) {
+        $conds[] = "old_timestamp<'" . $endDate . "'";
+    }
+    if ( $startId ) {
+        $conds[] = 'old_id>=' . $startId;
+    }
+    if ( $loadStyle == LS_CHUNKED ) {
+        $fields = array( 'old_id', 'old_flags', 'old_text' );
+        $revLoadOptions = 'FOR UPDATE';
+    } else {
+        $fields = array( 'old_id' );
+        $revLoadOptions = array();
+    }
+
+    while ( $pageRow = $dbw->fetchObject( $pageRes ) ) {
+        # Display progress
+        $titleObj = Title::makeTitle( $pageRow->cur_namespace, $pageRow->cur_title );
+        print $titleObj->getPrefixedDBkey() . " ";
+
+        # Load revisions
+        $revRes = $dbw->select( 'old', $fields,
+            array( 'old_namespace' => $pageRow->cur_namespace, 'old_title' => $pageRow->cur_title ) + $conds,
+            $fname,
+            $revLoadOptions
+        );
+        $revs = array();
+        while ( $revRow = $dbw->fetchObject( $revRes ) ) {
+            $revs[] = $revRow;
+        }
+
+        if ( count( $revs ) < 2) {
+            # No revisions matching, no further processing
+            print "\n";
+            continue;
+        }
+
+        # For each chunk
+        $i = 0;
+        while ( $i < count( $revs ) ) {
+            if ( $i < count( $revs ) - $maxChunkSize ) {
+                $thisChunkSize = $maxChunkSize;
+            } else {
+                $thisChunkSize = count( $revs ) - $i;
+            }
+
+            $chunk = new ConcatenatedGzipHistoryBlob();
+            $stubs = array();
+            $dbw->begin();
+            $usedChunk = false;
+            $primaryOldid = $revs[$i]->old_id;
+
+            # Get the text of each revision and add it to the object
+            for ( $j = 0; $j < $thisChunkSize && $chunk->isHappy( $maxChunkFactor, $factorThreshold ); $j++ ) {
+                $oldid = $revs[$i + $j]->old_id;
+
+                # Get text
+                if ( $loadStyle == LS_INDIVIDUAL ) {
+                    $textRow = $dbw->selectRow( 'old',
+                        array( 'old_flags', 'old_text' ),
+                        array( 'old_id' => $oldid ),
+                        $fname,
+                        'FOR UPDATE'
+                    );
+                    $text = Article::getRevisionText( $textRow );
+                } else {
+                    $text = Article::getRevisionText( $revs[$i + $j] );
+                }
+
+                if ( $text === false ) {
+                    print "\nError, unable to get text in old_id $oldid\n";
+                    #$dbw->delete( 'old', array( 'old_id' => $oldid ) );
+                }
+
+                if ( $j == 0 ) {
+                    $chunk->setText( $text );
+                    print '.';
+                } else {
+                    # Don't make a stub if it's going to be longer than the article
+                    # Stubs are typically about 100 bytes
+                    if ( strlen( $text ) < 120 ) {
+                        $stub = false;
+                        print 'x';
+                    } else {
+                        $stub = $chunk->addItem( $text );
+                        $stub->setLocation( $primaryOldid );
+                        $hash = $stub->getHash();
+                        $stub = serialize( $stub );
+                        print '.';
+                        $usedChunk = true;
+                    }
+                    $stubs[$j] = $stub;
+                }
+            }
+            $thisChunkSize = $j;
+
+            # If we couldn't actually use any stubs because the pages were too small, do nothing
+            if ( $usedChunk ) {
+                # Store the main object
+                $dbw->update( 'old',
+                    array( /* SET */
+                        'old_text' => serialize( $chunk ),
+                        'old_flags' => 'object',
+                    ), array( /* WHERE */
+                        'old_id' => $primaryOldid
+                    )
+                );
+
+                # Store the stub objects
+                for ( $j = 1; $j < $thisChunkSize; $j++ ) {
+                    # Skip if not compressing
+                    if ( $stubs[$j] !== false ) {
+                        $dbw->update( 'old',
+                            array( /* SET */
+                                'old_text' => $stubs[$j],
+                                'old_flags' => 'object',
+                            ), array( /* WHERE */
+                                'old_id' => $revs[$i + $j]->old_id
+                            )
+                        );
+                    }
+                }
+            }
+            # Done, next
+            print "/";
+            $dbw->commit();
+            $i += $thisChunkSize;
+        }
+        print "\n";
+    }
+    return true;
+}
 ?>
diff --git a/maintenance/compressOld.php b/maintenance/compressOld.php
index d1f3f066c6..865fcebd9c 100644
--- a/maintenance/compressOld.php
+++ b/maintenance/compressOld.php
@@ -7,6 +7,33 @@
  */
 
 /** */
+
+/**
+ * Usage:
+ *
+ * Non-wikimedia
+ * php compressOld.php [-t <type>] [-c <chunk-size>] [-b <begin-date>] [-e <end-date>] [-s <start-id>]
+ *
+ * Wikimedia
+ * php compressOld.php [-t <type>] [-c <chunk-size>] [-b <begin-date>] [-e <end-date>] [-s <start-id>]
+ *     [-f <max-factor>] [-h <factor-threshold>]
+ *
+ * <type> is either:
+ *   gzip: compress revisions independently
+ *   concat: concatenate revisions and compress in chunks (default)
+ *
+ * <start-id> is the old_id to start from
+ *
+ * The following options apply only to the concat type:
+ *    <begin-date> is the earliest date to check for uncompressed revisions
+ *    <end-date> is the latest revision date to compress
+ *    <chunk-size> is the maximum number of revisions in a concat chunk
+ *    <max-factor> is the maximum ratio of compressed chunk bytes to uncompressed avg. revision bytes
+ *    <factor-threshold> is a minimum number of KB, where <max-factor> cuts in
+ *
+ */
+
+$optionsWithArgs = array( 't', 'c', 's', 'f', 'h' );
 require_once( "commandLine.inc" );
 require_once( "compressOld.inc" );
 
@@ -16,19 +43,39 @@ if( !function_exists( "gzdeflate" ) ) {
     die();
 }
 
+$defaults = array(
+    't' => 'concat',
+    'c' => 20,
+    's' => 0,
+    'f' => 3,
+    'h' => 100,
+    'b' => '',
+    'e' => '',
+);
+
+$args = $args + $defaults;
+
+if ( $args['t'] != 'concat' && $args['t'] != 'gzip' ) {
+    print "Type \"{$args['t']}\" not supported\n";
+}
+
 print "Depending on the size of your database this may take a while!\n";
 print "If you abort the script while it's running it shouldn't harm anything,\n";
 print "but if you haven't backed up your data, you SHOULD abort now!\n\n";
 print "Press control-c to abort first (will proceed automatically in 5 seconds)\n";
-sleep(5);
+#sleep(5);
+
+$success = true;
+if ( $args['t'] == 'concat' ) {
+    $success = compressWithConcat( $args['s'], $args['c'], $args['f'], $args['h'], $args['b'], $args['e'] );
+} else {
+    compressOldPages( $args['s'] );
+}
 
-$n = 0;
-if( !empty( $argv[1] ) ) {
-    $n = intval( $argv[1] );
+if ( $success ) {
+    print "Done.\n";
 }
-compressOldPages( $n );
-print "Done.\n";
 
 exit();
 
 ?>
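
For reference, a minimal sketch of the round trip the new classes are meant to support. This is not part of the patch: the sample texts and the old_id value are made up, and the database writes/reads are only described in comments (in the real code path compressWithConcat() does the storing, and Article::getRevisionText() / HistoryBlobStub::getText() do the fetching).

    <?php
    // Illustrative sketch only. Assumes HistoryBlob.php is on the include path,
    // as arranged by the Setup.php change above.
    require_once( 'HistoryBlob.php' );

    $firstRevisionText  = "First revision of the page.";
    $secondRevisionText = str_repeat( "Second revision of the page. ", 10 );
    $primaryOldId       = 12345;  // hypothetical old_id of the row that will hold the main object

    // Writing: concatenate revisions into one object; gzip happens on serialize (__sleep).
    $blob = new ConcatenatedGzipHistoryBlob();
    $blob->setText( $firstRevisionText );            // default text, returned later by getText()
    $stub = $blob->addItem( $secondRevisionText );   // returns a HistoryBlobStub
    $stub->setLocation( $primaryOldId );             // point the stub at the main row

    // The main row would store serialize( $blob ) with old_flags='object';
    // the second revision's row would store serialize( $stub ), also with old_flags='object'.
    $mainRowText = serialize( $blob );
    $stubRowText = serialize( $stub );

    // Reading back (without the database): unserialize triggers __wakeup/uncompress,
    // and items are looked up by their md5 hash.
    $blob2 = unserialize( $mainRowText );
    echo $blob2->getText() . "\n";                    // first revision (the default text)
    echo $blob2->getItem( $stub->getHash() ) . "\n";  // second revision
    ?>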
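As a usage illustration, a plain invocation of the reworked script in concat mode, spelling out the defaults declared above (the values shown are just those defaults, not a recommendation):

    php compressOld.php -t concat -c 20 -f 3 -h 100 -s 0

The -b and -e date bounds can be added to restrict which old_timestamp ranges are considered for compression.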