Backporting concatenated gzip history compression from SCHEMA_WORK. Also made a few...
author: Tim Starling <tstarling@users.mediawiki.org>
Sat, 30 Oct 2004 14:39:40 +0000 (14:39 +0000)
committer: Tim Starling <tstarling@users.mediawiki.org>
Sat, 30 Oct 2004 14:39:40 +0000 (14:39 +0000)
includes/Article.php
includes/HistoryBlob.php
includes/Setup.php
maintenance/compressOld.inc
maintenance/compressOld.php

index e319524..0f18585 100644 (file)
@@ -91,7 +91,19 @@ class Article {
                        # as pages are saved if $wgCompressRevisions is set.
                        $text = gzinflate( $text );
                }
-               
+                       
+               if( in_array( 'object', $flags ) ) {
+                       # Generic compressed storage
+                       $obj = unserialize( $text );
+
+                       # Bugger, corrupted my test database by double-serializing
+                       if ( !is_object( $obj ) ) {
+                               $obj = unserialize( $obj );
+                       }
+
+                       $text = $obj->getText();
+               }
+       
                global $wgLegacyEncoding;
                if( $wgLegacyEncoding && !in_array( 'utf-8', $flags ) ) {
                        # Old revisions kept around in a legacy encoding?
@@ -99,11 +111,6 @@ class Article {
                        global $wgInputEncoding, $wgContLang;
                        $text = $wgContLang->iconv( $wgLegacyEncoding, $wgInputEncoding, $text );
                }
-               
-               if( in_array( 'link', $flags ) ) {
-                       # Handle link type
-                       $text = Article::followLink( $text );
-               }
                return $text;
        }
 
@@ -137,131 +144,6 @@ class Article {
                return implode( ',', $flags );
        }
 
-       /**
-        * Returns the text associated with a "link" type old table row
-        * @static
-        * @param mixed $link
-        * @return string $text|false
-        */
-       function followLink( $link ) {
-               # Split the link into fields and values
-               $lines = explode( '\n', $link );
-               $hash = '';
-               $locations = array();
-               foreach ( $lines as $line ) {
-                       # Comments
-                       if ( $line{0} == '#' ) {
-                               continue;
-                       }
-                       # Field/value pairs
-                       if ( preg_match( '/^(.*?)\s*:\s*(.*)$/', $line, $matches ) ) {
-                               $field = strtolower($matches[1]);
-                               $value = $matches[2];
-                               if ( $field == 'hash' ) {
-                                       $hash = $value;
-                               } elseif ( $field == 'location' ) {
-                                       $locations[] = $value;
-                               }
-                       }
-               }
-
-               if ( $hash === '' ) {
-                       return false;
-               }
-
-               # Look in each specified location for the text
-               $text = false;
-               foreach ( $locations as $location ) {
-                       $text = Article::fetchFromLocation( $location, $hash );
-                       if ( $text !== false ) {
-                               break;
-                       }
-               }
-
-               return $text;
-       }
-
-       /**
-        * @static
-        * @param $location
-        * @param $hash
-        */
-       function fetchFromLocation( $location, $hash ) {
-               global $wgLoadBalancer;
-               $fname = 'fetchFromLocation';
-               wfProfileIn( $fname );
-
-               $p = strpos( $location, ':' );
-               if ( $p === false ) {
-                       wfProfileOut( $fname );
-                       return false;
-               }
-
-               $type = substr( $location, 0, $p );
-               $text = false;
-               switch ( $type ) {
-                       case 'mysql':
-                               # MySQL locations are specified by mysql://<machineID>/<dbname>/<tblname>/<index>
-                               # Machine ID 0 is the current connection
-                               if ( preg_match( '/^mysql:\/\/(\d+)\/([A-Za-z_]+)\/([A-Za-z_]+)\/([A-Za-z_]+)$/',
-                                 $location, $matches ) ) {
-                                       $machineID = $matches[1];
-                                       $dbName = $matches[2];
-                                       $tblName = $matches[3];
-                                       $index = $matches[4];
-                                       if ( $machineID == 0 ) {
-                                               # Current connection
-                                               $db =& $this->getDB();
-                                       } else {
-                                               # Alternate connection
-                                               $db =& $wgLoadBalancer->getConnection( $machineID );
-
-                                               if ( array_key_exists( $machineId, $wgKnownMysqlServers ) ) {
-                                                       # Try to open, return false on failure
-                                                       $params = $wgKnownDBServers[$machineId];
-                                                       $db = Database::newFromParams( $params['server'], $params['user'], $params['password'],
-                                                               $dbName, 1, DBO_IGNORE );
-                                               }
-                                       }
-                                       if ( $db->isOpen() ) {
-                                               $index = $db->strencode( $index );
-                                               $res = $db->query( "SELECT blob_data FROM $dbName.$tblName " .
-                                                       "WHERE blob_index='$index' " . $this->getSelectOptions(), $fname );
-                                               $row = $db->fetchObject( $res );
-                                               $text = $row->text_data;
-                                       }
-                               }
-                               break;
-                       case 'file':
-                               # File locations are of the form file://<filename>, relative to the current directory
-                               if ( preg_match( '/^file:\/\/(.*)$', $location, $matches ) )
-                               $filename = strstr( $location, 'file://' );
-                               $text = @file_get_contents( $matches[1] );
-               }
-               if ( $text !== false ) {
-                       # Got text, now we need to interpret it
-                       # The first line contains information about how to do this
-                       $p = strpos( $text, '\n' );
-                       $type = substr( $text, 0, $p );
-                       $text = substr( $text, $p + 1 );
-                       switch ( $type ) {
-                               case 'plain':
-                                       break;
-                               case 'gzip':
-                                       $text = gzinflate( $text );
-                                       break;
-                               case 'object':
-                                       $object = unserialize( $text );
-                                       $text = $object->getItem( $hash );
-                                       break;
-                               default:
-                                       $text = false;
-                       }
-               }
-               wfProfileOut( $fname );
-               return $text;
-       }
-
        /**
         * Note that getContent/loadContent may follow redirects if
         * not told otherwise, and so may cause a change to mTitle.
index 0c44b34..ebc4714 100644 (file)
  */
 class HistoryBlob
 {
-       function setMeta() {}
+       # setMeta and getMeta currently aren't used for anything, I just thought they might be useful in the future
+       # The meta value is a single string
+       function setMeta( $meta ) {}
+
+       # Gets the meta-value
        function getMeta() {}
+
+       # Adds an item of text, returns a stub object which points to the item
+       # You must call setLocation() on the stub object before storing it to the database
        function addItem() {}
-       function getItem() {}
+
+       # Get item by hash
+       function getItem( $hash ) {}
+       
+       # Set the "default text"
+       # This concept is an odd property of the current DB schema, whereby each text item has a revision
+       # associated with it. The default text is the text of the associated revision. There may, however, 
+       # be other revisions in the same object
+       function setText() {}
+
+       # Get default text. This is called from Article::getRevisionText()
+       function getText() {}
 }
 
 /**
  * The real object
  * @package MediaWiki
  */
-class ConcatenatedGzipHistoryBlob
+class ConcatenatedGzipHistoryBlob extends HistoryBlob
 {
-       /* private */ var $mVersion = 0, $mCompressed = false, $mItems = array();
+       /* private */ var $mVersion = 0, $mCompressed = false, $mItems = array(), $mDefaultHash = '';
+       /* private */ var $mFast = 0, $mSize = 0;
 
-       function HistoryBlob() {
+       function ConcatenatedGzipHistoryBlob() {
                if ( !function_exists( 'gzdeflate' ) ) {
                        die( "Need zlib support to read or write this kind of history object (ConcatenatedGzipHistoryBlob)\n" );
                }
@@ -42,14 +61,28 @@ class ConcatenatedGzipHistoryBlob
        
        function addItem( $text ) {
                $this->uncompress();
-               $this->mItems[md5($text)] = $text;
+               $hash = md5( $text );
+               $this->mItems[$hash] = $text;
+               $this->mSize += strlen( $text );
+
+               $stub = new HistoryBlobStub( $hash );
+               return $stub;
        }
 
        function getItem( $hash ) {
-               $this->compress();
-               return $this->mItems[$hash];
+               $this->uncompress();
+               if ( array_key_exists( $hash, $this->mItems ) ) {
+                       return $this->mItems[$hash];
+               } else {
+                       return false;
+               }
        }
 
+       function removeItem( $hash ) {
+               $this->mSize -= strlen( $this->mItems[$hash] );
+               unset( $this->mItems[$hash] );
+       }
+       
        function compress() {
                if ( !$this->mCompressed  ) {
                        $this->mItems = gzdeflate( serialize( $this->mItems ) );
@@ -60,15 +93,81 @@ class ConcatenatedGzipHistoryBlob
        function uncompress() { 
                if ( $this->mCompressed ) {
                        $this->mItems = unserialize( gzinflate( $this->mItems ) );
+                       $this->mCompressed = false;
                }
        }
 
+       function getText() {
+               $this->uncompress();
+               return $this->getItem( $this->mDefaultHash );
+       }
+       
+       function setText( $text ) {
+               $this->uncompress();
+               $stub = $this->addItem( $text );
+               $this->mDefaultHash = $stub->mHash;
+       }
+
        function __sleep() {
-               compress();
+               $this->compress();
+               return array( 'mVersion', 'mCompressed', 'mItems', 'mDefaultHash' );
        }
 
        function __wakeup() {
-               uncompress();
+               $this->uncompress();
+       }
+
+       # Determines if this object is happy
+       function isHappy( $maxFactor, $factorThreshold ) {
+               if ( count( $this->mItems ) == 0 ) {
+                       return true;
+               }
+               if ( $this->mFast ) {
+                       $this->uncompress();
+                       $record = serialize( $this->mItems );
+                       $size = strlen( $record );
+                       $avgUncompressed = $size / count( $this->mItems );
+                       $compressed = strlen( gzdeflate( $record ) );
+
+                       if ( $compressed < $factorThreshold * 1024 ) {
+                               return true;
+                       } else {
+                               return $avgUncompressed * $maxFactor < $compressed;
+                       }
+               } else {
+                       return count( $this->mItems ) <= 10;
+               }
+       }
+}
+
+class HistoryBlobStub
+{
+       var $mOldId, $mHash;
+
+       function HistoryBlobStub( $hash = '', $oldid = 0 ) {
+               $this->mHash = $hash;
+       }
+       
+       # Sets the location (old_id) of the main object to which this object points
+       function setLocation( $id ) {
+               $this->mOldId = $id;
+       }
+       
+       function getText() {
+               $dbr =& wfGetDB( DB_SLAVE );
+               $row = $dbr->selectRow( 'old', array( 'old_flags', 'old_text' ), array( 'old_id' => $this->mOldId ) );
+               if ( !$row || $row->old_flags != 'object' ) {
+                       return false;
+               }
+               $obj = unserialize( $row->old_text );
+               if ( !is_object( $obj ) ) {
+                       $obj = unserialize( $obj );
+               }
+               return $obj->getItem( $this->mHash );
+       }
+
+       function getHash() {
+               return $this->mHash;
        }
 }
 ?>
index ae39a58..c7e62cb 100644 (file)
@@ -74,6 +74,7 @@ require_once( 'Parser.php' );
 require_once( 'ParserCache.php' );
 require_once( 'WebRequest.php' );
 require_once( 'LoadBalancer.php' );
+require_once( 'HistoryBlob.php' );
 
 $wgRequest = new WebRequest();
 
index c88396b..d5159ba 100644 (file)
@@ -51,4 +51,153 @@ function compressPage( $row ) {
        return true;
 }
 
+define( 'LS_INDIVIDUAL', 0 );
+define( 'LS_CHUNKED', 1 );
+
+function compressWithConcat( $startId, $maxChunkSize, $maxChunkFactor, $factorThreshold, $beginDate, $endDate )
+{
+       $fname = 'compressWithConcat';
+       $loadStyle = LS_CHUNKED;
+       
+       $dbw =& wfGetDB( DB_MASTER );
+
+       # First get a list of all pages
+       $pageRes = $dbw->select( 'cur', array('cur_namespace', 'cur_title'), false, $fname );
+
+       # For each of those, get a list of revisions which fit the criteria
+       $conds = array();
+       if ( $beginDate ) {
+               $conds[] = "old_timestamp>'" . $beginDate . "'";
+       } 
+       if ( $endDate )  {
+               $conds[] = "old_timestamp<'" . $endDate . "'";
+       }
+       if ( $startId ) {
+               $conds[] = 'old_id>=' . $startId;
+       }
+       if ( $loadStyle == LS_CHUNKED ) {
+               $fields = array( 'old_id', 'old_flags', 'old_text' );
+               $revLoadOptions = 'FOR UPDATE';
+       } else {
+               $fields = array( 'old_id' );
+               $revLoadOptions = array();
+       }
+
+       while ( $pageRow = $dbw->fetchObject( $pageRes ) ) {
+               # Display progress
+               $titleObj = Title::makeTitle( $pageRow->cur_namespace, $pageRow->cur_title );
+               print $titleObj->getPrefixedDBkey() . " ";
+
+               # Load revisions
+               $revRes = $dbw->select( 'old', $fields,
+                       array( 'old_namespace' => $pageRow->cur_namespace, 'old_title' => $pageRow->cur_title ) + $conds, 
+                       $fname,
+                       $revLoadOptions
+               );
+               $revs = array();
+               while ( $revRow = $dbw->fetchObject( $revRes ) ) {
+                       $revs[] = $revRow;
+               }
+               
+               if ( count( $revs ) < 2) {
+                       # No revisions matching, no further processing
+                       print "\n";
+                       continue;
+               }
+
+               # For each chunk
+               $i = 0;
+               while ( $i < count( $revs ) ) {
+                       if ( $i < count( $revs ) - $maxChunkSize ) {
+                               $thisChunkSize = $maxChunkSize;
+                       } else {
+                               $thisChunkSize = count( $revs ) - $i;
+                       }
+
+                       $chunk = new ConcatenatedGzipHistoryBlob();
+                       $stubs = array();
+                       $dbw->begin();
+                       $usedChunk = false;
+                       $primaryOldid = $revs[$i]->old_id;
+                       
+                       # Get the text of each revision and add it to the object
+                       for ( $j = 0; $j < $thisChunkSize && $chunk->isHappy( $maxChunkFactor, $factorThreshold ); $j++ ) {
+                               $oldid = $revs[$i + $j]->old_id;
+                               
+                               # Get text
+                               if ( $loadStyle == LS_INDIVIDUAL ) {
+                                       $textRow = $dbw->selectRow( 'old', 
+                                               array( 'old_flags', 'old_text' ),
+                                               array( 'old_id' => $oldid ),
+                                               $fname,
+                                               'FOR UPDATE'
+                                       );
+                                       $text = Article::getRevisionText( $textRow );
+                               } else {
+                                       $text = Article::getRevisionText( $revs[$i + $j] );
+                               }
+
+                               if ( $text === false ) {
+                                       print "\nError, unable to get text in old_id $oldid\n";
+                                       #$dbw->delete( 'old', array( 'old_id' => $oldid ) );
+                               }
+
+                               if ( $j == 0 ) {
+                                       $chunk->setText( $text );
+                                       print '.';
+                               } else {
+                                       # Don't make a stub if it's going to be longer than the article
+                                       # Stubs are typically about 100 bytes
+                                       if ( strlen( $text ) < 120 ) {
+                                               $stub = false;
+                                               print 'x';
+                                       } else {
+                                               $stub = $chunk->addItem( $text );
+                                               $stub->setLocation( $primaryOldid );
+                                               $hash = $stub->getHash();
+                                               $stub = serialize( $stub );
+                                               print '.';
+                                               $usedChunk = true;
+                                       }
+                                       $stubs[$j] = $stub;
+                               }
+                       }
+                       $thisChunkSize = $j;
+                       
+                       # If we couldn't actually use any stubs because the pages were too small, do nothing
+                       if ( $usedChunk ) {
+                               # Store the main object
+                               $dbw->update( 'old',
+                                       array( /* SET */
+                                               'old_text' => serialize( $chunk ),
+                                               'old_flags' => 'object',
+                                       ), array( /* WHERE */
+                                               'old_id' => $primaryOldid
+                                       )
+                               );
+
+                               # Store the stub objects
+                               for ( $j = 1; $j < $thisChunkSize; $j++ ) {
+                                       # Skip if not compressing
+                                       if ( $stubs[$j] !== false ) {
+                                               $dbw->update( 'old',
+                                                       array( /* SET */
+                                                               'old_text' => $stubs[$j],
+                                                               'old_flags' => 'object',
+                                                       ), array( /* WHERE */
+                                                               'old_id' => $revs[$i + $j]->old_id
+                                                       )
+                                               );
+                                       }
+                               }
+                       }
+                       # Done, next
+                       print "/";
+                       $dbw->commit();
+                       $i += $thisChunkSize;
+               }
+               print "\n";
+       }
+       return true;
+}
 ?>
index d1f3f06..865fceb 100644 (file)
@@ -7,6 +7,33 @@
  */
 
 /** */
+
+/**
+ * Usage: 
+ *
+ * Non-wikimedia
+ * php compressOld.php [-t <type>] [-c <chunk-size>] [-b <begin-date>] [-e <end-date>] [-s <start-id>]
+ *
+ * Wikimedia
+ * php compressOld.php <database> [-t <type>] [-c <chunk-size>] [-b <begin-date>] [-e <end-date>] [-s <start-id>]
+ *     [-f <max-factor>] [-h <factor-threshold>]
+ *
+ * <type> is either:
+ *   gzip: compress revisions independently
+ *   concat: concatenate revisions and compress in chunks (default)
+ * 
+ * <start-id> is the old_id to start from
+ * 
+ * The following options apply only to the concat type:
+ *    <begin-date> is the earliest date to check for uncompressed revisions
+ *    <end-date> is the latest revision date to compress
+ *    <chunk-size> is the maximum number of revisions in a concat chunk
+ *    <max-factor> is the maximum ratio of compressed chunk bytes to uncompressed avg. revision bytes
+ *    <factor-threshold> is a minimum number of KB, where <max-factor> cuts in
+ *
+ */
+$optionsWithArgs = array( 't', 'c', 's', 'f', 'h' );
 require_once( "commandLine.inc" );
 require_once( "compressOld.inc" );
 
@@ -16,19 +43,39 @@ if( !function_exists( "gzdeflate" ) ) {
        die();
 }
 
+$defaults = array( 
+       't' => 'concat',
+       'c' => 20,
+       's' => 0,
+       'f' => 3,
+       'h' => 100,
+       'b' => '',
+       'e' => '',
+);
+
+$args = $args + $defaults;
+
+if ( $args['t'] != 'concat' && $args['t'] != 'gzip' ) {
+       print "Type \"{$args['t']}\" not supported\n";
+}
+
 print "Depending on the size of your database this may take a while!\n";
 print "If you abort the script while it's running it shouldn't harm anything,\n";
 print "but if you haven't backed up your data, you SHOULD abort now!\n\n";
 print "Press control-c to abort first (will proceed automatically in 5 seconds)\n";
-sleep(5);
+#sleep(5);
+
+$success = true;
+if ( $args['t'] == 'concat' ) {
+       $success = compressWithConcat( $args['s'], $args['c'], $args['f'], $args['h'], $args['b'], $args['e'] );
+} else {
+       compressOldPages( $args['s'] );
+} 
 
-$n = 0;
-if( !empty( $argv[1] ) ) {
-       $n = intval( $argv[1] );
+if ( $success ) {
+       print "Done.\n";
 }
-compressOldPages( $n );
 
-print "Done.\n";
 exit();
 
 ?>