From: Tim Starling
Date: Sat, 30 Oct 2004 14:39:40 +0000 (+0000)
Subject: Backporting concatenated gzip history compression from SCHEMA_WORK. Also made a few...
X-Git-Tag: 1.5.0alpha1~1413
X-Git-Url: https://git.cyclocoop.org/%242?a=commitdiff_plain;h=7d1442e76f6cf626e746764c9e1737030db6c277;p=lhc%2Fweb%2Fwiklou.git

Backporting concatenated gzip history compression from SCHEMA_WORK. Also made
a few tweaks to the compression script, mainly to make it faster for the
purposes of a testing sequence I'm currently running on it. Will report to
wikitech-l once testing is done.
---

diff --git a/includes/Article.php b/includes/Article.php
index e319524b33..0f18585d0e 100644
--- a/includes/Article.php
+++ b/includes/Article.php
@@ -91,7 +91,19 @@ class Article {
             # as pages are saved if $wgCompressRevisions is set.
             $text = gzinflate( $text );
         }
-
+
+        if( in_array( 'object', $flags ) ) {
+            # Generic compressed storage
+            $obj = unserialize( $text );
+
+            # Bugger, corrupted my test database by double-serializing
+            if ( !is_object( $obj ) ) {
+                $obj = unserialize( $obj );
+            }
+
+            $text = $obj->getText();
+        }
+
         global $wgLegacyEncoding;
         if( $wgLegacyEncoding && !in_array( 'utf-8', $flags ) ) {
             # Old revisions kept around in a legacy encoding?
@@ -99,11 +111,6 @@ class Article {
             global $wgInputEncoding, $wgContLang;
             $text = $wgContLang->iconv( $wgLegacyEncoding, $wgInputEncoding, $text );
         }
-
-        if( in_array( 'link', $flags ) ) {
-            # Handle link type
-            $text = Article::followLink( $text );
-        }
         return $text;
     }
 
@@ -137,131 +144,6 @@ class Article {
         return implode( ',', $flags );
     }
 
-    /**
-     * Returns the text associated with a "link" type old table row
-     * @static
-     * @param mixed $link
-     * @return string $text|false
-     */
-    function followLink( $link ) {
-        # Split the link into fields and values
-        $lines = explode( '\n', $link );
-        $hash = '';
-        $locations = array();
-        foreach ( $lines as $line ) {
-            # Comments
-            if ( $line{0} == '#' ) {
-                continue;
-            }
-            # Field/value pairs
-            if ( preg_match( '/^(.*?)\s*:\s*(.*)$/', $line, $matches ) ) {
-                $field = strtolower($matches[1]);
-                $value = $matches[2];
-                if ( $field == 'hash' ) {
-                    $hash = $value;
-                } elseif ( $field == 'location' ) {
-                    $locations[] = $value;
-                }
-            }
-        }
-
-        if ( $hash === '' ) {
-            return false;
-        }
-
-        # Look in each specified location for the text
-        $text = false;
-        foreach ( $locations as $location ) {
-            $text = Article::fetchFromLocation( $location, $hash );
-            if ( $text !== false ) {
-                break;
-            }
-        }
-
-        return $text;
-    }
-
-    /**
-     * @static
-     * @param $location
-     * @param $hash
-     */
-    function fetchFromLocation( $location, $hash ) {
-        global $wgLoadBalancer;
-        $fname = 'fetchFromLocation';
-        wfProfileIn( $fname );
-
-        $p = strpos( $location, ':' );
-        if ( $p === false ) {
-            wfProfileOut( $fname );
-            return false;
-        }
-
-        $type = substr( $location, 0, $p );
-        $text = false;
-        switch ( $type ) {
-            case 'mysql':
-                # MySQL locations are specified by mysql://<machine id>/<database>/<table>/<index>
-                # Machine ID 0 is the current connection
-                if ( preg_match( '/^mysql:\/\/(\d+)\/([A-Za-z_]+)\/([A-Za-z_]+)\/([A-Za-z_]+)$/',
-                  $location, $matches ) ) {
-                    $machineID = $matches[1];
-                    $dbName = $matches[2];
-                    $tblName = $matches[3];
-                    $index = $matches[4];
-                    if ( $machineID == 0 ) {
-                        # Current connection
-                        $db =& $this->getDB();
-                    } else {
-                        # Alternate connection
-                        $db =& $wgLoadBalancer->getConnection( $machineID );
-
-                        if ( array_key_exists( $machineId, $wgKnownMysqlServers ) ) {
-                            # Try to open, return false on failure
-                            $params = $wgKnownDBServers[$machineId];
-                            $db = Database::newFromParams( $params['server'], $params['user'], $params['password'],
-                                $dbName, 1, DBO_IGNORE );
-                        }
-                    }
-                    if ( $db->isOpen() ) {
-                        $index = $db->strencode( $index );
-                        $res = $db->query( "SELECT blob_data FROM $dbName.$tblName " .
-                            "WHERE blob_index='$index' " . $this->getSelectOptions(), $fname );
-                        $row = $db->fetchObject( $res );
-                        $text = $row->text_data;
-                    }
-                }
-                break;
-            case 'file':
-                # File locations are of the form file://<filename>, relative to the current directory
-                if ( preg_match( '/^file:\/\/(.*)$', $location, $matches ) )
-                    $filename = strstr( $location, 'file://' );
-                    $text = @file_get_contents( $matches[1] );
-        }
-        if ( $text !== false ) {
-            # Got text, now we need to interpret it
-            # The first line contains information about how to do this
-            $p = strpos( $text, '\n' );
-            $type = substr( $text, 0, $p );
-            $text = substr( $text, $p + 1 );
-            switch ( $type ) {
-                case 'plain':
-                    break;
-                case 'gzip':
-                    $text = gzinflate( $text );
-                    break;
-                case 'object':
-                    $object = unserialize( $text );
-                    $text = $object->getItem( $hash );
-                    break;
-                default:
-                    $text = false;
-            }
-        }
-        wfProfileOut( $fname );
-        return $text;
-    }
-
     /**
      * Note that getContent/loadContent may follow redirects if
      * not told otherwise, and so may cause a change to mTitle.
diff --git a/includes/HistoryBlob.php b/includes/HistoryBlob.php
index 0c44b34415..ebc47145d4 100644
--- a/includes/HistoryBlob.php
+++ b/includes/HistoryBlob.php
@@ -10,21 +10,40 @@
  */
 class HistoryBlob
 {
-    function setMeta() {}
+    # setMeta and getMeta currently aren't used for anything, I just thought they might be useful in the future
+    # The meta value is a single string
+    function setMeta( $meta ) {}
+
+    # Gets the meta-value
     function getMeta() {}
+
+    # Adds an item of text, returns a stub object which points to the item
+    # You must call setLocation() on the stub object before storing it to the database
     function addItem() {}
-    function getItem() {}
+
+    # Get item by hash
+    function getItem( $hash ) {}
+
+    # Set the "default text"
+    # This concept is an odd property of the current DB schema, whereby each text item has a revision
+    # associated with it. The default text is the text of the associated revision. There may, however,
+    # be other revisions in the same object
+    function setText() {}
+
+    # Get default text. This is called from Article::getRevisionText()
+    function getText() {}
 }
 
 /**
  * The real object
  * @package MediaWiki
  */
-class ConcatenatedGzipHistoryBlob
+class ConcatenatedGzipHistoryBlob extends HistoryBlob
 {
-    /* private */ var $mVersion = 0, $mCompressed = false, $mItems = array();
+    /* private */ var $mVersion = 0, $mCompressed = false, $mItems = array(), $mDefaultHash = '';
+    /* private */ var $mFast = 0, $mSize = 0;
 
-    function HistoryBlob() {
+    function ConcatenatedGzipHistoryBlob() {
         if ( !function_exists( 'gzdeflate' ) ) {
             die( "Need zlib support to read or write this kind of history object (ConcatenatedGzipHistoryBlob)\n" );
         }
@@ -42,14 +61,28 @@ class ConcatenatedGzipHistoryBlob
 
     function addItem( $text ) {
         $this->uncompress();
-        $this->mItems[md5($text)] = $text;
+        $hash = md5( $text );
+        $this->mItems[$hash] = $text;
+        $this->mSize += strlen( $text );
+
+        $stub = new HistoryBlobStub( $hash );
+        return $stub;
     }
 
     function getItem( $hash ) {
-        $this->compress();
-        return $this->mItems[$hash];
+        $this->uncompress();
+        if ( array_key_exists( $hash, $this->mItems ) ) {
+            return $this->mItems[$hash];
+        } else {
+            return false;
+        }
     }
 
+    function removeItem( $hash ) {
+        $this->mSize -= strlen( $this->mItems[$hash] );
+        unset( $this->mItems[$hash] );
+    }
+
     function compress() {
         if ( !$this->mCompressed ) {
             $this->mItems = gzdeflate( serialize( $this->mItems ) );
@@ -60,15 +93,81 @@ class ConcatenatedGzipHistoryBlob
     function uncompress() {
         if ( $this->mCompressed ) {
             $this->mItems = unserialize( gzinflate( $this->mItems ) );
+            $this->mCompressed = false;
         }
     }
 
+    function getText() {
+        $this->uncompress();
+        return $this->getItem( $this->mDefaultHash );
+    }
+
+    function setText( $text ) {
+        $this->uncompress();
+        $stub = $this->addItem( $text );
+        $this->mDefaultHash = $stub->mHash;
+    }
+
     function __sleep() {
-        compress();
+        $this->compress();
+        return array( 'mVersion', 'mCompressed', 'mItems', 'mDefaultHash' );
     }
 
     function __wakeup() {
-        uncompress();
+        $this->uncompress();
+    }
+
+    # Determines if this object is happy
+    function isHappy( $maxFactor, $factorThreshold ) {
+        if ( count( $this->mItems ) == 0 ) {
+            return true;
+        }
+        if ( $this->mFast ) {
+            $this->uncompress();
+            $record = serialize( $this->mItems );
+            $size = strlen( $record );
+            $avgUncompressed = $size / count( $this->mItems );
+            $compressed = strlen( gzdeflate( $record ) );
+
+            if ( $compressed < $factorThreshold * 1024 ) {
+                return true;
+            } else {
+                return $avgUncompressed * $maxFactor < $compressed;
+            }
+        } else {
+            return count( $this->mItems ) <= 10;
+        }
+    }
+}
+
+class HistoryBlobStub
+{
+    var $mOldId, $mHash;
+
+    function HistoryBlobStub( $hash = '', $oldid = 0 ) {
+        $this->mHash = $hash;
+    }
+
+    # Sets the location (old_id) of the main object to which this object points
+    function setLocation( $id ) {
+        $this->mOldId = $id;
+    }
+
+    function getText() {
+        $dbr =& wfGetDB( DB_SLAVE );
+        $row = $dbr->selectRow( 'old', array( 'old_flags', 'old_text' ), array( 'old_id' => $this->mOldId ) );
+        if ( !$row || $row->old_flags != 'object' ) {
+            return false;
+        }
+        $obj = unserialize( $row->old_text );
+        if ( !is_object( $obj ) ) {
+            $obj = unserialize( $obj );
+        }
+        return $obj->getItem( $this->mHash );
+    }
+
+    function getHash() {
+        return $this->mHash;
     }
 }
 ?>
diff --git a/includes/Setup.php b/includes/Setup.php
index ae39a584f7..c7e62cb263 100644
--- a/includes/Setup.php
+++ b/includes/Setup.php
@@ -74,6 +74,7 @@ require_once( 'Parser.php' );
 require_once( 'ParserCache.php' );
 require_once( 'WebRequest.php' );
 require_once( 'LoadBalancer.php' );
+require_once( 'HistoryBlob.php' );
 
 $wgRequest = new WebRequest();
 
diff --git a/maintenance/compressOld.inc b/maintenance/compressOld.inc
index c88396b823..d5159baf29 100644
--- a/maintenance/compressOld.inc
+++ b/maintenance/compressOld.inc
@@ -51,4 +51,153 @@ function compressPage( $row ) {
     return true;
 }
 
+define( 'LS_INDIVIDUAL', 0 );
+define( 'LS_CHUNKED', 1 );
+
+function compressWithConcat( $startId, $maxChunkSize, $maxChunkFactor, $factorThreshold, $beginDate, $endDate )
+{
+    $fname = 'compressWithConcat';
+    $loadStyle = LS_CHUNKED;
+
+    $dbw =& wfGetDB( DB_MASTER );
+
+    # First get a list of all pages
+    $pageRes = $dbw->select( 'cur', array('cur_namespace', 'cur_title'), false, $fname );
+
+    # For each of those, get a list of revisions which fit the criteria
+    $conds = array();
+    if ( $beginDate ) {
+        $conds[] = "old_timestamp>'" . $beginDate . "'";
+    }
+    if ( $endDate ) {
+        $conds[] = "old_timestamp<'" . $endDate . "'";
+    }
+    if ( $startId ) {
+        $conds[] = 'old_id>=' . $startId;
+    }
+    if ( $loadStyle == LS_CHUNKED ) {
+        $fields = array( 'old_id', 'old_flags', 'old_text' );
+        $revLoadOptions = 'FOR UPDATE';
+    } else {
+        $fields = array( 'old_id' );
+        $revLoadOptions = array();
+    }
+
+    while ( $pageRow = $dbw->fetchObject( $pageRes ) ) {
+        # Display progress
+        $titleObj = Title::makeTitle( $pageRow->cur_namespace, $pageRow->cur_title );
+        print $titleObj->getPrefixedDBkey() . " ";
+
+        # Load revisions
+        $revRes = $dbw->select( 'old', $fields,
+            array( 'old_namespace' => $pageRow->cur_namespace, 'old_title' => $pageRow->cur_title ) + $conds,
+            $fname,
+            $revLoadOptions
+        );
+        $revs = array();
+        while ( $revRow = $dbw->fetchObject( $revRes ) ) {
+            $revs[] = $revRow;
+        }
+
+        if ( count( $revs ) < 2) {
+            # No revisions matching, no further processing
+            print "\n";
+            continue;
+        }
+
+        # For each chunk
+        $i = 0;
+        while ( $i < count( $revs ) ) {
+            if ( $i < count( $revs ) - $maxChunkSize ) {
+                $thisChunkSize = $maxChunkSize;
+            } else {
+                $thisChunkSize = count( $revs ) - $i;
+            }
+
+            $chunk = new ConcatenatedGzipHistoryBlob();
+            $stubs = array();
+            $dbw->begin();
+            $usedChunk = false;
+            $primaryOldid = $revs[$i]->old_id;
+
+            # Get the text of each revision and add it to the object
+            for ( $j = 0; $j < $thisChunkSize && $chunk->isHappy( $maxChunkFactor, $factorThreshold ); $j++ ) {
+                $oldid = $revs[$i + $j]->old_id;
+
+                # Get text
+                if ( $loadStyle == LS_INDIVIDUAL ) {
+                    $textRow = $dbw->selectRow( 'old',
+                        array( 'old_flags', 'old_text' ),
+                        array( 'old_id' => $oldid ),
+                        $fname,
+                        'FOR UPDATE'
+                    );
+                    $text = Article::getRevisionText( $textRow );
+                } else {
+                    $text = Article::getRevisionText( $revs[$i + $j] );
+                }
+
+                if ( $text === false ) {
+                    print "\nError, unable to get text in old_id $oldid\n";
+                    #$dbw->delete( 'old', array( 'old_id' => $oldid ) );
+                }
+
+                if ( $j == 0 ) {
+                    $chunk->setText( $text );
+                    print '.';
+                } else {
+                    # Don't make a stub if it's going to be longer than the article
+                    # Stubs are typically about 100 bytes
+                    if ( strlen( $text ) < 120 ) {
+                        $stub = false;
+                        print 'x';
+                    } else {
+                        $stub = $chunk->addItem( $text );
+                        $stub->setLocation( $primaryOldid );
+                        $hash = $stub->getHash();
+                        $stub = serialize( $stub );
+                        print '.';
+                        $usedChunk = true;
+                    }
+                    $stubs[$j] = $stub;
+                }
+            }
+            $thisChunkSize = $j;
+
+            # If we couldn't actually use any stubs because the pages were too small, do nothing
+            if ( $usedChunk ) {
+                # Store the main object
+                $dbw->update( 'old',
+                    array( /* SET */
+                        'old_text' => serialize( $chunk ),
+                        'old_flags' => 'object',
+                    ), array( /* WHERE */
+                        'old_id' => $primaryOldid
+                    )
+                );
+
+                # Store the stub objects
+                for ( $j = 1; $j < $thisChunkSize; $j++ ) {
+                    # Skip if not compressing
+                    if ( $stubs[$j] !== false ) {
+                        $dbw->update( 'old',
+                            array( /* SET */
+                                'old_text' => $stubs[$j],
+                                'old_flags' => 'object',
+                            ), array( /* WHERE */
+                                'old_id' => $revs[$i + $j]->old_id
+                            )
+                        );
+                    }
+                }
+            }
+            # Done, next
+            print "/";
+            $dbw->commit();
+            $i += $thisChunkSize;
+        }
+        print "\n";
+    }
+    return true;
+}
 ?>
diff --git a/maintenance/compressOld.php b/maintenance/compressOld.php
index d1f3f066c6..865fcebd9c 100644
--- a/maintenance/compressOld.php
+++ b/maintenance/compressOld.php
@@ -7,6 +7,33 @@
  */
 
 /** */
+
+/**
+ * Usage:
+ *
+ * Non-wikimedia
+ * php compressOld.php [-t <type>] [-c <chunk-size>] [-b <begin-date>] [-e <end-date>] [-s <start-id>]
+ *
+ * Wikimedia
+ * php compressOld.php [-t <type>] [-c <chunk-size>] [-b <begin-date>] [-e <end-date>] [-s <start-id>]
+ *     [-f <max-factor>] [-h <factor-threshold>]
+ *
+ * <type> is either:
+ *   gzip: compress revisions independently
+ *   concat: concatenate revisions and compress in chunks (default)
+ *
+ * <start-id> is the old_id to start from
+ *
+ * The following options apply only to the concat type:
+ *    <begin-date> is the earliest date to check for uncompressed revisions
+ *    <end-date> is the latest revision date to compress
+ *    <chunk-size> is the maximum number of revisions in a concat chunk
+ *    <max-factor> is the maximum ratio of compressed chunk bytes to uncompressed avg. revision bytes
+ *    <factor-threshold> is a minimum number of KB, where <max-factor> cuts in
+ *
+ */
+
+$optionsWithArgs = array( 't', 'c', 's', 'f', 'h' );
 require_once( "commandLine.inc" );
 require_once( "compressOld.inc" );
 
@@ -16,19 +43,39 @@ if( !function_exists( "gzdeflate" ) ) {
     die();
 }
 
+$defaults = array(
+    't' => 'concat',
+    'c' => 20,
+    's' => 0,
+    'f' => 3,
+    'h' => 100,
+    'b' => '',
+    'e' => '',
+);
+
+$args = $args + $defaults;
+
+if ( $args['t'] != 'concat' && $args['t'] != 'gzip' ) {
+    print "Type \"{$args['t']}\" not supported\n";
+}
+
 print "Depending on the size of your database this may take a while!\n";
 print "If you abort the script while it's running it shouldn't harm anything,\n";
 print "but if you haven't backed up your data, you SHOULD abort now!\n\n";
 print "Press control-c to abort first (will proceed automatically in 5 seconds)\n";
-sleep(5);
+#sleep(5);
+
+$success = true;
+if ( $args['t'] == 'concat' ) {
+    $success = compressWithConcat( $args['s'], $args['c'], $args['f'], $args['h'], $args['b'], $args['e'] );
+} else {
+    compressOldPages( $args['s'] );
+}
 
-$n = 0;
-if( !empty( $argv[1] ) ) {
-    $n = intval( $argv[1] );
+if ( $success ) {
+    print "Done.\n";
 }
-compressOldPages( $n );
-print "Done.\n";
 
 exit();
 
 ?>
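
For reference, a minimal sketch of the round trip the new classes are meant to support. This is not part of the patch: the sample texts and the old_id value are made up, and the database writes/reads are only described in comments (in the real code path compressWithConcat() does the storing, and Article::getRevisionText() / HistoryBlobStub::getText() do the fetching).

    <?php
    // Illustrative sketch only. Assumes HistoryBlob.php is on the include path,
    // as arranged by the Setup.php change above.
    require_once( 'HistoryBlob.php' );

    $firstRevisionText  = "First revision of the page.";
    $secondRevisionText = str_repeat( "Second revision of the page. ", 10 );
    $primaryOldId       = 12345;  // hypothetical old_id of the row that will hold the main object

    // Writing: concatenate revisions into one object; gzip happens on serialize (__sleep).
    $blob = new ConcatenatedGzipHistoryBlob();
    $blob->setText( $firstRevisionText );            // default text, returned later by getText()
    $stub = $blob->addItem( $secondRevisionText );   // returns a HistoryBlobStub
    $stub->setLocation( $primaryOldId );             // point the stub at the main row

    // The main row would store serialize( $blob ) with old_flags='object';
    // the second revision's row would store serialize( $stub ), also with old_flags='object'.
    $mainRowText = serialize( $blob );
    $stubRowText = serialize( $stub );

    // Reading back (without the database): unserialize triggers __wakeup/uncompress,
    // and items are looked up by their md5 hash.
    $blob2 = unserialize( $mainRowText );
    echo $blob2->getText() . "\n";                    // first revision (the default text)
    echo $blob2->getItem( $stub->getHash() ) . "\n";  // second revision
    ?>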
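As a usage illustration, a plain invocation of the reworked script in concat mode, spelling out the defaults declared above (the values shown are just those defaults, not a recommendation):

    php compressOld.php -t concat -c 20 -f 3 -h 100 -s 0

The -b and -e date bounds can be added to restrict which old_timestamp ranges are considered for compression.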