From: Tim Starling
Date: Sun, 14 Aug 2005 07:57:05 +0000 (+0000)
Subject: Moving compression scripts from maintenance to maintenance/storage
X-Git-Tag: 1.6.0~1987

Moving compression scripts from maintenance to maintenance/storage
---

diff --git a/maintenance/compressOld.inc b/maintenance/compressOld.inc
deleted file mode 100644
index f6a9f43035..0000000000
--- a/maintenance/compressOld.inc
+++ /dev/null
@@ -1,230 +0,0 @@
-<?php
-/**
- * @package MediaWiki
- * @subpackage Maintenance
- */
-
-/** @todo document */
-function compressOldPages( $start = 0 ) {
-	$fname = 'compressOldPages';
-	$chunksize = 50;
-	print "Starting from old_id $start...\n";
-	$dbw =& wfGetDB( DB_MASTER );
-	$old = $dbw->tableName( 'old' );
-	do {
-		$end = $start + $chunksize;
-		$res = $dbw->select( 'old', array( 'old_id','old_flags','old_namespace','old_title','old_text' ),
-			"old_id>=$start", $fname, array( 'ORDER BY' => 'old_id', 'LIMIT' => $chunksize, 'FOR UPDATE' ) );
-		if( $dbw->numRows( $res ) == 0 ) {
-			break;
-		}
-		$last = $start;
-		while( $row = $dbw->fetchObject( $res ) ) {
-			# print " {$row->old_id} - {$row->old_namespace}:{$row->old_title}\n";
-			compressPage( $row );
-			$last = $row->old_id;
-		}
-		$dbw->freeResult( $res );
-		$start = $last + 1; # Deletion may leave long empty stretches
-		print "$start...\n";
-	} while( true );
-}
-
-/** @todo document */
-function compressPage( $row ) {
-	$fname = 'compressPage';
-	if( false !== strpos( $row->old_flags, "gzip" ) ) {
-		print "Already compressed row {$row->old_id}?\n";
-		return false;
-	}
-	$dbw =& wfGetDB( DB_MASTER );
-	$flags = $row->old_flags ? "{$row->old_flags},gzip" : "gzip";
-	$compress = gzdeflate( $row->old_text );
-	$dbw->update( 'old',
-		array( /* SET */
-			'old_flags' => $flags,
-			'old_text' => $compress
-		), array( /* WHERE */
-			'old_id' => $row->old_id
-		), $fname, 'LIMIT 1'
-	);
-	return true;
-}
-
-define( 'LS_INDIVIDUAL', 0 );
-define( 'LS_CHUNKED', 1 );
-
-/** @todo document */
-function compressWithConcat( $startId, $maxChunkSize, $maxChunkFactor, $factorThreshold, $beginDate, $endDate )
-{
-	$fname = 'compressWithConcat';
-	$loadStyle = LS_CHUNKED;
-
-	$dbr =& wfGetDB( DB_SLAVE );
-	$dbw =& wfGetDB( DB_MASTER );
-
-	# Get all articles by page_id
-	$maxPageId = $dbr->selectField( 'page', 'max(page_id)', '', $fname );
-	$pageConds = array();
-
-	if ( $exclude_ns0 ) {
-		print "Excluding main namespace\n";
-		$pageConds[] = 'page_namespace<>0';
-	}
-	if ( $queryExtra ) {
-		$pageConds[] = $queryExtra;
-	}
-
-	# For each article, get a list of revisions which fit the criteria
-	# No recompression, use a condition on old_flags
-	$conds = array("old_flags NOT LIKE '%object%'");
-
-	if ( $beginDate ) {
-		$conds[] = "rev_timestamp>'" . $beginDate . "'";
-	}
-	if ( $endDate ) {
"'"; - } - if ( $loadStyle == LS_CHUNKED ) { - $tables = array( 'revision', 'text' ); - $fields = array( 'rev_id', 'rev_text_id', 'old_flags', 'old_text' ); - $conds[] = 'rev_text_id=old_id'; - $revLoadOptions = 'FOR UPDATE'; - } else { - $tables = array( 'revision' ); - $fields = array( 'rev_id', 'rev_text_id' ); - $revLoadOptions = array(); - } - - $oldReadsSinceLastSlaveWait = 0; #check slave lag periodically - $totalMatchingRevisions = 0; - $masterPos = false; - for ( $pageId = $startId; $pageId <= $maxPageId; $pageId++ ) { - $pageRes = $dbr->select( 'page', array('page_id', 'page_namespace', 'page_title'), - $pageConds + array('page_id' => $pageId), $fname ); - if ( $dbr->numRows( $pageRes ) == 0 ) { - continue; - } - $pageRow = $dbr->fetchObject( $pageRes ); - - # Display progress - $titleObj = Title::makeTitle( $pageRow->page_namespace, $pageRow->page_title ); - print "$pageId\t" . $titleObj->getPrefixedDBkey() . " "; - - # Load revisions - $revRes = $dbw->select( $tables, $fields, - array( 'rev_page' => $pageRow->page_id ) + $conds, - $fname, - $revLoadOptions - ); - $revs = array(); - while ( $revRow = $dbw->fetchObject( $revRes ) ) { - $revs[] = $revRow; - } - - if ( count( $revs ) < 2) { - # No revisions matching, no further processing - print "\n"; - continue; - } - - # For each chunk - $i = 0; - while ( $i < count( $revs ) ) { - if ( $i < count( $revs ) - $maxChunkSize ) { - $thisChunkSize = $maxChunkSize; - } else { - $thisChunkSize = count( $revs ) - $i; - } - - $chunk = new ConcatenatedGzipHistoryBlob(); - $stubs = array(); - $dbw->begin(); - $usedChunk = false; - $primaryOldid = $revs[$i]->rev_text_id; - - # Get the text of each revision and add it to the object - for ( $j = 0; $j < $thisChunkSize && $chunk->isHappy( $maxChunkFactor, $factorThreshold ); $j++ ) { - $oldid = $revs[$i + $j]->rev_text_id; - - # Get text - if ( $loadStyle == LS_INDIVIDUAL ) { - $textRow = $dbw->selectRow( 'text', - array( 'old_flags', 'old_text' ), - array( 'old_id' => $oldid ), - $fname, - 'FOR UPDATE' - ); - $text = Revision::getRevisionText( $textRow ); - } else { - $text = Revision::getRevisionText( $revs[$i + $j] ); - } - - if ( $text === false ) { - print "\nError, unable to get text in old_id $oldid\n"; - #$dbw->delete( 'old', array( 'old_id' => $oldid ) ); - } - - if ( $j == 0 ) { - $chunk->setText( $text ); - print '.'; - } else { - # Don't make a stub if it's going to be longer than the article - # Stubs are typically about 100 bytes - if ( strlen( $text ) < 120 ) { - $stub = false; - print 'x'; - } else { - $stub = $chunk->addItem( $text ); - $stub->setLocation( $primaryOldid ); - $hash = $stub->getHash(); - $stub = serialize( $stub ); - print '.'; - $usedChunk = true; - } - $stubs[$j] = $stub; - } - } - $thisChunkSize = $j; - - # If we couldn't actually use any stubs because the pages were too small, do nothing - if ( $usedChunk ) { - # Store the main object - $dbw->update( 'text', - array( /* SET */ - 'old_text' => serialize( $chunk ), - 'old_flags' => 'object', - ), array( /* WHERE */ - 'old_id' => $primaryOldid - ) - ); - - # Store the stub objects - for ( $j = 1; $j < $thisChunkSize; $j++ ) { - # Skip if not compressing - if ( $stubs[$j] !== false ) { - $dbw->update( 'text', - array( /* SET */ - 'old_text' => $stubs[$j], - 'old_flags' => 'object', - ), array( /* WHERE */ - 'old_id' => $revs[$i + $j]->rev_text_id - ) - ); - } - } - } - # Done, next - print "/"; - $dbw->commit(); - $i += $thisChunkSize; - } - print "\n"; - } - return true; -} -?> diff --git 
diff --git a/maintenance/compressOld.php b/maintenance/compressOld.php
deleted file mode 100644
index 0d449c4c7c..0000000000
--- a/maintenance/compressOld.php
+++ /dev/null
@@ -1,83 +0,0 @@
-<?php
-/**
- * Compress the text of a wiki
- *
- * @package MediaWiki
- * @subpackage Maintenance
- */
-
-/**
- * Usage:
- *
- * Non-wikimedia
- * php compressOld.php [-t <type>] [-c <chunk-size>] [-b <begin-date>] [-e <end-date>] [-s <start-id>]
- *
- * Wikimedia
- * php compressOld.php <database> [-t <type>] [-c <chunk-size>] [-b <begin-date>] [-e <end-date>] [-s <start-id>]
- *	[-f <max-factor>] [-h <factor-threshold>]
- *
- * <type> is either:
- *	gzip: compress revisions independently
- *	concat: concatenate revisions and compress in chunks (default)
- *
- * <start-id> is the old_id to start from
- *
- * The following options apply only to the concat type:
- *	<begin-date> is the earliest date to check for uncompressed revisions
- *	<end-date> is the latest revision date to compress
- *	<chunk-size> is the maximum number of revisions in a concat chunk
- *	<max-factor> is the maximum ratio of compressed chunk bytes to uncompressed avg. revision bytes
- *	<factor-threshold> is a minimum number of KB, where <max-factor> cuts in
- *
- */
-
-die( 'compressOld is known to be broken at the moment.' );
-
-$optionsWithArgs = array( 't', 'c', 's', 'f', 'h' );
-require_once( "commandLine.inc" );
-require_once( "compressOld.inc" );
-
-if( !function_exists( "gzdeflate" ) ) {
-	print "You must enable zlib support in PHP to compress old revisions!\n";
-	print "Please see http://www.php.net/manual/en/ref.zlib.php\n\n";
-	die();
-}
-
-$defaults = array(
-	't' => 'concat',
-	'c' => 20,
-	's' => 0,
-	'f' => 3,
-	'h' => 100,
-	'b' => '',
-	'e' => '',
-);
-
-$args = $args + $defaults;
-
-if ( $args['t'] != 'concat' && $args['t'] != 'gzip' ) {
-	print "Type \"{$args['t']}\" not supported\n";
-}
-
-print "Depending on the size of your database this may take a while!\n";
-print "If you abort the script while it's running it shouldn't harm anything,\n";
-print "but if you haven't backed up your data, you SHOULD abort now!\n\n";
-print "Press control-c to abort first (will proceed automatically in 5 seconds)\n";
-#sleep(5);
-
-$success = true;
-if ( $args['t'] == 'concat' ) {
-	$success = compressWithConcat( $args['s'], $args['c'], $args['f'], $args['h'], $args['b'], $args['e'] );
-} else {
-	compressOldPages( $args['s'] );
-}
-
-if ( $success ) {
-	print "Done.\n";
-}
-
-exit();
-
-?>
diff --git a/maintenance/storage/compressOld.inc b/maintenance/storage/compressOld.inc
new file mode 100644
index 0000000000..f6a9f43035
--- /dev/null
+++ b/maintenance/storage/compressOld.inc
@@ -0,0 +1,230 @@
+<?php
+/**
+ * @package MediaWiki
+ * @subpackage Maintenance
+ */
+
+/** @todo document */
+function compressOldPages( $start = 0 ) {
+	$fname = 'compressOldPages';
+	$chunksize = 50;
+	print "Starting from old_id $start...\n";
+	$dbw =& wfGetDB( DB_MASTER );
+	$old = $dbw->tableName( 'old' );
+	do {
+		$end = $start + $chunksize;
+		$res = $dbw->select( 'old', array( 'old_id','old_flags','old_namespace','old_title','old_text' ),
+			"old_id>=$start", $fname, array( 'ORDER BY' => 'old_id', 'LIMIT' => $chunksize, 'FOR UPDATE' ) );
+		if( $dbw->numRows( $res ) == 0 ) {
+			break;
+		}
+		$last = $start;
+		while( $row = $dbw->fetchObject( $res ) ) {
+			# print " {$row->old_id} - {$row->old_namespace}:{$row->old_title}\n";
+			compressPage( $row );
+			$last = $row->old_id;
+		}
+		$dbw->freeResult( $res );
+		$start = $last + 1; # Deletion may leave long empty stretches
+		print "$start...\n";
+	} while( true );
+}
+
+/** @todo document */
+function compressPage( $row ) {
+	$fname = 'compressPage';
+	if( false !== strpos( $row->old_flags, "gzip" ) ) {
+		print "Already compressed row {$row->old_id}?\n";
+		return false;
+	}
+	$dbw =& wfGetDB( DB_MASTER );
"{$row->old_flags},gzip" : "gzip"; + $compress = gzdeflate( $row->old_text ); + $dbw->update( 'old', + array( /* SET */ + 'old_flags' => $flags, + 'old_text' => $compress + ), array( /* WHERE */ + 'old_id' => $row->old_id + ), $fname, 'LIMIT 1' + ); + return true; +} + +define( 'LS_INDIVIDUAL', 0 ); +define( 'LS_CHUNKED', 1 ); + +/** @todo document */ +function compressWithConcat( $startId, $maxChunkSize, $maxChunkFactor, $factorThreshold, $beginDate, $endDate ) +{ + $fname = 'compressWithConcat'; + $loadStyle = LS_CHUNKED; + + $dbr =& wfGetDB( DB_SLAVE ); + $dbw =& wfGetDB( DB_MASTER ); + + # Get all articles by page_id + $maxPageId = $dbr->selectField( 'page', 'max(page_id)', '', $fname ); + $pageConds = array(); + + if ( $exclude_ns0 ) { + print "Excluding main namespace\n"; + $pageConds[] = 'page_namespace<>0'; + } + if ( $queryExtra ) { + $pageConds[] = $queryExtra; + } + + # For each article, get a list of revisions which fit the criteria + # No recompression, use a condition on old_flags + $conds = array("old_flags NOT LIKE '%object%'"); + + if ( $beginDate ) { + $conds[] = "rev_timestamp>'" . $beginDate . "'"; + } + if ( $endDate ) { + $conds[] = "rev_timestamp<'" . $endDate . "'"; + } + if ( $loadStyle == LS_CHUNKED ) { + $tables = array( 'revision', 'text' ); + $fields = array( 'rev_id', 'rev_text_id', 'old_flags', 'old_text' ); + $conds[] = 'rev_text_id=old_id'; + $revLoadOptions = 'FOR UPDATE'; + } else { + $tables = array( 'revision' ); + $fields = array( 'rev_id', 'rev_text_id' ); + $revLoadOptions = array(); + } + + $oldReadsSinceLastSlaveWait = 0; #check slave lag periodically + $totalMatchingRevisions = 0; + $masterPos = false; + for ( $pageId = $startId; $pageId <= $maxPageId; $pageId++ ) { + $pageRes = $dbr->select( 'page', array('page_id', 'page_namespace', 'page_title'), + $pageConds + array('page_id' => $pageId), $fname ); + if ( $dbr->numRows( $pageRes ) == 0 ) { + continue; + } + $pageRow = $dbr->fetchObject( $pageRes ); + + # Display progress + $titleObj = Title::makeTitle( $pageRow->page_namespace, $pageRow->page_title ); + print "$pageId\t" . $titleObj->getPrefixedDBkey() . 
" "; + + # Load revisions + $revRes = $dbw->select( $tables, $fields, + array( 'rev_page' => $pageRow->page_id ) + $conds, + $fname, + $revLoadOptions + ); + $revs = array(); + while ( $revRow = $dbw->fetchObject( $revRes ) ) { + $revs[] = $revRow; + } + + if ( count( $revs ) < 2) { + # No revisions matching, no further processing + print "\n"; + continue; + } + + # For each chunk + $i = 0; + while ( $i < count( $revs ) ) { + if ( $i < count( $revs ) - $maxChunkSize ) { + $thisChunkSize = $maxChunkSize; + } else { + $thisChunkSize = count( $revs ) - $i; + } + + $chunk = new ConcatenatedGzipHistoryBlob(); + $stubs = array(); + $dbw->begin(); + $usedChunk = false; + $primaryOldid = $revs[$i]->rev_text_id; + + # Get the text of each revision and add it to the object + for ( $j = 0; $j < $thisChunkSize && $chunk->isHappy( $maxChunkFactor, $factorThreshold ); $j++ ) { + $oldid = $revs[$i + $j]->rev_text_id; + + # Get text + if ( $loadStyle == LS_INDIVIDUAL ) { + $textRow = $dbw->selectRow( 'text', + array( 'old_flags', 'old_text' ), + array( 'old_id' => $oldid ), + $fname, + 'FOR UPDATE' + ); + $text = Revision::getRevisionText( $textRow ); + } else { + $text = Revision::getRevisionText( $revs[$i + $j] ); + } + + if ( $text === false ) { + print "\nError, unable to get text in old_id $oldid\n"; + #$dbw->delete( 'old', array( 'old_id' => $oldid ) ); + } + + if ( $j == 0 ) { + $chunk->setText( $text ); + print '.'; + } else { + # Don't make a stub if it's going to be longer than the article + # Stubs are typically about 100 bytes + if ( strlen( $text ) < 120 ) { + $stub = false; + print 'x'; + } else { + $stub = $chunk->addItem( $text ); + $stub->setLocation( $primaryOldid ); + $hash = $stub->getHash(); + $stub = serialize( $stub ); + print '.'; + $usedChunk = true; + } + $stubs[$j] = $stub; + } + } + $thisChunkSize = $j; + + # If we couldn't actually use any stubs because the pages were too small, do nothing + if ( $usedChunk ) { + # Store the main object + $dbw->update( 'text', + array( /* SET */ + 'old_text' => serialize( $chunk ), + 'old_flags' => 'object', + ), array( /* WHERE */ + 'old_id' => $primaryOldid + ) + ); + + # Store the stub objects + for ( $j = 1; $j < $thisChunkSize; $j++ ) { + # Skip if not compressing + if ( $stubs[$j] !== false ) { + $dbw->update( 'text', + array( /* SET */ + 'old_text' => $stubs[$j], + 'old_flags' => 'object', + ), array( /* WHERE */ + 'old_id' => $revs[$i + $j]->rev_text_id + ) + ); + } + } + } + # Done, next + print "/"; + $dbw->commit(); + $i += $thisChunkSize; + } + print "\n"; + } + return true; +} +?> diff --git a/maintenance/storage/compressOld.php b/maintenance/storage/compressOld.php new file mode 100644 index 0000000000..cde284b788 --- /dev/null +++ b/maintenance/storage/compressOld.php @@ -0,0 +1,83 @@ +] [-c ] [-b ] [-e ] [-s ] + * + * Wikimedia + * php compressOld.php [-t ] [-c ] [-b ] [-e ] [-s ] + * [-f ] [-h ] + * + * is either: + * gzip: compress revisions independently + * concat: concatenate revisions and compress in chunks (default) + * + * is the old_id to start from + * + * The following options apply only to the concat type: + * is the earliest date to check for uncompressed revisions + * is the latest revision date to compress + * is the maximum number of revisions in a concat chunk + * is the maximum ratio of compressed chunk bytes to uncompressed avg. revision bytes + * is a minimum number of KB, where cuts in + * + */ + +die( 'compressOld is known to be broken at the moment.' 
diff --git a/maintenance/storage/compressOld.php b/maintenance/storage/compressOld.php
new file mode 100644
index 0000000000..cde284b788
--- /dev/null
+++ b/maintenance/storage/compressOld.php
@@ -0,0 +1,83 @@
+<?php
+/**
+ * Compress the text of a wiki
+ *
+ * @package MediaWiki
+ * @subpackage Maintenance
+ */
+
+/**
+ * Usage:
+ *
+ * Non-wikimedia
+ * php compressOld.php [-t <type>] [-c <chunk-size>] [-b <begin-date>] [-e <end-date>] [-s <start-id>]
+ *
+ * Wikimedia
+ * php compressOld.php <database> [-t <type>] [-c <chunk-size>] [-b <begin-date>] [-e <end-date>] [-s <start-id>]
+ *	[-f <max-factor>] [-h <factor-threshold>]
+ *
+ * <type> is either:
+ *	gzip: compress revisions independently
+ *	concat: concatenate revisions and compress in chunks (default)
+ *
+ * <start-id> is the old_id to start from
+ *
+ * The following options apply only to the concat type:
+ *	<begin-date> is the earliest date to check for uncompressed revisions
+ *	<end-date> is the latest revision date to compress
+ *	<chunk-size> is the maximum number of revisions in a concat chunk
+ *	<max-factor> is the maximum ratio of compressed chunk bytes to uncompressed avg. revision bytes
+ *	<factor-threshold> is a minimum number of KB, where <max-factor> cuts in
+ *
+ */
+
+die( 'compressOld is known to be broken at the moment.' );
+
+$optionsWithArgs = array( 't', 'c', 's', 'f', 'h' );
+require_once( "../commandLine.inc" );
+require_once( "compressOld.inc" );
+
+if( !function_exists( "gzdeflate" ) ) {
+	print "You must enable zlib support in PHP to compress old revisions!\n";
+	print "Please see http://www.php.net/manual/en/ref.zlib.php\n\n";
+	die();
+}
+
+$defaults = array(
+	't' => 'concat',
+	'c' => 20,
+	's' => 0,
+	'f' => 3,
+	'h' => 100,
+	'b' => '',
+	'e' => '',
+);
+
+$args = $args + $defaults;
+
+if ( $args['t'] != 'concat' && $args['t'] != 'gzip' ) {
+	print "Type \"{$args['t']}\" not supported\n";
+}
+
+print "Depending on the size of your database this may take a while!\n";
+print "If you abort the script while it's running it shouldn't harm anything,\n";
+print "but if you haven't backed up your data, you SHOULD abort now!\n\n";
+print "Press control-c to abort first (will proceed automatically in 5 seconds)\n";
+#sleep(5);
+
+$success = true;
+if ( $args['t'] == 'concat' ) {
+	$success = compressWithConcat( $args['s'], $args['c'], $args['f'], $args['h'], $args['b'], $args['e'] );
+} else {
+	compressOldPages( $args['s'] );
+}
+
+if ( $success ) {
+	print "Done.\n";
+}
+
+exit();
+
+?>
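
[Note: the option handling above leans on PHP's array-union operator, where
keys already present on the left-hand side win. A quick standalone
illustration of the "$args = $args + $defaults;" line:]

<?php
// Defaults only fill in the keys the user did not supply on the command line.
$defaults = array( 't' => 'concat', 'c' => 20, 's' => 0 );
$args = array( 'c' => 50 ); // as if "-c 50" had been passed
$args = $args + $defaults;
print_r( $args ); // c stays 50; t and s fall back to the defaults
?>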