From: Tim Starling Date: Sun, 3 Sep 2006 09:36:05 +0000 (+0000) Subject: support for checkpointing X-Git-Tag: 1.31.0-rc.0~55850 X-Git-Url: https://git.cyclocoop.org/%7B%24www_url%7Dadmin/compta/banques/?a=commitdiff_plain;h=33d5d5e2b7e37ca51aa765f2c743c08a66fa5cf6;p=lhc%2Fweb%2Fwiklou.git support for checkpointing --- diff --git a/maintenance/dumpHTML.inc b/maintenance/dumpHTML.inc index 6278f23b0c..2dc08d1383 100644 --- a/maintenance/dumpHTML.inc +++ b/maintenance/dumpHTML.inc @@ -51,34 +51,110 @@ class DumpHTML { # Skin to use var $skin = 'htmldump'; + # Checkpoint stuff + var $checkpointFile = false, $checkpoints = false; + function DumpHTML( $settings ) { foreach ( $settings as $var => $value ) { $this->$var = $value; } } + function loadCheckpoints() { + if ( $this->checkpoints !== false ) { + return true; + } elseif ( !$this->checkpointFile ) { + return false; + } else { + $lines = @file( $this->checkpointFile ); + if ( $lines === false ) { + print "Starting new checkpoint file \"{$this->checkpointFile}\"\n"; + $this->checkpoints = array(); + } else { + $lines = array_map( 'trim', $lines ); + $this->checkpoints = array(); + foreach ( $lines as $line ) { + list( $name, $value ) = explode( '=', $line, 2 ); + $this->checkpoints[$name] = $value; + } + } + return true; + } + } + + function getCheckpoint( $type, $defValue = false ) { + if ( !$this->loadCheckpoints() ) { + return false; + } + if ( !isset( $this->checkpoints[$type] ) ) { + return false; + } else { + return $this->checkpoints[$type]; + } + } + + function setCheckpoint( $type, $value ) { + if ( !$this->checkpointFile ) { + return; + } + $this->checkpoints[$type] = $value; + $blob = ''; + foreach ( $this->checkpoints as $type => $value ) { + $blob .= "$type=$value\n"; + } + file_put_contents( $this->checkpointFile, $blob ); + } + + function doEverything() { + if ( $this->getCheckpoint( 'everything' ) == 'done' ) { + print "Checkpoint says everything is already done\n"; + return; + } + $this->doArticles(); + $this->doLocalImageDescriptions(); + $this->doSharedImageDescriptions(); + $this->doCategories(); + $this->doRedirects(); + $this->doSpecials(); + + $this->setCheckpoint( 'everything', 'done' ); + } + /** * Write a set of articles specified by start and end page_id * Skip categories and images, they will be done separately */ - function doArticles( $start, $end = false ) { + function doArticles() { $fname = 'DumpHTML::doArticles'; + $cp = $this->getCheckpoint( 'article' ); + if ( $cp == 'done' ) { + print "Articles already done\n"; + return; + } elseif ( $cp !== false ) { + print "Resuming article dump from checkpoint at page_id $cp of {$this->endID}\n"; + $start = $cp; + } else { + print "Starting from page_id {$this->startID} of {$this->endID}\n"; + $start = $this->startID; + } + $this->setupGlobals(); - if ( $end === false ) { + if ( $this->endID === false ) { $dbr =& wfGetDB( DB_SLAVE ); - $end = $dbr->selectField( 'page', 'max(page_id)', false, $fname ); + $this->endID = $dbr->selectField( 'page', 'max(page_id)', false, $fname ); } + $mainPageObj = Title::newMainPage(); $mainPage = $mainPageObj->getPrefixedDBkey(); - - for ($id = $start; $id <= $end; $id++) { + for ($id = $start; $id <= $this->endID; $id++) { wfWaitForSlaves( 20 ); if ( !($id % REPORTING_INTERVAL) ) { print "Processing ID: $id\r"; + $this->setCheckpoint( 'article', $id ); } if ( !($id % (REPORTING_INTERVAL*10) ) ) { print "\n"; @@ -91,6 +167,7 @@ class DumpHTML { } } } + $this->setCheckpoint( 'article', 'done' ); print "\n"; } @@ -129,27 +206,45 @@ class DumpHTML { } function doImageDescriptions() { + $this->doLocalImageDescriptions(); + $this->doSharedImageDescriptions(); + } + + /** + * Dump image description pages that don't have an associated article, but do + * have a local image + */ + function doLocalImageDescriptions() { global $wgSharedUploadDirectory; - $fname = 'DumpHTML::doImageDescriptions'; + $dbr =& wfGetDB( DB_SLAVE ); + + $cp = $this->getCheckpoint( 'local image' ); + if ( $cp == 'done' ) { + print "Local image descriptions already done\n"; + return; + } elseif ( $cp !== false ) { + print "Writing image description pages starting from $cp\n"; + $conds = array( 'img_name >= ' . $dbr->addQuotes( $cp ) ); + } else { + print "Writing image description pages for local images\n"; + $conds = false; + } $this->setupGlobals(); - /** - * Dump image description pages that don't have an associated article, but do - * have a local image - */ - $dbr =& wfGetDB( DB_SLAVE ); - extract( $dbr->tableNames( 'image', 'page' ) ); - $res = $dbr->select( 'image', array( 'img_name' ), false, $fname ); + $res = $dbr->select( 'image', array( 'img_name' ), $conds, __METHOD__, + array( 'ORDER BY' => 'img_name' ) ); $i = 0; - print "Writing image description pages for local images\n"; $num = $dbr->numRows( $res ); while ( $row = $dbr->fetchObject( $res ) ) { wfWaitForSlaves( 10 ); if ( !( ++$i % REPORTING_INTERVAL ) ) { print "Done $i of $num\r"; + if ( $row->img_name !== 'done' ) { + $this->setCheckpoint( 'local image', $row->img_name ); + } } $title = Title::makeTitle( NS_IMAGE, $row->img_name ); if ( $title->getArticleID() ) { @@ -158,14 +253,31 @@ class DumpHTML { } $this->doArticle( $title ); } + $this->setCheckpoint( 'local image', 'done' ); print "\n"; + } + + /** + * Dump images which only have a real description page on commons + */ + function doSharedImageDescriptions() { + $cp = $this->getCheckpoint( 'shared image' ); + if ( $cp == 'done' ) { + print "Shared description pages already done\n"; + return; + } elseif ( $cp !== false ) { + print "Writing description pages for commons images starting from directory $cp/255\n"; + $start = $cp; + } else { + print "Writing description pages for commons images\n"; + $start = 0; + } - /** - * Dump images which only have a real description page on commons - */ - print "Writing description pages for commons images\n"; + $this->setupGlobals(); $i = 0; - for ( $hash = 0; $hash < 256; $hash++ ) { + for ( $hash = $start; $hash < 256; $hash++ ) { + $this->setCheckpoint( 'shared image', $hash ); + $dir = sprintf( "%01x/%02x", intval( $hash / 16 ), $hash ); $paths = array_merge( glob( "{$this->sharedStaticDirectory}/$dir/*" ), glob( "{$this->sharedStaticDirectory}/thumb/$dir/*" ) ); @@ -180,16 +292,27 @@ class DumpHTML { $this->doArticle( $title ); } } + $this->setCheckpoint( 'shared image', 'done' ); print "\n"; } function doCategories() { $fname = 'DumpHTML::doCategories'; $this->setupGlobals(); - $dbr =& wfGetDB( DB_SLAVE ); - print "Selecting categories..."; $sql = 'SELECT DISTINCT cl_to FROM ' . $dbr->tableName( 'categorylinks' ); + + $cp = $this->getCheckpoint( 'category' ); + if ( $cp == 'done' ) { + print "Category pages already done\n"; + return; + } elseif ( $cp !== false ) { + print "Resuming category page dump from $cp"; + $sql .= ' WHERE cl_to >= ' . $dbr->addQuotes( $cp ); + } + + $sql .= ' ORDER BY cl_to'; + print "Selecting categories..."; $res = $dbr->query( $sql, $fname ); print "\nWriting " . $dbr->numRows( $res ). " category pages\n"; @@ -198,31 +321,47 @@ class DumpHTML { wfWaitForSlaves( 10 ); if ( !(++$i % REPORTING_INTERVAL ) ) { print "$i\r"; + if ( $row->cl_to != 'done' ) { + $this->setCheckpoint( 'category', $row->cl_to ); + } } $title = Title::makeTitle( NS_CATEGORY, $row->cl_to ); $this->doArticle( $title ); } + $this->setCheckpoint( 'category', 'done' ); print "\n"; } function doRedirects() { print "Doing redirects...\n"; $fname = 'DumpHTML::doRedirects'; + $conds = array( 'page_is_redirect' => 1 ); + + $cp = $this->getCheckpoint( 'redirect' ); + if ( $cp == 'done' ) { + print "Redirects already done\n"; + return; + } elseif ( $cp !== false ) { + print "Resuming redirect generation from page_id $cp\n"; + $conds[] = 'page_id > ' . intval( $cp ); + } + $this->setupGlobals(); $dbr =& wfGetDB( DB_SLAVE ); - - $res = $dbr->select( 'page', array( 'page_namespace', 'page_title' ), - array( 'page_is_redirect' => 1 ), $fname ); + $res = $dbr->select( 'page', array( 'page_id', 'page_namespace', 'page_title' ), + $conds, $fname ); $num = $dbr->numRows( $res ); print "$num redirects to do...\n"; $i = 0; while ( $row = $dbr->fetchObject( $res ) ) { $title = Title::makeTitle( $row->page_namespace, $row->page_title ); - if ( !(++$i % (REPORTING_INTERVAL*10) ) ) { - print "Done $i of $num\n"; - } + if ( !(++$i % (REPORTING_INTERVAL*10) ) ) { + print "Done $i of $num\n"; + $this->setCheckpoint( 'redirect', $row->page_id ); + } $this->doArticle( $title ); } + $this->setCheckpoint( 'redirect', 'done' ); } /** Write an article specified by title */ diff --git a/maintenance/dumpHTML.php b/maintenance/dumpHTML.php index 608893f700..167042f943 100644 --- a/maintenance/dumpHTML.php +++ b/maintenance/dumpHTML.php @@ -9,21 +9,22 @@ * Usage: * php dumpHTML.php [options...] * - * -d destination directory - * -s start ID - * -e end ID - * -k skin to use (defaults to htmldump) - * --images only do image description pages - * --categories only do category pages - * --redirects only do redirects - * --special only do miscellaneous stuff - * --force-copy copy commons instead of symlink, needed for Wikimedia - * --interlang allow interlanguage links - * --image-snapshot copy all images used to the destination directory + * -d destination directory + * -s start ID + * -e end ID + * -k skin to use (defaults to htmldump) + * --checkpoint use a checkpoint file to allow restarting of interrupted dumps + * --images only do image description pages + * --categories only do category pages + * --redirects only do redirects + * --special only do miscellaneous stuff + * --force-copy copy commons instead of symlink, needed for Wikimedia + * --interlang allow interlanguage links + * --image-snapshot copy all images used to the destination directory */ -$optionsWithArgs = array( 's', 'd', 'e', 'k' ); +$optionsWithArgs = array( 's', 'd', 'e', 'k', 'checkpoint' ); $profiling = false; @@ -59,7 +60,7 @@ if ( !empty( $options['e'] ) ) { if ( !empty( $options['d'] ) ) { $dest = $options['d']; } else { - $dest = 'static'; + $dest = "$IP/static"; } $skin = isset( $options['k'] ) ? $options['k'] : 'htmldump'; @@ -71,6 +72,9 @@ $wgHTMLDump = new DumpHTML( array( 'interwiki' => $options['interlang'], 'skin' => $skin, 'makeSnapshot' => $options['image-snapshot'], + 'checkpointFile' => $options['checkpoint'], + 'startID' => $start, + 'endID' => $end )); @@ -83,43 +87,16 @@ if ( $options['special'] ) { } elseif ( $options['redirects'] ) { $wgHTMLDump->doRedirects(); } else { - print("Creating static HTML dump in directory $dest. \n". - "Starting from page_id $start of $end.\n"); - + print "Creating static HTML dump in directory $dest. \n"; $dbr =& wfGetDB( DB_SLAVE ); $server = $dbr->getProperty( 'mServer' ); print "Using database {$server}\n"; - $wgHTMLDump->doArticles( $start, $end ); if ( !isset( $options['e'] ) ) { - $wgHTMLDump->doImageDescriptions(); - $wgHTMLDump->doCategories(); - $wgHTMLDump->doSpecials(); - } - - /* - if ( $end - $start > CHUNK_SIZE * 2 ) { - // Split the problem into smaller chunks, run them in different PHP instances - // This is a memory/resource leak workaround - print("Creating static HTML dump in directory $dest. \n". - "Starting from page_id $start of $end.\n"); - - chdir( "maintenance" ); - for ( $chunkStart = $start; $chunkStart < $end; $chunkStart += CHUNK_SIZE ) { - $chunkEnd = $chunkStart + CHUNK_SIZE - 1; - if ( $chunkEnd > $end ) { - $chunkEnd = $end; - } - passthru( "php dumpHTML.php -d " . wfEscapeShellArg( $dest ) . " -s $chunkStart -e $chunkEnd" ); - } - chdir( ".." ); - $d->doImageDescriptions(); - $d->doCategories(); - $d->doMainPage( $dest ); + $wgHTMLDump->doEverything(); } else { - $d->doArticles( $start, $end ); + $wgHTMLDump->doArticles(); } - */ } if ( isset( $options['debug'] ) ) {