From 4abb45939234fe94ce1a78f3ef171a01b2e40511 Mon Sep 17 00:00:00 2001 From: "This, that and the other" Date: Thu, 31 Dec 2015 20:46:54 +1100 Subject: [PATCH] Refactor dumpBackup.php and dumpTextPass.php to be Maintenance subclasses Use the Maintenance class's new $orderedOptions and support for passing options multiple times. This allows for option "chaining". The BackupDumper and TextPassDumper class now extend Maintenance, but should continue to function as before. The public function processArgs() has been removed and replaced by processOptions(), which takes no parameters. It is unlikely that users of these classes were calling processArgs. Inheritors of these classes that overrode processOption() will now need to override processOptions() and use Maintenance::getOption() and friends. The maintenance/backupTextPass.inc file has been deleted. Users should include maintenance/dumpTextPass.php instead. Bug: T122587 Change-Id: I2473ee119c185d1b2b00ac4b1e70ee8a6cafe4a3 --- RELEASE-NOTES-1.27 | 4 + autoload.php | 5 +- includes/export/DumpDBZip2Output.php | 36 + maintenance/backup.inc | 181 ++-- maintenance/backupTextPass.inc | 925 ----------------- maintenance/dumpBackup.php | 187 ++-- maintenance/dumpTextPass.php | 982 +++++++++++++++++- .../maintenance/backupTextPassTest.php | 26 +- tests/phpunit/maintenance/backup_LogTest.php | 16 +- tests/phpunit/maintenance/backup_PageTest.php | 23 +- 10 files changed, 1245 insertions(+), 1140 deletions(-) create mode 100644 includes/export/DumpDBZip2Output.php delete mode 100644 maintenance/backupTextPass.inc diff --git a/RELEASE-NOTES-1.27 b/RELEASE-NOTES-1.27 index 1b74f52ed4..9860723f6f 100644 --- a/RELEASE-NOTES-1.27 +++ b/RELEASE-NOTES-1.27 @@ -192,6 +192,10 @@ changes to languages because of Phabricator reports. * User::editToken() was removed (deprecated since 1.19). * Removed --force-normal option of dumpBackup.php, as it no longer served any useful purpose since 1.22. 
+* The functions processOption() and processArgs() on the BackupDumper and + TextPassDumper classes have been removed. +* The maintenance/backupTextPass.inc file was deleted. You should include + maintenance/dumpTextPass.php instead. == Compatibility == diff --git a/autoload.php b/autoload.php index ac38fa5586..92c5436ea0 100644 --- a/autoload.php +++ b/autoload.php @@ -354,7 +354,8 @@ $wgAutoloadLocalClasses = array( 'DummyTermColorer' => __DIR__ . '/maintenance/term/MWTerm.php', 'Dump7ZipOutput' => __DIR__ . '/includes/export/Dump7ZipOutput.php', 'DumpBZip2Output' => __DIR__ . '/includes/export/DumpBZip2Output.php', - 'DumpDBZip2Output' => __DIR__ . '/maintenance/backup.inc', + 'DumpBackup' => __DIR__ . '/maintenance/dumpBackup.php', + 'DumpDBZip2Output' => __DIR__ . '/includes/export/DumpDBZip2Output.php', 'DumpFileOutput' => __DIR__ . '/includes/export/DumpFileOutput.php', 'DumpFilter' => __DIR__ . '/includes/export/DumpFilter.php', 'DumpGZipOutput' => __DIR__ . '/includes/export/DumpGZipOutput.php', @@ -1252,7 +1253,7 @@ $wgAutoloadLocalClasses = array( 'TestFileOpPerformance' => __DIR__ . '/maintenance/fileOpPerfTest.php', 'TextContent' => __DIR__ . '/includes/content/TextContent.php', 'TextContentHandler' => __DIR__ . '/includes/content/TextContentHandler.php', - 'TextPassDumper' => __DIR__ . '/maintenance/backupTextPass.inc', + 'TextPassDumper' => __DIR__ . '/maintenance/dumpTextPass.php', 'TextStatsOutput' => __DIR__ . '/maintenance/language/StatOutputs.php', 'TgConverter' => __DIR__ . '/languages/classes/LanguageTg.php', 'ThrottledError' => __DIR__ . 
'/includes/exception/ThrottledError.php', diff --git a/includes/export/DumpDBZip2Output.php b/includes/export/DumpDBZip2Output.php new file mode 100644 index 0000000000..5edde8f745 --- /dev/null +++ b/includes/export/DumpDBZip2Output.php @@ -0,0 +1,36 @@ + + * https://www.mediawiki.org/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @file + */ + +/** + * @ingroup Dump + */ +class DumpDBZip2Output extends DumpPipeOutput { + /** + * @param string $file + */ + function __construct( $file ) { + parent::__construct( "dbzip2", $file ); + } +} diff --git a/maintenance/backup.inc b/maintenance/backup.inc index 93010ae89b..ec59c601c9 100644 --- a/maintenance/backup.inc +++ b/maintenance/backup.inc @@ -24,19 +24,13 @@ * @ingroup Dump Maintenance */ -/** - * @ingroup Dump Maintenance - */ -class DumpDBZip2Output extends DumpPipeOutput { - function __construct( $file ) { - parent::__construct( "dbzip2", $file ); - } -} +require_once __DIR__ . '/Maintenance.php'; +require_once __DIR__ . 
'/../includes/export/DumpFilter.php'; /** * @ingroup Dump Maintenance */ -class BackupDumper { +class BackupDumper extends Maintenance { public $reporting = true; public $pages = null; // all pages public $skipHeader = false; // don't output and @@ -67,7 +61,7 @@ class BackupDumper { * * @var DatabaseBase|null * - * @see self::setDb + * @see self::setDB */ protected $forcedDb = null; @@ -77,7 +71,11 @@ class BackupDumper { // @todo Unused? private $stubText = false; // include rev_text_id instead of text; for 2-pass dump - function __construct( $args ) { + /** + * @param array $args For backward compatibility + */ + function __construct( $args = null ) { + parent::__construct(); $this->stderr = fopen( "php://stderr", "wt" ); // Built-in output and filter plugins @@ -91,7 +89,23 @@ class BackupDumper { $this->registerFilter( 'notalk', 'DumpNotalkFilter' ); $this->registerFilter( 'namespace', 'DumpNamespaceFilter' ); - $this->sink = $this->processArgs( $args ); + // These three can be specified multiple times + $this->addOption( 'plugin', 'Load a dump plugin class. Specify as [:].', + false, true, false, true ); + $this->addOption( 'output', 'Begin a filtered output stream; Specify as :. ' . + 's: file, gzip, bzip2, 7zip, dbzip2', false, true, false, true ); + $this->addOption( 'filter', 'Add a filter on an output branch. Specify as ' . + '[:]. s: latest, notalk, namespace', false, true, false, true ); + $this->addOption( 'report', 'Report position and speed after every n pages processed. ' . 
+ 'Default: 100.', false, true ); + $this->addOption( 'server', 'Force reading from MySQL server', false, true ); + + if ( $args ) { + // Args should be loaded and processed so that dump() can be called directly + // instead of execute() + $this->loadWithArgv( $args ); + $this->processOptions(); + } } /** @@ -125,77 +139,102 @@ class BackupDumper { call_user_func_array( $register, array( &$this ) ); } + function execute() { + throw new MWException( 'execute() must be overridden in subclasses' ); + } + /** - * @param array $args - * @return array + * Processes arguments and sets $this->$sink accordingly */ - function processArgs( $args ) { + function processOptions() { $sink = null; $sinks = array(); - foreach ( $args as $arg ) { - $matches = array(); - if ( preg_match( '/^--(.+?)(?:=(.+?)(?::(.+?))?)?$/', $arg, $matches ) ) { - MediaWiki\suppressWarnings(); - list( /* $full */, $opt, $val, $param ) = $matches; - MediaWiki\restoreWarnings(); - - switch ( $opt ) { - case "plugin": - $this->loadPlugin( $val, $param ); - break; - case "output": - if ( !is_null( $sink ) ) { - $sinks[] = $sink; - } - if ( !isset( $this->outputTypes[$val] ) ) { - $this->fatalError( "Unrecognized output sink type '$val'" ); - } - $type = $this->outputTypes[$val]; - $sink = new $type( $param ); - break; - case "filter": - if ( is_null( $sink ) ) { - $sink = new DumpOutput(); - } - if ( !isset( $this->filterTypes[$val] ) ) { - $this->fatalError( "Unrecognized filter type '$val'" ); - } - $type = $this->filterTypes[$val]; - $filter = new $type( $sink, $param ); - - // references are lame in php... 
- unset( $sink ); - $sink = $filter; - - break; - case "report": - $this->reportingInterval = intval( $val ); - break; - case "server": - $this->server = $val; - break; - default: - $this->processOption( $opt, $val, $param ); - } + + $options = $this->orderedOptions; + foreach ( $options as $arg ) { + $opt = $arg[0]; + $param = $arg[1]; + + switch ( $opt ) { + case 'plugin': + $val = explode( ':', $param ); + + if ( count( $val ) === 1 ) { + $this->loadPlugin( $val[0] ); + } elseif ( count( $val ) === 2 ) { + $this->loadPlugin( $val[0], $val[1] ); + } else { + $this->fatalError( 'Invalid plugin parameter' ); + return; + } + + break; + case 'output': + $split = explode( ':', $param, 2 ); + if ( count( $split ) !== 2 ) { + $this->fatalError( 'Invalid output parameter' ); + } + list( $type, $file ) = $split; + if ( !is_null( $sink ) ) { + $sinks[] = $sink; + } + if ( !isset( $this->outputTypes[$type] ) ) { + $this->fatalError( "Unrecognized output sink type '$type'" ); + } + $class = $this->outputTypes[$type]; + $sink = new $class( $file ); + + break; + case 'filter': + if ( is_null( $sink ) ) { + $sink = new DumpOutput(); + } + + $split = explode( ':', $param ); + $key = $split[0]; + + if ( !isset( $this->filterTypes[$key] ) ) { + $this->fatalError( "Unrecognized filter type '$key'" ); + } + + $type = $this->filterTypes[$key]; + + if ( count( $split ) === 1 ) { + $filter = new $type( $sink ); + } elseif ( count( $split ) === 2 ) { + $filter = new $type( $sink, $split[1] ); + } else { + $this->fatalError( 'Invalid filter parameter' ); + } + + // references are lame in php... 
+ unset( $sink ); + $sink = $filter; + + break; } } + if ( $this->hasOption( 'report' ) ) { + $this->reportingInterval = intval( $this->getOption( 'report' ) ); + } + + if ( $this->hasOption( 'server' ) ) { + $this->server = $this->getOption( 'server' ); + } + if ( is_null( $sink ) ) { $sink = new DumpOutput(); } $sinks[] = $sink; if ( count( $sinks ) > 1 ) { - return new DumpMultiWriter( $sinks ); + $this->sink = new DumpMultiWriter( $sinks ); } else { - return $sink; + $this->sink = $sink; } } - function processOption( $opt, $val, $param ) { - // extension point for subclasses to add options - } - function dump( $history, $text = WikiExporter::TEXT ) { # Notice messages will foul up your XML output even if they're # relatively harmless. @@ -292,7 +331,8 @@ class BackupDumper { * @param DatabaseBase|null $db (Optional) the database connection to use. If null, resort to * use the globally provided ways to get database connections. */ - function setDb( DatabaseBase $db = null ) { + function setDB( IDatabase $db = null ) { + parent::setDB( $db ); $this->forcedDb = $db; } @@ -365,12 +405,13 @@ class BackupDumper { } function progress( $string ) { - fwrite( $this->stderr, $string . "\n" ); + if ( $this->reporting ) { + fwrite( $this->stderr, $string . "\n" ); + } } function fatalError( $msg ) { - $this->progress( "$msg\n" ); - die( 1 ); + $this->error( "$msg\n", 1 ); } } diff --git a/maintenance/backupTextPass.inc b/maintenance/backupTextPass.inc deleted file mode 100644 index 0562333225..0000000000 --- a/maintenance/backupTextPass.inc +++ /dev/null @@ -1,925 +0,0 @@ - - * https://www.mediawiki.org/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - * http://www.gnu.org/copyleft/gpl.html - * - * @file - * @ingroup Maintenance - */ - -require_once __DIR__ . '/backup.inc'; - -/** - * @ingroup Maintenance - */ -class TextPassDumper extends BackupDumper { - public $prefetch = null; - - // when we spend more than maxTimeAllowed seconds on this run, we continue - // processing until we write out the next complete page, then save output file(s), - // rename it/them and open new one(s) - public $maxTimeAllowed = 0; // 0 = no limit - - protected $input = "php://stdin"; - protected $history = WikiExporter::FULL; - protected $fetchCount = 0; - protected $prefetchCount = 0; - protected $prefetchCountLast = 0; - protected $fetchCountLast = 0; - - protected $maxFailures = 5; - protected $maxConsecutiveFailedTextRetrievals = 200; - protected $failureTimeout = 5; // Seconds to sleep after db failure - - protected $bufferSize = 524288; // In bytes. Maximum size to read from the stub in on go. 
- - protected $php = "php"; - protected $spawn = false; - - /** - * @var bool|resource - */ - protected $spawnProc = false; - - /** - * @var bool|resource - */ - protected $spawnWrite = false; - - /** - * @var bool|resource - */ - protected $spawnRead = false; - - /** - * @var bool|resource - */ - protected $spawnErr = false; - - protected $xmlwriterobj = false; - - protected $timeExceeded = false; - protected $firstPageWritten = false; - protected $lastPageWritten = false; - protected $checkpointJustWritten = false; - protected $checkpointFiles = array(); - - /** - * @var DatabaseBase - */ - protected $db; - - /** - * Drop the database connection $this->db and try to get a new one. - * - * This function tries to get a /different/ connection if this is - * possible. Hence, (if this is possible) it switches to a different - * failover upon each call. - * - * This function resets $this->lb and closes all connections on it. - * - * @throws MWException - */ - function rotateDb() { - // Cleaning up old connections - if ( isset( $this->lb ) ) { - $this->lb->closeAll(); - unset( $this->lb ); - } - - if ( $this->forcedDb !== null ) { - $this->db = $this->forcedDb; - - return; - } - - if ( isset( $this->db ) && $this->db->isOpen() ) { - throw new MWException( 'DB is set and has not been closed by the Load Balancer' ); - } - - unset( $this->db ); - - // Trying to set up new connection. - // We do /not/ retry upon failure, but delegate to encapsulating logic, to avoid - // individually retrying at different layers of code. - - // 1. The LoadBalancer. - try { - $this->lb = wfGetLBFactory()->newMainLB(); - } catch ( Exception $e ) { - throw new MWException( __METHOD__ - . " rotating DB failed to obtain new load balancer (" . $e->getMessage() . ")" ); - } - - // 2. The Connection, through the load balancer. - try { - $this->db = $this->lb->getConnection( DB_SLAVE, 'dump' ); - } catch ( Exception $e ) { - throw new MWException( __METHOD__ - . 
" rotating DB failed to obtain new database (" . $e->getMessage() . ")" ); - } - } - - function initProgress( $history = WikiExporter::FULL ) { - parent::initProgress(); - $this->timeOfCheckpoint = $this->startTime; - } - - function dump( $history, $text = WikiExporter::TEXT ) { - // Notice messages will foul up your XML output even if they're - // relatively harmless. - if ( ini_get( 'display_errors' ) ) { - ini_set( 'display_errors', 'stderr' ); - } - - $this->initProgress( $this->history ); - - // We are trying to get an initial database connection to avoid that the - // first try of this request's first call to getText fails. However, if - // obtaining a good DB connection fails it's not a serious issue, as - // getText does retry upon failure and can start without having a working - // DB connection. - try { - $this->rotateDb(); - } catch ( Exception $e ) { - // We do not even count this as failure. Just let eventual - // watchdogs know. - $this->progress( "Getting initial DB connection failed (" . - $e->getMessage() . ")" ); - } - - $this->egress = new ExportProgressFilter( $this->sink, $this ); - - // it would be nice to do it in the constructor, oh well. 
need egress set - $this->finalOptionCheck(); - - // we only want this so we know how to close a stream :-P - $this->xmlwriterobj = new XmlDumpWriter(); - - $input = fopen( $this->input, "rt" ); - $this->readDump( $input ); - - if ( $this->spawnProc ) { - $this->closeSpawn(); - } - - $this->report( true ); - } - - function processOption( $opt, $val, $param ) { - global $IP; - $url = $this->processFileOpt( $val, $param ); - - switch ( $opt ) { - case 'buffersize': - // Lower bound for xml reading buffer size is 4 KB - $this->bufferSize = max( intval( $val ), 4 * 1024 ); - break; - case 'prefetch': - require_once "$IP/maintenance/backupPrefetch.inc"; - $this->prefetch = new BaseDump( $url ); - break; - case 'stub': - $this->input = $url; - break; - case 'maxtime': - $this->maxTimeAllowed = intval( $val ) * 60; - break; - case 'checkpointfile': - $this->checkpointFiles[] = $val; - break; - case 'current': - $this->history = WikiExporter::CURRENT; - break; - case 'full': - $this->history = WikiExporter::FULL; - break; - case 'spawn': - $this->spawn = true; - if ( $val ) { - $this->php = $val; - } - break; - } - } - - function processFileOpt( $val, $param ) { - $fileURIs = explode( ';', $param ); - foreach ( $fileURIs as $URI ) { - switch ( $val ) { - case "file": - $newURI = $URI; - break; - case "gzip": - $newURI = "compress.zlib://$URI"; - break; - case "bzip2": - $newURI = "compress.bzip2://$URI"; - break; - case "7zip": - $newURI = "mediawiki.compress.7z://$URI"; - break; - default: - $newURI = $URI; - } - $newFileURIs[] = $newURI; - } - $val = implode( ';', $newFileURIs ); - - return $val; - } - - /** - * Overridden to include prefetch ratio if enabled. 
- */ - function showReport() { - if ( !$this->prefetch ) { - parent::showReport(); - - return; - } - - if ( $this->reporting ) { - $now = wfTimestamp( TS_DB ); - $nowts = microtime( true ); - $deltaAll = $nowts - $this->startTime; - $deltaPart = $nowts - $this->lastTime; - $this->pageCountPart = $this->pageCount - $this->pageCountLast; - $this->revCountPart = $this->revCount - $this->revCountLast; - - if ( $deltaAll ) { - $portion = $this->revCount / $this->maxCount; - $eta = $this->startTime + $deltaAll / $portion; - $etats = wfTimestamp( TS_DB, intval( $eta ) ); - if ( $this->fetchCount ) { - $fetchRate = 100.0 * $this->prefetchCount / $this->fetchCount; - } else { - $fetchRate = '-'; - } - $pageRate = $this->pageCount / $deltaAll; - $revRate = $this->revCount / $deltaAll; - } else { - $pageRate = '-'; - $revRate = '-'; - $etats = '-'; - $fetchRate = '-'; - } - if ( $deltaPart ) { - if ( $this->fetchCountLast ) { - $fetchRatePart = 100.0 * $this->prefetchCountLast / $this->fetchCountLast; - } else { - $fetchRatePart = '-'; - } - $pageRatePart = $this->pageCountPart / $deltaPart; - $revRatePart = $this->revCountPart / $deltaPart; - } else { - $fetchRatePart = '-'; - $pageRatePart = '-'; - $revRatePart = '-'; - } - $this->progress( sprintf( - "%s: %s (ID %d) %d pages (%0.1f|%0.1f/sec all|curr), " - . "%d revs (%0.1f|%0.1f/sec all|curr), %0.1f%%|%0.1f%% " - . 
"prefetched (all|curr), ETA %s [max %d]", - $now, wfWikiID(), $this->ID, $this->pageCount, $pageRate, - $pageRatePart, $this->revCount, $revRate, $revRatePart, - $fetchRate, $fetchRatePart, $etats, $this->maxCount - ) ); - $this->lastTime = $nowts; - $this->revCountLast = $this->revCount; - $this->prefetchCountLast = $this->prefetchCount; - $this->fetchCountLast = $this->fetchCount; - } - } - - function setTimeExceeded() { - $this->timeExceeded = true; - } - - function checkIfTimeExceeded() { - if ( $this->maxTimeAllowed - && ( $this->lastTime - $this->timeOfCheckpoint > $this->maxTimeAllowed ) - ) { - return true; - } - - return false; - } - - function finalOptionCheck() { - if ( ( $this->checkpointFiles && !$this->maxTimeAllowed ) - || ( $this->maxTimeAllowed && !$this->checkpointFiles ) - ) { - throw new MWException( "Options checkpointfile and maxtime must be specified together.\n" ); - } - foreach ( $this->checkpointFiles as $checkpointFile ) { - $count = substr_count( $checkpointFile, "%s" ); - if ( $count != 2 ) { - throw new MWException( "Option checkpointfile must contain two '%s' " - . "for substitution of first and last pageids, count is $count instead, " - . "file is $checkpointFile.\n" ); - } - } - - if ( $this->checkpointFiles ) { - $filenameList = (array)$this->egress->getFilenames(); - if ( count( $filenameList ) != count( $this->checkpointFiles ) ) { - throw new MWException( "One checkpointfile must be specified " - . 
"for each output option, if maxtime is used.\n" ); - } - } - } - - /** - * @throws MWException Failure to parse XML input - * @param string $input - * @return bool - */ - function readDump( $input ) { - $this->buffer = ""; - $this->openElement = false; - $this->atStart = true; - $this->state = ""; - $this->lastName = ""; - $this->thisPage = 0; - $this->thisRev = 0; - $this->thisRevModel = null; - $this->thisRevFormat = null; - - $parser = xml_parser_create( "UTF-8" ); - xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false ); - - xml_set_element_handler( - $parser, - array( &$this, 'startElement' ), - array( &$this, 'endElement' ) - ); - xml_set_character_data_handler( $parser, array( &$this, 'characterData' ) ); - - $offset = 0; // for context extraction on error reporting - do { - if ( $this->checkIfTimeExceeded() ) { - $this->setTimeExceeded(); - } - $chunk = fread( $input, $this->bufferSize ); - if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) { - wfDebug( "TextDumpPass::readDump encountered XML parsing error\n" ); - - $byte = xml_get_current_byte_index( $parser ); - $msg = wfMessage( 'xml-error-string', - 'XML import parse failure', - xml_get_current_line_number( $parser ), - xml_get_current_column_number( $parser ), - $byte . ( is_null( $chunk ) ? null : ( '; "' . substr( $chunk, $byte - $offset, 16 ) . '"' ) ), - xml_error_string( xml_get_error_code( $parser ) ) )->escaped(); - - xml_parser_free( $parser ); - - throw new MWException( $msg ); - } - $offset += strlen( $chunk ); - } while ( $chunk !== false && !feof( $input ) ); - if ( $this->maxTimeAllowed ) { - $filenameList = (array)$this->egress->getFilenames(); - // we wrote some stuff after last checkpoint that needs renamed - if ( file_exists( $filenameList[0] ) ) { - $newFilenames = array(); - # we might have just written the header and footer and had no - # pages or revisions written... perhaps they were all deleted - # there's no pageID 0 so we use that. 
the caller is responsible - # for deciding what to do with a file containing only the - # siteinfo information and the mw tags. - if ( !$this->firstPageWritten ) { - $firstPageID = str_pad( 0, 9, "0", STR_PAD_LEFT ); - $lastPageID = str_pad( 0, 9, "0", STR_PAD_LEFT ); - } else { - $firstPageID = str_pad( $this->firstPageWritten, 9, "0", STR_PAD_LEFT ); - $lastPageID = str_pad( $this->lastPageWritten, 9, "0", STR_PAD_LEFT ); - } - - $filenameCount = count( $filenameList ); - for ( $i = 0; $i < $filenameCount; $i++ ) { - $checkpointNameFilledIn = sprintf( $this->checkpointFiles[$i], $firstPageID, $lastPageID ); - $fileinfo = pathinfo( $filenameList[$i] ); - $newFilenames[] = $fileinfo['dirname'] . '/' . $checkpointNameFilledIn; - } - $this->egress->closeAndRename( $newFilenames ); - } - } - xml_parser_free( $parser ); - - return true; - } - - /** - * Applies applicable export transformations to $text. - * - * @param string $text - * @param string $model - * @param string|null $format - * - * @return string - */ - private function exportTransform( $text, $model, $format = null ) { - try { - $handler = ContentHandler::getForModelID( $model ); - $text = $handler->exportTransform( $text, $format ); - } - catch ( MWException $ex ) { - $this->progress( - "Unable to apply export transformation for content model '$model': " . - $ex->getMessage() - ); - } - - return $text; - } - - /** - * Tries to get the revision text for a revision id. - * Export transformations are applied if the content model can is given or can be - * determined from the database. - * - * Upon errors, retries (Up to $this->maxFailures tries each call). - * If still no good revision get could be found even after this retrying, "" is returned. - * If no good revision text could be returned for - * $this->maxConsecutiveFailedTextRetrievals consecutive calls to getText, MWException - * is thrown. 
- * - * @param string $id The revision id to get the text for - * @param string|bool|null $model The content model used to determine - * applicable export transformations. - * If $model is null, it will be determined from the database. - * @param string|null $format The content format used when applying export transformations. - * - * @throws MWException - * @return string The revision text for $id, or "" - */ - function getText( $id, $model = null, $format = null ) { - global $wgContentHandlerUseDB; - - $prefetchNotTried = true; // Whether or not we already tried to get the text via prefetch. - $text = false; // The candidate for a good text. false if no proper value. - $failures = 0; // The number of times, this invocation of getText already failed. - - // The number of times getText failed without yielding a good text in between. - static $consecutiveFailedTextRetrievals = 0; - - $this->fetchCount++; - - // To allow to simply return on success and do not have to worry about book keeping, - // we assume, this fetch works (possible after some retries). Nevertheless, we koop - // the old value, so we can restore it, if problems occur (See after the while loop). - $oldConsecutiveFailedTextRetrievals = $consecutiveFailedTextRetrievals; - $consecutiveFailedTextRetrievals = 0; - - if ( $model === null && $wgContentHandlerUseDB ) { - $row = $this->db->selectRow( - 'revision', - array( 'rev_content_model', 'rev_content_format' ), - array( 'rev_id' => $this->thisRev ), - __METHOD__ - ); - - if ( $row ) { - $model = $row->rev_content_model; - $format = $row->rev_content_format; - } - } - - if ( $model === null || $model === '' ) { - $model = false; - } - - while ( $failures < $this->maxFailures ) { - - // As soon as we found a good text for the $id, we will return immediately. - // Hence, if we make it past the try catch block, we know that we did not - // find a good text. 
- - try { - // Step 1: Get some text (or reuse from previous iteratuon if checking - // for plausibility failed) - - // Trying to get prefetch, if it has not been tried before - if ( $text === false && isset( $this->prefetch ) && $prefetchNotTried ) { - $prefetchNotTried = false; - $tryIsPrefetch = true; - $text = $this->prefetch->prefetch( intval( $this->thisPage ), - intval( $this->thisRev ) ); - - if ( $text === null ) { - $text = false; - } - - if ( is_string( $text ) && $model !== false ) { - // Apply export transformation to text coming from an old dump. - // The purpose of this transformation is to convert up from legacy - // formats, which may still be used in the older dump that is used - // for pre-fetching. Applying the transformation again should not - // interfere with content that is already in the correct form. - $text = $this->exportTransform( $text, $model, $format ); - } - } - - if ( $text === false ) { - // Fallback to asking the database - $tryIsPrefetch = false; - if ( $this->spawn ) { - $text = $this->getTextSpawned( $id ); - } else { - $text = $this->getTextDb( $id ); - } - - if ( $text !== false && $model !== false ) { - // Apply export transformation to text coming from the database. - // Prefetched text should already have transformations applied. - $text = $this->exportTransform( $text, $model, $format ); - } - - // No more checks for texts from DB for now. - // If we received something that is not false, - // We treat it as good text, regardless of whether it actually is or is not - if ( $text !== false ) { - return $text; - } - } - - if ( $text === false ) { - throw new MWException( "Generic error while obtaining text for id " . 
$id ); - } - - // We received a good candidate for the text of $id via some method - - // Step 2: Checking for plausibility and return the text if it is - // plausible - $revID = intval( $this->thisRev ); - if ( !isset( $this->db ) ) { - throw new MWException( "No database available" ); - } - - if ( $model !== CONTENT_MODEL_WIKITEXT ) { - $revLength = strlen( $text ); - } else { - $revLength = $this->db->selectField( 'revision', 'rev_len', array( 'rev_id' => $revID ) ); - } - - if ( strlen( $text ) == $revLength ) { - if ( $tryIsPrefetch ) { - $this->prefetchCount++; - } - - return $text; - } - - $text = false; - throw new MWException( "Received text is unplausible for id " . $id ); - } catch ( Exception $e ) { - $msg = "getting/checking text " . $id . " failed (" . $e->getMessage() . ")"; - if ( $failures + 1 < $this->maxFailures ) { - $msg .= " (Will retry " . ( $this->maxFailures - $failures - 1 ) . " more times)"; - } - $this->progress( $msg ); - } - - // Something went wrong; we did not a text that was plausible :( - $failures++; - - // A failure in a prefetch hit does not warrant resetting db connection etc. - if ( !$tryIsPrefetch ) { - // After backing off for some time, we try to reboot the whole process as - // much as possible to not carry over failures from one part to the other - // parts - sleep( $this->failureTimeout ); - try { - $this->rotateDb(); - if ( $this->spawn ) { - $this->closeSpawn(); - $this->openSpawn(); - } - } catch ( Exception $e ) { - $this->progress( "Rebooting getText infrastructure failed (" . $e->getMessage() . ")" . - " Trying to continue anyways" ); - } - } - } - - // Retirieving a good text for $id failed (at least) maxFailures times. - // We abort for this $id. - - // Restoring the consecutive failures, and maybe aborting, if the dump - // is too broken. 
- $consecutiveFailedTextRetrievals = $oldConsecutiveFailedTextRetrievals + 1; - if ( $consecutiveFailedTextRetrievals > $this->maxConsecutiveFailedTextRetrievals ) { - throw new MWException( "Graceful storage failure" ); - } - - return ""; - } - - /** - * May throw a database error if, say, the server dies during query. - * @param int $id - * @return bool|string - * @throws MWException - */ - private function getTextDb( $id ) { - global $wgContLang; - if ( !isset( $this->db ) ) { - throw new MWException( __METHOD__ . "No database available" ); - } - $row = $this->db->selectRow( 'text', - array( 'old_text', 'old_flags' ), - array( 'old_id' => $id ), - __METHOD__ ); - $text = Revision::getRevisionText( $row ); - if ( $text === false ) { - return false; - } - $stripped = str_replace( "\r", "", $text ); - $normalized = $wgContLang->normalize( $stripped ); - - return $normalized; - } - - private function getTextSpawned( $id ) { - MediaWiki\suppressWarnings(); - if ( !$this->spawnProc ) { - // First time? - $this->openSpawn(); - } - $text = $this->getTextSpawnedOnce( $id ); - MediaWiki\restoreWarnings(); - - return $text; - } - - function openSpawn() { - global $IP; - - if ( file_exists( "$IP/../multiversion/MWScript.php" ) ) { - $cmd = implode( " ", - array_map( 'wfEscapeShellArg', - array( - $this->php, - "$IP/../multiversion/MWScript.php", - "fetchText.php", - '--wiki', wfWikiID() ) ) ); - } else { - $cmd = implode( " ", - array_map( 'wfEscapeShellArg', - array( - $this->php, - "$IP/maintenance/fetchText.php", - '--wiki', wfWikiID() ) ) ); - } - $spec = array( - 0 => array( "pipe", "r" ), - 1 => array( "pipe", "w" ), - 2 => array( "file", "/dev/null", "a" ) ); - $pipes = array(); - - $this->progress( "Spawning database subprocess: $cmd" ); - $this->spawnProc = proc_open( $cmd, $spec, $pipes ); - if ( !$this->spawnProc ) { - $this->progress( "Subprocess spawn failed." 
); - - return false; - } - list( - $this->spawnWrite, // -> stdin - $this->spawnRead, // <- stdout - ) = $pipes; - - return true; - } - - private function closeSpawn() { - MediaWiki\suppressWarnings(); - if ( $this->spawnRead ) { - fclose( $this->spawnRead ); - } - $this->spawnRead = false; - if ( $this->spawnWrite ) { - fclose( $this->spawnWrite ); - } - $this->spawnWrite = false; - if ( $this->spawnErr ) { - fclose( $this->spawnErr ); - } - $this->spawnErr = false; - if ( $this->spawnProc ) { - pclose( $this->spawnProc ); - } - $this->spawnProc = false; - MediaWiki\restoreWarnings(); - } - - private function getTextSpawnedOnce( $id ) { - global $wgContLang; - - $ok = fwrite( $this->spawnWrite, "$id\n" ); - // $this->progress( ">> $id" ); - if ( !$ok ) { - return false; - } - - $ok = fflush( $this->spawnWrite ); - // $this->progress( ">> [flush]" ); - if ( !$ok ) { - return false; - } - - // check that the text id they are sending is the one we asked for - // this avoids out of sync revision text errors we have encountered in the past - $newId = fgets( $this->spawnRead ); - if ( $newId === false ) { - return false; - } - if ( $id != intval( $newId ) ) { - return false; - } - - $len = fgets( $this->spawnRead ); - // $this->progress( "<< " . trim( $len ) ); - if ( $len === false ) { - return false; - } - - $nbytes = intval( $len ); - // actual error, not zero-length text - if ( $nbytes < 0 ) { - return false; - } - - $text = ""; - - // Subprocess may not send everything at once, we have to loop. - while ( $nbytes > strlen( $text ) ) { - $buffer = fread( $this->spawnRead, $nbytes - strlen( $text ) ); - if ( $buffer === false ) { - break; - } - $text .= $buffer; - } - - $gotbytes = strlen( $text ); - if ( $gotbytes != $nbytes ) { - $this->progress( "Expected $nbytes bytes from database subprocess, got $gotbytes " ); - - return false; - } - - // Do normalization in the dump thread... 
- $stripped = str_replace( "\r", "", $text ); - $normalized = $wgContLang->normalize( $stripped ); - - return $normalized; - } - - function startElement( $parser, $name, $attribs ) { - $this->checkpointJustWritten = false; - - $this->clearOpenElement( null ); - $this->lastName = $name; - - if ( $name == 'revision' ) { - $this->state = $name; - $this->egress->writeOpenPage( null, $this->buffer ); - $this->buffer = ""; - } elseif ( $name == 'page' ) { - $this->state = $name; - if ( $this->atStart ) { - $this->egress->writeOpenStream( $this->buffer ); - $this->buffer = ""; - $this->atStart = false; - } - } - - if ( $name == "text" && isset( $attribs['id'] ) ) { - $id = $attribs['id']; - $model = trim( $this->thisRevModel ); - $format = trim( $this->thisRevFormat ); - - $model = $model === '' ? null : $model; - $format = $format === '' ? null : $format; - - $text = $this->getText( $id, $model, $format ); - $this->openElement = array( $name, array( 'xml:space' => 'preserve' ) ); - if ( strlen( $text ) > 0 ) { - $this->characterData( $parser, $text ); - } - } else { - $this->openElement = array( $name, $attribs ); - } - } - - function endElement( $parser, $name ) { - $this->checkpointJustWritten = false; - - if ( $this->openElement ) { - $this->clearOpenElement( "" ); - } else { - $this->buffer .= ""; - } - - if ( $name == 'revision' ) { - $this->egress->writeRevision( null, $this->buffer ); - $this->buffer = ""; - $this->thisRev = ""; - $this->thisRevModel = null; - $this->thisRevFormat = null; - } elseif ( $name == 'page' ) { - if ( !$this->firstPageWritten ) { - $this->firstPageWritten = trim( $this->thisPage ); - } - $this->lastPageWritten = trim( $this->thisPage ); - if ( $this->timeExceeded ) { - $this->egress->writeClosePage( $this->buffer ); - // nasty hack, we can't just write the chardata after the - // page tag, it will include leading blanks from the next line - $this->egress->sink->write( "\n" ); - - $this->buffer = $this->xmlwriterobj->closeStream(); - 
$this->egress->writeCloseStream( $this->buffer ); - - $this->buffer = ""; - $this->thisPage = ""; - // this could be more than one file if we had more than one output arg - - $filenameList = (array)$this->egress->getFilenames(); - $newFilenames = array(); - $firstPageID = str_pad( $this->firstPageWritten, 9, "0", STR_PAD_LEFT ); - $lastPageID = str_pad( $this->lastPageWritten, 9, "0", STR_PAD_LEFT ); - $filenamesCount = count( $filenameList ); - for ( $i = 0; $i < $filenamesCount; $i++ ) { - $checkpointNameFilledIn = sprintf( $this->checkpointFiles[$i], $firstPageID, $lastPageID ); - $fileinfo = pathinfo( $filenameList[$i] ); - $newFilenames[] = $fileinfo['dirname'] . '/' . $checkpointNameFilledIn; - } - $this->egress->closeRenameAndReopen( $newFilenames ); - $this->buffer = $this->xmlwriterobj->openStream(); - $this->timeExceeded = false; - $this->timeOfCheckpoint = $this->lastTime; - $this->firstPageWritten = false; - $this->checkpointJustWritten = true; - } else { - $this->egress->writeClosePage( $this->buffer ); - $this->buffer = ""; - $this->thisPage = ""; - } - } elseif ( $name == 'mediawiki' ) { - $this->egress->writeCloseStream( $this->buffer ); - $this->buffer = ""; - } - } - - function characterData( $parser, $data ) { - $this->clearOpenElement( null ); - if ( $this->lastName == "id" ) { - if ( $this->state == "revision" ) { - $this->thisRev .= $data; - } elseif ( $this->state == "page" ) { - $this->thisPage .= $data; - } - } elseif ( $this->lastName == "model" ) { - $this->thisRevModel .= $data; - } elseif ( $this->lastName == "format" ) { - $this->thisRevFormat .= $data; - } - - // have to skip the newline left over from closepagetag line of - // end of checkpoint files. nasty hack!! 
- if ( $this->checkpointJustWritten ) { - if ( $data[0] == "\n" ) { - $data = substr( $data, 1 ); - } - $this->checkpointJustWritten = false; - } - $this->buffer .= htmlspecialchars( $data ); - } - - function clearOpenElement( $style ) { - if ( $this->openElement ) { - $this->buffer .= Xml::element( $this->openElement[0], $this->openElement[1], $style ); - $this->openElement = false; - } - } -} diff --git a/maintenance/dumpBackup.php b/maintenance/dumpBackup.php index 18c78dcda7..6b5792a7c8 100644 --- a/maintenance/dumpBackup.php +++ b/maintenance/dumpBackup.php @@ -25,107 +25,110 @@ * @ingroup Dump Maintenance */ -$originalDir = getcwd(); - -$optionsWithArgs = array( 'pagelist', 'start', 'end', 'revstart', 'revend' ); - -require_once __DIR__ . '/commandLine.inc'; require_once __DIR__ . '/backup.inc'; -$dumper = new BackupDumper( $argv ); +class DumpBackup extends BackupDumper { + function __construct( $args = null ) { + parent::__construct(); -if ( isset( $options['quiet'] ) ) { - $dumper->reporting = false; -} - -if ( isset( $options['pagelist'] ) ) { - $olddir = getcwd(); - chdir( $originalDir ); - $pages = file( $options['pagelist'] ); - chdir( $olddir ); - if ( $pages === false ) { - echo "Unable to open file {$options['pagelist']}\n"; - die( 1 ); - } - $pages = array_map( 'trim', $pages ); - $dumper->pages = array_filter( $pages, create_function( '$x', 'return $x !== "";' ) ); -} - -if ( isset( $options['start'] ) ) { - $dumper->startId = intval( $options['start'] ); -} -if ( isset( $options['end'] ) ) { - $dumper->endId = intval( $options['end'] ); -} - -if ( isset( $options['revstart'] ) ) { - $dumper->revStartId = intval( $options['revstart'] ); -} -if ( isset( $options['revend'] ) ) { - $dumper->revEndId = intval( $options['revend'] ); -} -$dumper->skipHeader = isset( $options['skip-header'] ); -$dumper->skipFooter = isset( $options['skip-footer'] ); -$dumper->dumpUploads = isset( $options['uploads'] ); -$dumper->dumpUploadFileContents = isset( 
$options['include-files'] ); - -$textMode = isset( $options['stub'] ) ? WikiExporter::STUB : WikiExporter::TEXT; - -if ( isset( $options['full'] ) ) { - $dumper->dump( WikiExporter::FULL, $textMode ); -} elseif ( isset( $options['current'] ) ) { - $dumper->dump( WikiExporter::CURRENT, $textMode ); -} elseif ( isset( $options['stable'] ) ) { - $dumper->dump( WikiExporter::STABLE, $textMode ); -} elseif ( isset( $options['logs'] ) ) { - $dumper->dump( WikiExporter::LOGS ); -} elseif ( isset( $options['revrange'] ) ) { - $dumper->dump( WikiExporter::RANGE, $textMode ); -} else { - $dumper->progress( <<mDescription = <<stderr = fopen( "php://stderr", "wt" ); + // Actions + $this->addOption( 'full', 'Dump all revisions of every page' ); + $this->addOption( 'current', 'Dump only the latest revision of every page.' ); + $this->addOption( 'logs', 'Dump all log events' ); + $this->addOption( 'stable', 'Dump stable versions of pages' ); + $this->addOption( 'revrange', 'Dump range of revisions specified by revstart and ' . 
+ 'revend parameters' ); + $this->addOption( 'pagelist', + 'Dump only pages included in the file', false, true ); + // Options + $this->addOption( 'start', 'Start from page_id or log_id', false, true ); + $this->addOption( 'end', 'Stop before page_id or log_id n (exclusive)', false, true ); + $this->addOption( 'revstart', 'Start from rev_id', false, true ); + $this->addOption( 'revend', 'Stop before rev_id n (exclusive)', false, true ); + $this->addOption( 'skip-header', 'Don\'t output the header' ); + $this->addOption( 'skip-footer', 'Don\'t output the footer' ); + $this->addOption( 'stub', 'Don\'t perform old_text lookups; for 2-pass dump' ); + $this->addOption( 'uploads', 'Include upload records without files' ); + $this->addOption( 'include-files', 'Include files within the XML stream' ); + + if ( $args ) { + $this->loadWithArgv( $args ); + $this->processOptions(); + } + } -Usage: php dumpBackup.php [] -Actions: - --full Dump all revisions of every page. - --current Dump only the latest revision of every page. - --logs Dump all log events. - --stable Stable versions of pages? - --pagelist= - Where is a list of page titles to be dumped - --revrange Dump specified range of revisions, requires - revstart and revend options. -Options: - --quiet Don't dump status reports to stderr. - --report=n Report position and speed after every n pages processed. - (Default: 100) - --server=h Force reading from MySQL server h - --start=n Start from page_id or log_id n - --end=n Stop before page_id or log_id n (exclusive) - --revstart=n Start from rev_id n - --revend=n Stop before rev_id n (exclusive) - --skip-header Don't output the header - --skip-footer Don't output the footer - --stub Don't perform old_text lookups; for 2-pass dump - --uploads Include upload records without files - --include-files Include files within the XML stream - --conf= Use the specified configuration file (LocalSettings.php) - - --wiki= Only back up the specified - -Fancy stuff: (Works? 
Add examples please.) - --plugin=[:] Load a dump plugin class - --output=: Begin a filtered output stream; - s: file, gzip, bzip2, 7zip - --filter=[:] Add a filter on an output branch - -ENDS - ); + function execute() { + $this->processOptions(); + + $textMode = $this->hasOption( 'stub' ) ? WikiExporter::STUB : WikiExporter::TEXT; + + if ( $this->hasOption( 'full' ) ) { + $this->dump( WikiExporter::FULL, $textMode ); + } elseif ( $this->hasOption( 'current' ) ) { + $this->dump( WikiExporter::CURRENT, $textMode ); + } elseif ( $this->hasOption( 'stable' ) ) { + $this->dump( WikiExporter::STABLE, $textMode ); + } elseif ( $this->hasOption( 'logs' ) ) { + $this->dump( WikiExporter::LOGS ); + } elseif ( $this->hasOption( 'revrange' ) ) { + $this->dump( WikiExporter::RANGE, $textMode ); + } else { + $this->error( 'No valid action specified.', 1 ); + } + } + + function processOptions() { + parent::processOptions(); + + // Evaluate options specific to this class + $this->reporting = !$this->hasOption( 'quiet' ); + + if ( $this->hasOption( 'pagelist' ) ) { + $olddir = getcwd(); + chdir( $originalDir ); + $pages = file( $this->getOption( 'quiet' ) ); + chdir( $olddir ); + if ( $pages === false ) { + echo "Unable to open file {$options['pagelist']}\n"; + die( 1 ); + } + $pages = array_map( 'trim', $pages ); + $this->pages = array_filter( $pages, create_function( '$x', 'return $x !== "";' ) ); + } + + if ( $this->hasOption( 'start' ) ) { + $this->startId = intval( $this->getOption( 'start' ) ); + } + + if ( $this->hasOption( 'end' ) ) { + $this->endId = intval( $this->getOption( 'end' ) ); + } + + if ( $this->hasOption( 'revstart' ) ) { + $this->revStartId = intval( $this->getOption( 'revstart' ) ); + } + + if ( $this->hasOption( 'revend' ) ) { + $this->revEndId = intval( $this->getOption( 'revend' ) ); + } + + $this->skipHeader = $this->hasOption( 'skip-header' ); + $this->skipFooter = $this->hasOption( 'skip-footer' ); + $this->dumpUploads = $this->hasOption( 'uploads' ); + 
$this->dumpUploadFileContents = $this->hasOption( 'include-files' ); + } } + +$maintClass = 'DumpBackup'; +require_once RUN_MAINTENANCE_IF_MAIN; diff --git a/maintenance/dumpTextPass.php b/maintenance/dumpTextPass.php index bde5a07623..7511392c2c 100644 --- a/maintenance/dumpTextPass.php +++ b/maintenance/dumpTextPass.php @@ -1,6 +1,6 @@ * https://www.mediawiki.org/ @@ -24,44 +24,962 @@ * @ingroup Maintenance */ -$originalDir = getcwd(); +require_once __DIR__ . '/backup.inc'; +require_once __DIR__ . '/../includes/export/WikiExporter.php'; -require_once __DIR__ . '/commandLine.inc'; -require_once __DIR__ . '/backupTextPass.inc'; +/** + * @ingroup Maintenance + */ +class TextPassDumper extends BackupDumper { + public $prefetch = null; + + // when we spend more than maxTimeAllowed seconds on this run, we continue + // processing until we write out the next complete page, then save output file(s), + // rename it/them and open new one(s) + public $maxTimeAllowed = 0; // 0 = no limit + + protected $input = "php://stdin"; + protected $history = WikiExporter::FULL; + protected $fetchCount = 0; + protected $prefetchCount = 0; + protected $prefetchCountLast = 0; + protected $fetchCountLast = 0; + + protected $maxFailures = 5; + protected $maxConsecutiveFailedTextRetrievals = 200; + protected $failureTimeout = 5; // Seconds to sleep after db failure + + protected $bufferSize = 524288; // In bytes. Maximum size to read from the stub in on go. + + protected $php = "php"; + protected $spawn = false; + + /** + * @var bool|resource + */ + protected $spawnProc = false; -$dumper = new TextPassDumper( $argv ); + /** + * @var bool|resource + */ + protected $spawnWrite = false; -if ( !isset( $options['help'] ) ) { - $dumper->dump( true ); -} else { - $dumper->progress( <<mDescription = <<stderr = fopen( "php://stderr", "wt" ); + + $this->addOption( 'stub', 'To load a compressed stub dump instead of stdin. ' . 
+ 'Specify as --stub=:.', false, true ); + $this->addOption( 'prefetch', 'Use a prior dump file as a text source, to savepressure on the ' . + 'database. (Requires the XMLReader extension). Specify as --prefetch=:', + false, true ); + $this->addOption( 'maxtime', 'Write out checkpoint file after this many minutes (writing' . + 'out complete page, closing xml file properly, and opening new one' . + 'with header). This option requires the checkpointfile option.', false, true ); + $this->addOption( 'checkpointfile', 'Use this string for checkpoint filenames,substituting ' . + 'first pageid written for the first %s (required) and the last pageid written for the ' . + 'second %s if it exists.', false, true, false, true ); // This can be specified multiple times + $this->addOption( 'quiet', 'Don\'t dump status reports to stderr.' ); + $this->addOption( 'current', 'Base ETA on number of pages in database instead of all revisions' ); + $this->addOption( 'spawn', 'Spawn a subprocess for loading text records' ); + $this->addOption( 'buffersize', 'Buffer size in bytes to use for reading the stub. ' . 
+ '(Default: 512KB, Minimum: 4KB)', false, true ); + + if ( $args ) { + $this->loadWithArgv( $args ); + $this->processOptions(); + } + } + + function execute() { + $this->processOptions(); + $this->dump( true ); + } + + function processOptions() { + global $IP; + + parent::processOptions(); + + if ( $this->hasOption( 'buffersize' ) ) { + $this->bufferSize = max( intval( $this->getOption( 'buffersize' ) ), 4 * 1024 ); + } + + if ( $this->hasOption( 'prefetch' ) ) { + require_once "$IP/maintenance/backupPrefetch.inc"; + $url = $this->processFileOpt( $this->getOption( 'prefetch' ) ); + $this->prefetch = new BaseDump( $url ); + } + + if ( $this->hasOption( 'stub' ) ) { + $this->input = $this->processFileOpt( $this->getOption( 'stub' ) ); + } + + if ( $this->hasOption( 'maxtime' ) ) { + $this->maxTimeAllowed = intval( $this->getOption( 'maxtime' ) ) * 60; + } + + if ( $this->hasOption( 'checkpointfile' ) ) { + $this->checkpointFiles = $this->getOption( 'checkpointfile' ); + } + + if ( $this->hasOption( 'current' ) ) { + $this->history = WikiExporter::CURRENT; + } + + if ( $this->hasOption( 'full' ) ) { + $this->history = WikiExporter::FULL; + } + + if ( $this->hasOption( 'spawn' ) ) { + $this->spawn = true; + $val = $this->getOption( 'spawn' ); + if ( $val !== 1 ) { + $this->php = $val; + } + } + } + + /** + * Drop the database connection $this->db and try to get a new one. + * + * This function tries to get a /different/ connection if this is + * possible. Hence, (if this is possible) it switches to a different + * failover upon each call. + * + * This function resets $this->lb and closes all connections on it. 
+ * + * @throws MWException + */ + function rotateDb() { + // Cleaning up old connections + if ( isset( $this->lb ) ) { + $this->lb->closeAll(); + unset( $this->lb ); + } + + if ( $this->forcedDb !== null ) { + $this->db = $this->forcedDb; + + return; + } + + if ( isset( $this->db ) && $this->db->isOpen() ) { + throw new MWException( 'DB is set and has not been closed by the Load Balancer' ); + } + + unset( $this->db ); + + // Trying to set up new connection. + // We do /not/ retry upon failure, but delegate to encapsulating logic, to avoid + // individually retrying at different layers of code. + + // 1. The LoadBalancer. + try { + $this->lb = wfGetLBFactory()->newMainLB(); + } catch ( Exception $e ) { + throw new MWException( __METHOD__ + . " rotating DB failed to obtain new load balancer (" . $e->getMessage() . ")" ); + } + + // 2. The Connection, through the load balancer. + try { + $this->db = $this->lb->getConnection( DB_SLAVE, 'dump' ); + } catch ( Exception $e ) { + throw new MWException( __METHOD__ + . " rotating DB failed to obtain new database (" . $e->getMessage() . ")" ); + } + } + + function initProgress( $history = WikiExporter::FULL ) { + parent::initProgress(); + $this->timeOfCheckpoint = $this->startTime; + } + + function dump( $history, $text = WikiExporter::TEXT ) { + // Notice messages will foul up your XML output even if they're + // relatively harmless. + if ( ini_get( 'display_errors' ) ) { + ini_set( 'display_errors', 'stderr' ); + } + + $this->initProgress( $this->history ); + + // We are trying to get an initial database connection to avoid that the + // first try of this request's first call to getText fails. However, if + // obtaining a good DB connection fails it's not a serious issue, as + // getText does retry upon failure and can start without having a working + // DB connection. + try { + $this->rotateDb(); + } catch ( Exception $e ) { + // We do not even count this as failure. Just let eventual + // watchdogs know. 
+ $this->progress( "Getting initial DB connection failed (" . + $e->getMessage() . ")" ); + } + + $this->egress = new ExportProgressFilter( $this->sink, $this ); + + // it would be nice to do it in the constructor, oh well. need egress set + $this->finalOptionCheck(); + + // we only want this so we know how to close a stream :-P + $this->xmlwriterobj = new XmlDumpWriter(); + + $input = fopen( $this->input, "rt" ); + $this->readDump( $input ); + + if ( $this->spawnProc ) { + $this->closeSpawn(); + } + + $this->report( true ); + } + + function processFileOpt( $opt ) { + $split = explode( ':', $opt, 2 ); + $val = $split[0]; + $param = ''; + if ( count( $split ) === 2 ) { + $param = $split[1]; + } + $fileURIs = explode( ';', $param ); + foreach ( $fileURIs as $URI ) { + switch ( $val ) { + case "file": + $newURI = $URI; + break; + case "gzip": + $newURI = "compress.zlib://$URI"; + break; + case "bzip2": + $newURI = "compress.bzip2://$URI"; + break; + case "7zip": + $newURI = "mediawiki.compress.7z://$URI"; + break; + default: + $newURI = $URI; + } + $newFileURIs[] = $newURI; + } + $val = implode( ';', $newFileURIs ); + + return $val; + } + + /** + * Overridden to include prefetch ratio if enabled. 
+ */ + function showReport() { + if ( !$this->prefetch ) { + parent::showReport(); + + return; + } + + if ( $this->reporting ) { + $now = wfTimestamp( TS_DB ); + $nowts = microtime( true ); + $deltaAll = $nowts - $this->startTime; + $deltaPart = $nowts - $this->lastTime; + $this->pageCountPart = $this->pageCount - $this->pageCountLast; + $this->revCountPart = $this->revCount - $this->revCountLast; + + if ( $deltaAll ) { + $portion = $this->revCount / $this->maxCount; + $eta = $this->startTime + $deltaAll / $portion; + $etats = wfTimestamp( TS_DB, intval( $eta ) ); + if ( $this->fetchCount ) { + $fetchRate = 100.0 * $this->prefetchCount / $this->fetchCount; + } else { + $fetchRate = '-'; + } + $pageRate = $this->pageCount / $deltaAll; + $revRate = $this->revCount / $deltaAll; + } else { + $pageRate = '-'; + $revRate = '-'; + $etats = '-'; + $fetchRate = '-'; + } + if ( $deltaPart ) { + if ( $this->fetchCountLast ) { + $fetchRatePart = 100.0 * $this->prefetchCountLast / $this->fetchCountLast; + } else { + $fetchRatePart = '-'; + } + $pageRatePart = $this->pageCountPart / $deltaPart; + $revRatePart = $this->revCountPart / $deltaPart; + } else { + $fetchRatePart = '-'; + $pageRatePart = '-'; + $revRatePart = '-'; + } + $this->progress( sprintf( + "%s: %s (ID %d) %d pages (%0.1f|%0.1f/sec all|curr), " + . "%d revs (%0.1f|%0.1f/sec all|curr), %0.1f%%|%0.1f%% " + . 
"prefetched (all|curr), ETA %s [max %d]", + $now, wfWikiID(), $this->ID, $this->pageCount, $pageRate, + $pageRatePart, $this->revCount, $revRate, $revRatePart, + $fetchRate, $fetchRatePart, $etats, $this->maxCount + ) ); + $this->lastTime = $nowts; + $this->revCountLast = $this->revCount; + $this->prefetchCountLast = $this->prefetchCount; + $this->fetchCountLast = $this->fetchCount; + } + } + + function setTimeExceeded() { + $this->timeExceeded = true; + } + + function checkIfTimeExceeded() { + if ( $this->maxTimeAllowed + && ( $this->lastTime - $this->timeOfCheckpoint > $this->maxTimeAllowed ) + ) { + return true; + } + + return false; + } + + function finalOptionCheck() { + if ( ( $this->checkpointFiles && !$this->maxTimeAllowed ) + || ( $this->maxTimeAllowed && !$this->checkpointFiles ) + ) { + throw new MWException( "Options checkpointfile and maxtime must be specified together.\n" ); + } + foreach ( $this->checkpointFiles as $checkpointFile ) { + $count = substr_count( $checkpointFile, "%s" ); + if ( $count != 2 ) { + throw new MWException( "Option checkpointfile must contain two '%s' " + . "for substitution of first and last pageids, count is $count instead, " + . "file is $checkpointFile.\n" ); + } + } + + if ( $this->checkpointFiles ) { + $filenameList = (array)$this->egress->getFilenames(); + if ( count( $filenameList ) != count( $this->checkpointFiles ) ) { + throw new MWException( "One checkpointfile must be specified " + . 
"for each output option, if maxtime is used.\n" ); + } + } + } + + /** + * @throws MWException Failure to parse XML input + * @param string $input + * @return bool + */ + function readDump( $input ) { + $this->buffer = ""; + $this->openElement = false; + $this->atStart = true; + $this->state = ""; + $this->lastName = ""; + $this->thisPage = 0; + $this->thisRev = 0; + $this->thisRevModel = null; + $this->thisRevFormat = null; + + $parser = xml_parser_create( "UTF-8" ); + xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false ); + + xml_set_element_handler( + $parser, + array( &$this, 'startElement' ), + array( &$this, 'endElement' ) + ); + xml_set_character_data_handler( $parser, array( &$this, 'characterData' ) ); + + $offset = 0; // for context extraction on error reporting + do { + if ( $this->checkIfTimeExceeded() ) { + $this->setTimeExceeded(); + } + $chunk = fread( $input, $this->bufferSize ); + if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) { + wfDebug( "TextDumpPass::readDump encountered XML parsing error\n" ); + + $byte = xml_get_current_byte_index( $parser ); + $msg = wfMessage( 'xml-error-string', + 'XML import parse failure', + xml_get_current_line_number( $parser ), + xml_get_current_column_number( $parser ), + $byte . ( is_null( $chunk ) ? null : ( '; "' . substr( $chunk, $byte - $offset, 16 ) . '"' ) ), + xml_error_string( xml_get_error_code( $parser ) ) )->escaped(); + + xml_parser_free( $parser ); + + throw new MWException( $msg ); + } + $offset += strlen( $chunk ); + } while ( $chunk !== false && !feof( $input ) ); + if ( $this->maxTimeAllowed ) { + $filenameList = (array)$this->egress->getFilenames(); + // we wrote some stuff after last checkpoint that needs renamed + if ( file_exists( $filenameList[0] ) ) { + $newFilenames = array(); + # we might have just written the header and footer and had no + # pages or revisions written... perhaps they were all deleted + # there's no pageID 0 so we use that. 
the caller is responsible + # for deciding what to do with a file containing only the + # siteinfo information and the mw tags. + if ( !$this->firstPageWritten ) { + $firstPageID = str_pad( 0, 9, "0", STR_PAD_LEFT ); + $lastPageID = str_pad( 0, 9, "0", STR_PAD_LEFT ); + } else { + $firstPageID = str_pad( $this->firstPageWritten, 9, "0", STR_PAD_LEFT ); + $lastPageID = str_pad( $this->lastPageWritten, 9, "0", STR_PAD_LEFT ); + } + + $filenameCount = count( $filenameList ); + for ( $i = 0; $i < $filenameCount; $i++ ) { + $checkpointNameFilledIn = sprintf( $this->checkpointFiles[$i], $firstPageID, $lastPageID ); + $fileinfo = pathinfo( $filenameList[$i] ); + $newFilenames[] = $fileinfo['dirname'] . '/' . $checkpointNameFilledIn; + } + $this->egress->closeAndRename( $newFilenames ); + } + } + xml_parser_free( $parser ); + + return true; + } + + /** + * Applies applicable export transformations to $text. + * + * @param string $text + * @param string $model + * @param string|null $format + * + * @return string + */ + private function exportTransform( $text, $model, $format = null ) { + try { + $handler = ContentHandler::getForModelID( $model ); + $text = $handler->exportTransform( $text, $format ); + } + catch ( MWException $ex ) { + $this->progress( + "Unable to apply export transformation for content model '$model': " . + $ex->getMessage() + ); + } -Usage: php dumpTextPass.php [] -Options: - --stub=: To load a compressed stub dump instead of stdin - --prefetch=: Use a prior dump file as a text source, to save - pressure on the database. - (Requires the XMLReader extension) - --maxtime= Write out checkpoint file after this many minutes (writing - out complete page, closing xml file properly, and opening new one - with header). This option requires the checkpointfile option. - --checkpointfile= Use this string for checkpoint filenames, - substituting first pageid written for the first %s (required) and the - last pageid written for the second %s if it exists. 
- --quiet Don't dump status reports to stderr. - --report=n Report position and speed after every n pages processed. - (Default: 100) - --server=h Force reading from MySQL server h - --current Base ETA on number of pages in database instead of all revisions - --spawn Spawn a subprocess for loading text records - --buffersize= Buffer size in bytes to use for reading the stub. - (Default: 512KB, Minimum: 4KB) - --help Display this help message -ENDS - ); + return $text; + } + + /** + * Tries to get the revision text for a revision id. + * Export transformations are applied if the content model can is given or can be + * determined from the database. + * + * Upon errors, retries (Up to $this->maxFailures tries each call). + * If still no good revision get could be found even after this retrying, "" is returned. + * If no good revision text could be returned for + * $this->maxConsecutiveFailedTextRetrievals consecutive calls to getText, MWException + * is thrown. + * + * @param string $id The revision id to get the text for + * @param string|bool|null $model The content model used to determine + * applicable export transformations. + * If $model is null, it will be determined from the database. + * @param string|null $format The content format used when applying export transformations. + * + * @throws MWException + * @return string The revision text for $id, or "" + */ + function getText( $id, $model = null, $format = null ) { + global $wgContentHandlerUseDB; + + $prefetchNotTried = true; // Whether or not we already tried to get the text via prefetch. + $text = false; // The candidate for a good text. false if no proper value. + $failures = 0; // The number of times, this invocation of getText already failed. + + // The number of times getText failed without yielding a good text in between. 
+ static $consecutiveFailedTextRetrievals = 0; + + $this->fetchCount++; + + // To allow to simply return on success and do not have to worry about book keeping, + // we assume, this fetch works (possible after some retries). Nevertheless, we koop + // the old value, so we can restore it, if problems occur (See after the while loop). + $oldConsecutiveFailedTextRetrievals = $consecutiveFailedTextRetrievals; + $consecutiveFailedTextRetrievals = 0; + + if ( $model === null && $wgContentHandlerUseDB ) { + $row = $this->db->selectRow( + 'revision', + array( 'rev_content_model', 'rev_content_format' ), + array( 'rev_id' => $this->thisRev ), + __METHOD__ + ); + + if ( $row ) { + $model = $row->rev_content_model; + $format = $row->rev_content_format; + } + } + + if ( $model === null || $model === '' ) { + $model = false; + } + + while ( $failures < $this->maxFailures ) { + + // As soon as we found a good text for the $id, we will return immediately. + // Hence, if we make it past the try catch block, we know that we did not + // find a good text. + + try { + // Step 1: Get some text (or reuse from previous iteratuon if checking + // for plausibility failed) + + // Trying to get prefetch, if it has not been tried before + if ( $text === false && isset( $this->prefetch ) && $prefetchNotTried ) { + $prefetchNotTried = false; + $tryIsPrefetch = true; + $text = $this->prefetch->prefetch( intval( $this->thisPage ), + intval( $this->thisRev ) ); + + if ( $text === null ) { + $text = false; + } + + if ( is_string( $text ) && $model !== false ) { + // Apply export transformation to text coming from an old dump. + // The purpose of this transformation is to convert up from legacy + // formats, which may still be used in the older dump that is used + // for pre-fetching. Applying the transformation again should not + // interfere with content that is already in the correct form. 
+ $text = $this->exportTransform( $text, $model, $format ); + } + } + + if ( $text === false ) { + // Fallback to asking the database + $tryIsPrefetch = false; + if ( $this->spawn ) { + $text = $this->getTextSpawned( $id ); + } else { + $text = $this->getTextDb( $id ); + } + + if ( $text !== false && $model !== false ) { + // Apply export transformation to text coming from the database. + // Prefetched text should already have transformations applied. + $text = $this->exportTransform( $text, $model, $format ); + } + + // No more checks for texts from DB for now. + // If we received something that is not false, + // We treat it as good text, regardless of whether it actually is or is not + if ( $text !== false ) { + return $text; + } + } + + if ( $text === false ) { + throw new MWException( "Generic error while obtaining text for id " . $id ); + } + + // We received a good candidate for the text of $id via some method + + // Step 2: Checking for plausibility and return the text if it is + // plausible + $revID = intval( $this->thisRev ); + if ( !isset( $this->db ) ) { + throw new MWException( "No database available" ); + } + + if ( $model !== CONTENT_MODEL_WIKITEXT ) { + $revLength = strlen( $text ); + } else { + $revLength = $this->db->selectField( 'revision', 'rev_len', array( 'rev_id' => $revID ) ); + } + + if ( strlen( $text ) == $revLength ) { + if ( $tryIsPrefetch ) { + $this->prefetchCount++; + } + + return $text; + } + + $text = false; + throw new MWException( "Received text is unplausible for id " . $id ); + } catch ( Exception $e ) { + $msg = "getting/checking text " . $id . " failed (" . $e->getMessage() . ")"; + if ( $failures + 1 < $this->maxFailures ) { + $msg .= " (Will retry " . ( $this->maxFailures - $failures - 1 ) . " more times)"; + } + $this->progress( $msg ); + } + + // Something went wrong; we did not a text that was plausible :( + $failures++; + + // A failure in a prefetch hit does not warrant resetting db connection etc. 
+ if ( !$tryIsPrefetch ) { + // After backing off for some time, we try to reboot the whole process as + // much as possible to not carry over failures from one part to the other + // parts + sleep( $this->failureTimeout ); + try { + $this->rotateDb(); + if ( $this->spawn ) { + $this->closeSpawn(); + $this->openSpawn(); + } + } catch ( Exception $e ) { + $this->progress( "Rebooting getText infrastructure failed (" . $e->getMessage() . ")" . + " Trying to continue anyways" ); + } + } + } + + // Retrieving a good text for $id failed (at least) maxFailures times. + // We abort for this $id. + + // Restoring the consecutive failures, and maybe aborting, if the dump + // is too broken. + $consecutiveFailedTextRetrievals = $oldConsecutiveFailedTextRetrievals + 1; + if ( $consecutiveFailedTextRetrievals > $this->maxConsecutiveFailedTextRetrievals ) { + throw new MWException( "Graceful storage failure" ); + } + + return ""; + } + + /** + * May throw a database error if, say, the server dies during query. + * @param int $id + * @return bool|string + * @throws MWException + */ + private function getTextDb( $id ) { + global $wgContLang; + if ( !isset( $this->db ) ) { + throw new MWException( __METHOD__ . "No database available" ); + } + $row = $this->db->selectRow( 'text', + array( 'old_text', 'old_flags' ), + array( 'old_id' => $id ), + __METHOD__ ); + $text = Revision::getRevisionText( $row ); + if ( $text === false ) { + return false; + } + $stripped = str_replace( "\r", "", $text ); + $normalized = $wgContLang->normalize( $stripped ); + + return $normalized; + } + + private function getTextSpawned( $id ) { + MediaWiki\suppressWarnings(); + if ( !$this->spawnProc ) { + // First time? 
+ $this->openSpawn(); + } + $text = $this->getTextSpawnedOnce( $id ); + MediaWiki\restoreWarnings(); + + return $text; + } + + function openSpawn() { + global $IP; + + if ( file_exists( "$IP/../multiversion/MWScript.php" ) ) { + $cmd = implode( " ", + array_map( 'wfEscapeShellArg', + array( + $this->php, + "$IP/../multiversion/MWScript.php", + "fetchText.php", + '--wiki', wfWikiID() ) ) ); + } else { + $cmd = implode( " ", + array_map( 'wfEscapeShellArg', + array( + $this->php, + "$IP/maintenance/fetchText.php", + '--wiki', wfWikiID() ) ) ); + } + $spec = array( + 0 => array( "pipe", "r" ), + 1 => array( "pipe", "w" ), + 2 => array( "file", "/dev/null", "a" ) ); + $pipes = array(); + + $this->progress( "Spawning database subprocess: $cmd" ); + $this->spawnProc = proc_open( $cmd, $spec, $pipes ); + if ( !$this->spawnProc ) { + $this->progress( "Subprocess spawn failed." ); + + return false; + } + list( + $this->spawnWrite, // -> stdin + $this->spawnRead, // <- stdout + ) = $pipes; + + return true; + } + + private function closeSpawn() { + MediaWiki\suppressWarnings(); + if ( $this->spawnRead ) { + fclose( $this->spawnRead ); + } + $this->spawnRead = false; + if ( $this->spawnWrite ) { + fclose( $this->spawnWrite ); + } + $this->spawnWrite = false; + if ( $this->spawnErr ) { + fclose( $this->spawnErr ); + } + $this->spawnErr = false; + if ( $this->spawnProc ) { + pclose( $this->spawnProc ); + } + $this->spawnProc = false; + MediaWiki\restoreWarnings(); + } + + private function getTextSpawnedOnce( $id ) { + global $wgContLang; + + $ok = fwrite( $this->spawnWrite, "$id\n" ); + // $this->progress( ">> $id" ); + if ( !$ok ) { + return false; + } + + $ok = fflush( $this->spawnWrite ); + // $this->progress( ">> [flush]" ); + if ( !$ok ) { + return false; + } + + // check that the text id they are sending is the one we asked for + // this avoids out of sync revision text errors we have encountered in the past + $newId = fgets( $this->spawnRead ); + if ( $newId === false ) { 
+ return false; + } + if ( $id != intval( $newId ) ) { + return false; + } + + $len = fgets( $this->spawnRead ); + // $this->progress( "<< " . trim( $len ) ); + if ( $len === false ) { + return false; + } + + $nbytes = intval( $len ); + // actual error, not zero-length text + if ( $nbytes < 0 ) { + return false; + } + + $text = ""; + + // Subprocess may not send everything at once, we have to loop. + while ( $nbytes > strlen( $text ) ) { + $buffer = fread( $this->spawnRead, $nbytes - strlen( $text ) ); + if ( $buffer === false ) { + break; + } + $text .= $buffer; + } + + $gotbytes = strlen( $text ); + if ( $gotbytes != $nbytes ) { + $this->progress( "Expected $nbytes bytes from database subprocess, got $gotbytes " ); + + return false; + } + + // Do normalization in the dump thread... + $stripped = str_replace( "\r", "", $text ); + $normalized = $wgContLang->normalize( $stripped ); + + return $normalized; + } + + function startElement( $parser, $name, $attribs ) { + $this->checkpointJustWritten = false; + + $this->clearOpenElement( null ); + $this->lastName = $name; + + if ( $name == 'revision' ) { + $this->state = $name; + $this->egress->writeOpenPage( null, $this->buffer ); + $this->buffer = ""; + } elseif ( $name == 'page' ) { + $this->state = $name; + if ( $this->atStart ) { + $this->egress->writeOpenStream( $this->buffer ); + $this->buffer = ""; + $this->atStart = false; + } + } + + if ( $name == "text" && isset( $attribs['id'] ) ) { + $id = $attribs['id']; + $model = trim( $this->thisRevModel ); + $format = trim( $this->thisRevFormat ); + + $model = $model === '' ? null : $model; + $format = $format === '' ? 
null : $format; + + $text = $this->getText( $id, $model, $format ); + $this->openElement = array( $name, array( 'xml:space' => 'preserve' ) ); + if ( strlen( $text ) > 0 ) { + $this->characterData( $parser, $text ); + } + } else { + $this->openElement = array( $name, $attribs ); + } + } + + function endElement( $parser, $name ) { + $this->checkpointJustWritten = false; + + if ( $this->openElement ) { + $this->clearOpenElement( "" ); + } else { + $this->buffer .= ""; + } + + if ( $name == 'revision' ) { + $this->egress->writeRevision( null, $this->buffer ); + $this->buffer = ""; + $this->thisRev = ""; + $this->thisRevModel = null; + $this->thisRevFormat = null; + } elseif ( $name == 'page' ) { + if ( !$this->firstPageWritten ) { + $this->firstPageWritten = trim( $this->thisPage ); + } + $this->lastPageWritten = trim( $this->thisPage ); + if ( $this->timeExceeded ) { + $this->egress->writeClosePage( $this->buffer ); + // nasty hack, we can't just write the chardata after the + // page tag, it will include leading blanks from the next line + $this->egress->sink->write( "\n" ); + + $this->buffer = $this->xmlwriterobj->closeStream(); + $this->egress->writeCloseStream( $this->buffer ); + + $this->buffer = ""; + $this->thisPage = ""; + // this could be more than one file if we had more than one output arg + + $filenameList = (array)$this->egress->getFilenames(); + $newFilenames = array(); + $firstPageID = str_pad( $this->firstPageWritten, 9, "0", STR_PAD_LEFT ); + $lastPageID = str_pad( $this->lastPageWritten, 9, "0", STR_PAD_LEFT ); + $filenamesCount = count( $filenameList ); + for ( $i = 0; $i < $filenamesCount; $i++ ) { + $checkpointNameFilledIn = sprintf( $this->checkpointFiles[$i], $firstPageID, $lastPageID ); + $fileinfo = pathinfo( $filenameList[$i] ); + $newFilenames[] = $fileinfo['dirname'] . '/' . 
$checkpointNameFilledIn; + } + $this->egress->closeRenameAndReopen( $newFilenames ); + $this->buffer = $this->xmlwriterobj->openStream(); + $this->timeExceeded = false; + $this->timeOfCheckpoint = $this->lastTime; + $this->firstPageWritten = false; + $this->checkpointJustWritten = true; + } else { + $this->egress->writeClosePage( $this->buffer ); + $this->buffer = ""; + $this->thisPage = ""; + } + } elseif ( $name == 'mediawiki' ) { + $this->egress->writeCloseStream( $this->buffer ); + $this->buffer = ""; + } + } + + function characterData( $parser, $data ) { + $this->clearOpenElement( null ); + if ( $this->lastName == "id" ) { + if ( $this->state == "revision" ) { + $this->thisRev .= $data; + } elseif ( $this->state == "page" ) { + $this->thisPage .= $data; + } + } elseif ( $this->lastName == "model" ) { + $this->thisRevModel .= $data; + } elseif ( $this->lastName == "format" ) { + $this->thisRevFormat .= $data; + } + + // have to skip the newline left over from closepagetag line of + // end of checkpoint files. nasty hack!! + if ( $this->checkpointJustWritten ) { + if ( $data[0] == "\n" ) { + $data = substr( $data, 1 ); + } + $this->checkpointJustWritten = false; + } + $this->buffer .= htmlspecialchars( $data ); + } + + function clearOpenElement( $style ) { + if ( $this->openElement ) { + $this->buffer .= Xml::element( $this->openElement[0], $this->openElement[1], $style ); + $this->openElement = false; + } + } } + +$maintClass = 'TextPassDumper'; +require_once RUN_MAINTENANCE_IF_MAIN; diff --git a/tests/phpunit/maintenance/backupTextPassTest.php b/tests/phpunit/maintenance/backupTextPassTest.php index f5dd98b3fb..893e4f90f3 100644 --- a/tests/phpunit/maintenance/backupTextPassTest.php +++ b/tests/phpunit/maintenance/backupTextPassTest.php @@ -1,10 +1,15 @@ setUpStub(); $nameFull = $this->getNewTempFile(); - $dumper = new TextPassDumper( array( "--stub=file:" - . $nameStub, "--output=file:" . 
$nameFull ) ); + + $dumper = new TextPassDumper( array( "--stub=file:" . $nameStub, + "--output=file:" . $nameFull ) ); + $dumper->prefetch = $prefetchMock; $dumper->reporting = false; $dumper->setDb( $this->db ); @@ -261,7 +268,8 @@ class TextPassDumperDatabaseTest extends DumpTestCase { $this->assertTrue( wfMkdirParents( $nameOutputDir ), "Creating temporary output directory " ); $this->setUpStub( $nameStub, $iterations ); - $dumper = new TextPassDumper( array( "--stub=file:" . $nameStub, + $dumper = new TextPassDumper(); + $dumper->loadWithArgv( array( "--stub=file:" . $nameStub, "--output=" . $checkpointFormat . ":" . $nameOutputDir . "/full", "--maxtime=1" /*This is in minutes. Fixup is below*/, "--buffersize=32768", // The default of 32 iterations fill up 32KB about twice @@ -272,7 +280,7 @@ class TextPassDumperDatabaseTest extends DumpTestCase { // The actual dump and taking time $ts_before = microtime( true ); - $dumper->dump( WikiExporter::FULL, WikiExporter::TEXT ); + $dumper->execute(); $ts_after = microtime( true ); $lastDuration = $ts_after - $ts_before; @@ -634,7 +642,9 @@ class TextPassDumperDatabaselessTest extends MediaWikiLangTestCase { * @dataProvider bufferSizeProvider */ function testBufferSizeSetting( $expected, $size, $msg ) { - $dumper = new TextPassDumperAccessor( array( "--buffersize=" . $size ) ); + $dumper = new TextPassDumperAccessor(); + $dumper->loadWithArgv( array( "--buffersize=" . 
$size ) ); + + $dumper->execute(); $this->assertEquals( $expected, $dumper->getBufferSize(), $msg ); } @@ -674,4 +684,8 @@ class TextPassDumperAccessor extends TextPassDumper { public function getBufferSize() { return $this->bufferSize; } + + function dump( $history, $text = null ) { + return true; + } } diff --git a/tests/phpunit/maintenance/backup_LogTest.php b/tests/phpunit/maintenance/backup_LogTest.php index 7ca45960c5..6629b67dba 100644 --- a/tests/phpunit/maintenance/backup_LogTest.php +++ b/tests/phpunit/maintenance/backup_LogTest.php @@ -2,6 +2,11 @@ /** * Tests for log dumps of BackupDumper * + * Some of these tests use the old constructor for TextPassDumper + * and the dump() function, while others use the new loadWithArgv( $args ) + * function and execute(). This is to ensure both the old and new methods + * work properly. + * * @group Database * @group Dump * @covers BackupDumper @@ -136,7 +141,8 @@ class BackupDumperLoggerTest extends DumpTestCase { // Preparing the dump $fname = $this->getNewTempFile(); - $dumper = new BackupDumper( array( "--output=file:" . $fname ) ); + + $dumper = new DumpBackup( array( '--output=file:' . $fname ) ); $dumper->startId = $this->logId1; $dumper->endId = $this->logId3 + 1; $dumper->reporting = false; @@ -173,8 +179,10 @@ class BackupDumperLoggerTest extends DumpTestCase { // Preparing the dump $fname = $this->getNewTempFile(); - $dumper = new BackupDumper( array( "--output=gzip:" . $fname, - "--reporting=2" ) ); + + $dumper = new DumpBackup(); + $dumper->loadWithArgv( array( '--logs', '--output=gzip:' . 
$fname, + '--reporting=2' ) ); $dumper->startId = $this->logId1; $dumper->endId = $this->logId3 + 1; $dumper->setDb( $this->db ); @@ -190,7 +198,7 @@ class BackupDumperLoggerTest extends DumpTestCase { } // Performing the dump - $dumper->dump( WikiExporter::LOGS, WikiExporter::TEXT ); + $dumper->execute(); $this->assertTrue( fclose( $dumper->stderr ), "Closing stderr handle" ); diff --git a/tests/phpunit/maintenance/backup_PageTest.php b/tests/phpunit/maintenance/backup_PageTest.php index 8b6221ba92..5781d1c6d5 100644 --- a/tests/phpunit/maintenance/backup_PageTest.php +++ b/tests/phpunit/maintenance/backup_PageTest.php @@ -6,6 +6,7 @@ * @group Dump * @covers BackupDumper */ + class BackupDumperPageTest extends DumpTestCase { // We'll add several pages, revision and texts. The following variables hold the @@ -98,14 +99,15 @@ class BackupDumperPageTest extends DumpTestCase { function testFullTextPlain() { // Preparing the dump $fname = $this->getNewTempFile(); - $dumper = new BackupDumper( array( "--output=file:" . $fname ) ); + + $dumper = new DumpBackup(); + $dumper->loadWithArgv( array( '--full', '--quiet', '--output', 'file:' . $fname ) ); $dumper->startId = $this->pageId1; $dumper->endId = $this->pageId4 + 1; - $dumper->reporting = false; $dumper->setDb( $this->db ); // Performing the dump - $dumper->dump( WikiExporter::FULL, WikiExporter::TEXT ); + $dumper->execute(); // Checking the dumped data $this->assertDumpStart( $fname ); @@ -153,14 +155,15 @@ class BackupDumperPageTest extends DumpTestCase { function testFullStubPlain() { // Preparing the dump $fname = $this->getNewTempFile(); - $dumper = new BackupDumper( array( "--output=file:" . $fname ) ); + + $dumper = new DumpBackup(); + $dumper->loadWithArgv( array( '--full', '--quiet', '--output', 'file:' . 
$fname, '--stub' ) ); $dumper->startId = $this->pageId1; $dumper->endId = $this->pageId4 + 1; - $dumper->reporting = false; $dumper->setDb( $this->db ); // Performing the dump - $dumper->dump( WikiExporter::FULL, WikiExporter::STUB ); + $dumper->execute(); // Checking the dumped data $this->assertDumpStart( $fname ); @@ -202,7 +205,8 @@ class BackupDumperPageTest extends DumpTestCase { function testCurrentStubPlain() { // Preparing the dump $fname = $this->getNewTempFile(); - $dumper = new BackupDumper( array( "--output=file:" . $fname ) ); + + $dumper = new DumpBackup( array( '--output', 'file:' . $fname ) ); $dumper->startId = $this->pageId1; $dumper->endId = $this->pageId4 + 1; $dumper->reporting = false; @@ -247,7 +251,8 @@ class BackupDumperPageTest extends DumpTestCase { // Preparing the dump $fname = $this->getNewTempFile(); - $dumper = new BackupDumper( array( "--output=gzip:" . $fname ) ); + + $dumper = new DumpBackup( array( '--output', 'gzip:' . $fname ) ); $dumper->startId = $this->pageId1; $dumper->endId = $this->pageId4 + 1; $dumper->reporting = false; @@ -306,7 +311,7 @@ class BackupDumperPageTest extends DumpTestCase { $fnameMetaCurrent = $this->getNewTempFile(); $fnameArticles = $this->getNewTempFile(); - $dumper = new BackupDumper( array( "--output=gzip:" . $fnameMetaHistory, + $dumper = new DumpBackup( array( "--full", "--stub", "--output=gzip:" . $fnameMetaHistory, "--output=gzip:" . $fnameMetaCurrent, "--filter=latest", "--output=gzip:" . $fnameArticles, "--filter=latest", "--filter=notalk", "--filter=namespace:!NS_USER", -- 2.20.1