From ee372c39513f5153696ccad2b6f4aa69531b5632 Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Sun, 16 Oct 2005 17:33:41 +0000 Subject: [PATCH] * Two-pass data dump for friendliness to the DB (--stub, then dumpTextPass.php) * Data dump 'prefetch' mode to read normalized text from a prior dump (requires PHP 5, XMLReader extension) Maybe these will keep dammit from killing our dump runs... --- RELEASE-NOTES | 3 + includes/Export.php | 46 +++++-- maintenance/backup.inc | 227 +++++++++++++++++++++++++++++++++ maintenance/backupPrefetch.inc | 141 ++++++++++++++++++++ maintenance/dumpBackup.php | 204 +---------------------------- maintenance/dumpTextPass.php | 202 +++++++++++++++++++++++++++++ 6 files changed, 612 insertions(+), 211 deletions(-) create mode 100644 maintenance/backup.inc create mode 100644 maintenance/backupPrefetch.inc create mode 100644 maintenance/dumpTextPass.php diff --git a/RELEASE-NOTES b/RELEASE-NOTES index 97ecea5b44..10421f8f7b 100644 --- a/RELEASE-NOTES +++ b/RELEASE-NOTES @@ -152,6 +152,9 @@ fully support the editing toolbar, but was found to be too confusing. 
* Fix for hook callbacks on objects containing no fields * (bug 3711) Removed invisible unicode characters from LanguageHu * (bug 2330) Don't do funny thinks with "links" in MediaWiki:Undeletedtext +* Two-pass data dump for friendliness to the DB (--stub, then dumpTextPass.php) +* Data dump 'prefetch' mode to read normalized text from a prior dump + (requires PHP 5, XMLReader extension) === Caveats === diff --git a/includes/Export.php b/includes/Export.php index 7bbc667733..c57831f2cf 100644 --- a/includes/Export.php +++ b/includes/Export.php @@ -31,6 +31,9 @@ define( 'MW_EXPORT_CURRENT', 1 ); define( 'MW_EXPORT_BUFFER', 0 ); define( 'MW_EXPORT_STREAM', 1 ); +define( 'MW_EXPORT_TEXT', 0 ); +define( 'MW_EXPORT_STUB', 1 ); + /** * @package MediaWiki @@ -49,12 +52,13 @@ class WikiExporter { * @param int $buffer one of MW_EXPORT_BUFFER or MW_EXPORT_STREAM */ function WikiExporter( &$db, $history = MW_EXPORT_CURRENT, - $buffer = MW_EXPORT_BUFFER ) { + $buffer = MW_EXPORT_BUFFER, $text = MW_EXPORT_TEXT ) { $this->db =& $db; $this->history = $history; $this->buffer = $buffer; $this->writer = new XmlDumpWriter(); $this->sink = new DumpOutput(); + $this->text = $text; } /** @@ -158,13 +162,21 @@ class WikiExporter { $pageindex = ''; $revindex = ''; } - $result = $this->db->query( - "SELECT * FROM - $page $pageindex, - $revision $revindex, - $text - WHERE $where $join AND rev_text_id=old_id - ORDER BY page_id", $fname ); + if( $this->text == MW_EXPORT_STUB ) { + $sql = "SELECT * FROM + $page $pageindex, + $revision $revindex + WHERE $where $join + ORDER BY page_id"; + } else { + $sql = "SELECT * FROM + $page $pageindex, + $revision $revindex, + $text + WHERE $where $join AND rev_text_id=old_id + ORDER BY page_id"; + } + $result = $this->db->query( $sql, $fname ); $wrapper = $this->db->resultObject( $result ); $this->outputStream( $wrapper ); @@ -363,11 +375,19 @@ class XmlDumpWriter { if( $row->rev_comment != '' ) { $out .= " " . 
wfElementClean( 'comment', null, strval( $row->rev_comment ) ) . "\n"; } - - $text = strval( Revision::getRevisionText( $row ) ); - $out .= " " . wfElementClean( 'text', - array( 'xml:space' => 'preserve' ), - strval( $text ) ) . "\n"; + + if( isset( $row->old_text ) ) { + // Raw text from the database may have invalid chars + $text = strval( Revision::getRevisionText( $row ) ); + $out .= " " . wfElementClean( 'text', + array( 'xml:space' => 'preserve' ), + strval( $text ) ) . "\n"; + } else { + // Stub output + $out .= " " . wfElement( 'text', + array( 'id' => $row->rev_text_id ), + "" ) . "\n"; + } $out .= " \n"; diff --git a/maintenance/backup.inc b/maintenance/backup.inc new file mode 100644 index 0000000000..df9821839b --- /dev/null +++ b/maintenance/backup.inc @@ -0,0 +1,227 @@ + + * http://www.mediawiki.org/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * http://www.gnu.org/copyleft/gpl.html + * + * @package MediaWiki + * @subpackage SpecialPage + */ + + +class BackupDumper { + var $reportingInterval = 100; + var $reporting = true; + var $pageCount = 0; + var $revCount = 0; + var $server = null; // use default + var $pages = null; // all pages + var $skipHeader = false; // don't output and + var $skipFooter = false; // don't output + var $startId = 0; + var $endId = 0; + var $sink = null; // Output filters + var $stubText = false; // include rev_text_id instead of text; for 2-pass dump + + function BackupDumper( $args ) { + $this->stderr = fopen( "php://stderr", "wt" ); + $this->sink = $this->processArgs( $args ); + } + + /** + * @param array $args + * @return array + * @static + */ + function processArgs( $args ) { + $outputTypes = array( + 'file' => 'DumpFileOutput', + 'gzip' => 'DumpGZipOutput', + 'bzip2' => 'DumpBZip2Output', + '7zip' => 'Dump7ZipOutput' ); + $filterTypes = array( + 'latest' => 'DumpLatestFilter', + 'notalk' => 'DumpNotalkFilter', + 'namespace' => 'DumpNamespaceFilter' ); + $sink = null; + $sinks = array(); + foreach( $args as $arg ) { + if( preg_match( '/^--(.+?)(?:=(.+?)(?::(.+?))?)?$/', $arg, $matches ) ) { + @list( $full, $opt, $val, $param ) = $matches; + switch( $opt ) { + case "output": + if( !is_null( $sink ) ) { + $sinks[] = $sink; + } + if( !isset( $outputTypes[$val] ) ) { + die( "Unrecognized output sink type '$val'\n" ); + } + $type = $outputTypes[$val]; + $sink = new $type( $param ); + break; + case "filter": + if( is_null( $sink ) ) { + $this->progress( "Warning: assuming stdout for filter output\n" ); + $sink = new DumpOutput(); + } + if( !isset( $filterTypes[$val] ) ) { + die( "Unrecognized filter type '$val'\n" ); + } + $type = $filterTypes[$val]; + $filter = new $type( $sink, $param ); + + // references are lame in php... 
+ unset( $sink ); + $sink = $filter; + + break; + default: + $this->processOption( $opt, $val, $param ); + } + } + } + + if( is_null( $sink ) ) { + $sink = new DumpOutput(); + } + $sinks[] = $sink; + + if( count( $sinks ) > 1 ) { + return new DumpMultiWriter( $sinks ); + } else { + return $sink; + } + } + + function processOption( $opt, $val, $param ) { + // extension point for subclasses to add options + } + + function dump( $history, $text = MW_EXPORT_TEXT ) { + # This shouldn't happen if on console... ;) + header( 'Content-type: text/html; charset=UTF-8' ); + + # Notice messages will foul up your XML output even if they're + # relatively harmless. + ini_set( 'display_errors', false ); + + $this->startTime = wfTime(); + + $dbr =& wfGetDB( DB_SLAVE ); + $this->maxCount = $dbr->selectField( 'page', 'MAX(page_id)', '', 'BackupDumper::dump' ); + $this->startTime = wfTime(); + + $db =& $this->backupDb(); + $exporter = new WikiExporter( $db, $history, MW_EXPORT_STREAM, $text ); + + $wrapper = new ExportProgressFilter( $this->sink, $this ); + $exporter->setOutputSink( $wrapper ); + + if( !$this->skipHeader ) + $exporter->openStream(); + + if( is_null( $this->pages ) ) { + if( $this->startId || $this->endId ) { + $exporter->pagesByRange( $this->startId, $this->endId ); + } else { + $exporter->allPages(); + } + } else { + $exporter->pagesByName( $this->pages ); + } + + if( !$this->skipFooter ) + $exporter->closeStream(); + + $this->report( true ); + } + + function &backupDb() { + global $wgDBadminuser, $wgDBadminpassword; + global $wgDBname; + $db =& new Database( $this->backupServer(), $wgDBadminuser, $wgDBadminpassword, $wgDBname ); + $timeout = 3600 * 24; + $db->query( "SET net_read_timeout=$timeout" ); + $db->query( "SET net_write_timeout=$timeout" ); + return $db; + } + + function backupServer() { + global $wgDBserver; + return $this->server + ? 
$this->server + : $wgDBserver; + } + + function reportPage() { + $this->pageCount++; + $this->report(); + } + + function revCount() { + $this->revCount++; + } + + function report( $final = false ) { + if( $final xor ( $this->pageCount % $this->reportingInterval == 0 ) ) { + $this->showReport(); + } + } + + function showReport() { + if( $this->reporting ) { + $delta = wfTime() - $this->startTime; + $now = wfTimestamp( TS_DB ); + if( $delta ) { + $rate = $this->pageCount / $delta; + $revrate = $this->revCount / $delta; + $portion = $this->pageCount / $this->maxCount; + $eta = $this->startTime + $delta / $portion; + $etats = wfTimestamp( TS_DB, intval( $eta ) ); + } else { + $rate = '-'; + $revrate = '-'; + $etats = '-'; + } + global $wgDBname; + $this->progress( "$now: $wgDBname $this->pageCount, ETA $etats ($rate pages/sec $revrate revs/sec)" ); + } + } + + function progress( $string ) { + fwrite( $this->stderr, $string . "\n" ); + } +} + +class ExportProgressFilter extends DumpFilter { + function ExportProgressFilter( &$sink, &$progress ) { + parent::DumpFilter( $sink ); + $this->progress = $progress; + } + + function writeClosePage( $string ) { + parent::writeClosePage( $string ); + $this->progress->reportPage(); + } + + function writeRevision( $rev, $string ) { + parent::writeRevision( $rev, $string ); + $this->progress->revCount(); + } +} + +?> diff --git a/maintenance/backupPrefetch.inc b/maintenance/backupPrefetch.inc new file mode 100644 index 0000000000..ddb0656e0a --- /dev/null +++ b/maintenance/backupPrefetch.inc @@ -0,0 +1,141 @@ +reader = new XMLReader(); + $this->reader->open( $infile ); + } + + /** + * Attempts to fetch the text of a particular page revision + * from the dump stream. May return null if the page is + * unavailable. 
+	 *
+	 * @param int $page ID number of page to read
+	 * @param int $rev ID number of revision to read
+	 * @return string or null
+	 */
+	function prefetch( $page, $rev ) {
+		// Both seek loops stop on atEnd, which close() sets once the
+		// reader has run out of input.
+		while( $this->lastPage < $page && !$this->atEnd ) {
+			$this->nextPage();
+		}
+		if( $this->lastPage > $page || $this->atEnd ) {
+			$this->debug( "BaseDump::prefetch already past page $page looking for rev $rev\n" );
+			return null;
+		}
+		while( $this->lastRev < $rev && !$this->atEnd ) {
+			$this->nextRev();
+		}
+		if( $this->lastRev == $rev ) {
+			$this->debug( "BaseDump::prefetch hit on $page, $rev\n" );
+			// nextText() yields null if the input ends before the text is
+			// complete, so the caller can fall back to another source.
+			return $this->nextText();
+		} else {
+			$this->debug( "BaseDump::prefetch already past rev $rev on page $page\n" );
+			return null;
+		}
+	}
+
+	/**
+	 * Sends a diagnostic message to the wfDebug() log.
+	 *
+	 * @param string $str message text
+	 */
+	function debug( $str ) {
+		wfDebug( $str );
+		//global $dumper;
+		//$dumper->progress( $str );
+	}
+
+	/**
+	 * Moves to the next 'page' element and records its 'id' in lastPage,
+	 * resetting lastRev. If the input runs out first, nothing is recorded;
+	 * skipTo() has then closed the reader and set atEnd.
+	 *
+	 * @access private
+	 */
+	function nextPage() {
+		// Only read the ID when both seeks succeeded: after a failed
+		// skipTo() the reader is null and must not be touched.
+		if( $this->skipTo( 'page' ) && $this->skipTo( 'id' ) ) {
+			$this->lastPage = intval( $this->nodeContents() );
+			$this->lastRev = 0;
+		}
+	}
+
+	/**
+	 * Moves to the next 'revision' element and records its 'id' in lastRev.
+	 * Leaves lastRev untouched when the input runs out first.
+	 *
+	 * @access private
+	 */
+	function nextRev() {
+		if( $this->skipTo( 'revision' ) && $this->skipTo( 'id' ) ) {
+			$this->lastRev = intval( $this->nodeContents() );
+		}
+	}
+
+	/**
+	 * Moves to the next 'text' element and returns its contents.
+	 *
+	 * @return string or null; null when the input ends before a complete
+	 *         'text' element could be read
+	 * @access private
+	 */
+	function nextText() {
+		if( !$this->skipTo( 'text' ) ) {
+			return null;
+		}
+		$text = $this->nodeContents();
+		// nodeContents() returns false (via close()) when the input ends
+		// inside the element; don't report that as an empty-text hit.
+		if( $text === false ) {
+			return null;
+		}
+		return strval( $text );
+	}
+
+	/**
+	 * Advances the reader to the next opening element named $name.
+	 *
+	 * @param string $name element name to look for
+	 * @return bool true when positioned on a matching element; false when
+	 *         the input is exhausted (the reader is then closed and atEnd set)
+	 * @access private
+	 */
+	function skipTo( $name ) {
+		if( $this->atEnd ) {
+			// close() has already discarded the reader; calling read()
+			// on it again would be a fatal error.
+			return false;
+		}
+		while( $this->reader->read() ) {
+			if( $this->reader->nodeType == XMLREADER_ELEMENT &&
+				$this->reader->name == $name ) {
+				return true;
+			}
+		}
+		return $this->close();
+	}
+
+	/**
+	 * Shouldn't something like this be built-in to XMLReader?
+	 * Fetches text contents of the current element, assuming
+	 * no sub-elements or such scary things.
+ * @return string + * @access private + */ + function nodeContents() { + if( $this->reader->isEmptyElement ) { + return ""; + } + $buffer = ""; + while( $this->reader->read() ) { + switch( $this->reader->nodeType ) { + case XMLREADER_TEXT: +// case XMLREADER_WHITESPACE: + case XMLREADER_SIGNIFICANT_WHITESPACE: + $buffer .= $this->reader->value; + break; + case XMLREADER_END_ELEMENT: + return $buffer; + } + } + return $this->close(); + } + + /** + * @access private + */ + function close() { + $this->reader->close(); + $this->reader = null; + $this->atEnd = true; + return false; + } +} + +?> diff --git a/maintenance/dumpBackup.php b/maintenance/dumpBackup.php index afa9dba5a4..065eb7e0a7 100644 --- a/maintenance/dumpBackup.php +++ b/maintenance/dumpBackup.php @@ -28,202 +28,7 @@ $optionsWithArgs = array( 'server', 'pagelist', 'start', 'end' ); require_once( 'commandLine.inc' ); require_once( 'SpecialExport.php' ); - -class BackupDumper { - var $reportingInterval = 100; - var $reporting = true; - var $pageCount = 0; - var $revCount = 0; - var $server = null; // use default - var $pages = null; // all pages - var $skipHeader = false; // don't output and - var $skipFooter = false; // don't output - var $startId = 0; - var $endId = 0; - var $sink = null; // Output filters - - function BackupDumper( $args ) { - $this->stderr = fopen( "php://stderr", "wt" ); - $this->sink = $this->processArgs( $args ); - } - - /** - * @param array $args - * @return array - * @static - */ - function processArgs( $args ) { - $outputTypes = array( - 'file' => 'DumpFileOutput', - 'gzip' => 'DumpGZipOutput', - 'bzip2' => 'DumpBZip2Output', - '7zip' => 'Dump7ZipOutput' ); - $filterTypes = array( - 'latest' => 'DumpLatestFilter', - 'notalk' => 'DumpNotalkFilter', - 'namespace' => 'DumpNamespaceFilter' ); - $sink = null; - $sinks = array(); - foreach( $args as $arg ) { - if( preg_match( '/^--(.+?)(?:=(.+?)(?::(.+?))?)?$/', $arg, $matches ) ) { - @list( $full, $opt, $val, $param ) = $matches; - 
switch( $opt ) { - case "output": - if( !is_null( $sink ) ) { - $sinks[] = $sink; - } - if( !isset( $outputTypes[$val] ) ) { - die( "Unrecognized output sink type '$val'\n" ); - } - $type = $outputTypes[$val]; - $sink = new $type( $param ); - break; - case "filter": - if( is_null( $sink ) ) { - $this->progress( "Warning: assuming stdout for filter output\n" ); - $sink = new DumpOutput(); - } - if( !isset( $filterTypes[$val] ) ) { - die( "Unrecognized filter type '$val'\n" ); - } - $type = $filterTypes[$val]; - $filter = new $type( $sink, $param ); - - // references are lame in php... - unset( $sink ); - $sink = $filter; - - break; - default: - //die( "Unrecognized dump option'$opt'\n" ); - } - } - } - - if( is_null( $sink ) ) { - $sink = new DumpOutput(); - } - $sinks[] = $sink; - - if( count( $sinks ) > 1 ) { - return new DumpMultiWriter( $sinks ); - } else { - return $sink; - } - } - - function dump( $history ) { - # This shouldn't happen if on console... ;) - header( 'Content-type: text/html; charset=UTF-8' ); - - # Notice messages will foul up your XML output even if they're - # relatively harmless. 
- ini_set( 'display_errors', false ); - - $this->startTime = wfTime(); - - $dbr =& wfGetDB( DB_SLAVE ); - $this->maxCount = $dbr->selectField( 'page', 'MAX(page_id)', '', 'BackupDumper::dump' ); - $this->startTime = wfTime(); - - $db =& $this->backupDb(); - $exporter = new WikiExporter( $db, $history, MW_EXPORT_STREAM ); - - $wrapper = new ExportProgressFilter( $this->sink, $this ); - $exporter->setOutputSink( $wrapper ); - - if( !$this->skipHeader ) - $exporter->openStream(); - - if( is_null( $this->pages ) ) { - if( $this->startId || $this->endId ) { - $exporter->pagesByRange( $this->startId, $this->endId ); - } else { - $exporter->allPages(); - } - } else { - $exporter->pagesByName( $this->pages ); - } - - if( !$this->skipFooter ) - $exporter->closeStream(); - - $this->report( true ); - } - - function &backupDb() { - global $wgDBadminuser, $wgDBadminpassword; - global $wgDBname; - $db =& new Database( $this->backupServer(), $wgDBadminuser, $wgDBadminpassword, $wgDBname ); - $timeout = 3600 * 24; - $db->query( "SET net_read_timeout=$timeout" ); - $db->query( "SET net_write_timeout=$timeout" ); - return $db; - } - - function backupServer() { - global $wgDBserver; - return $this->server - ? 
$this->server - : $wgDBserver; - } - - function reportPage() { - $this->pageCount++; - $this->report(); - } - - function revCount() { - $this->revCount++; - } - - function report( $final = false ) { - if( $final xor ( $this->pageCount % $this->reportingInterval == 0 ) ) { - $this->showReport(); - } - } - - function showReport() { - if( $this->reporting ) { - $delta = wfTime() - $this->startTime; - $now = wfTimestamp( TS_DB ); - if( $delta ) { - $rate = $this->pageCount / $delta; - $revrate = $this->revCount / $delta; - $portion = $this->pageCount / $this->maxCount; - $eta = $this->startTime + $delta / $portion; - $etats = wfTimestamp( TS_DB, intval( $eta ) ); - } else { - $rate = '-'; - $revrate = '-'; - $etats = '-'; - } - global $wgDBname; - $this->progress( "$now: $wgDBname $this->pageCount, ETA $etats ($rate pages/sec $revrate revs/sec)" ); - } - } - - function progress( $string ) { - fwrite( $this->stderr, $string . "\n" ); - } -} - -class ExportProgressFilter extends DumpFilter { - function ExportProgressFilter( &$sink, &$progress ) { - parent::DumpFilter( $sink ); - $this->progress = $progress; - } - - function writeClosePage( $string ) { - parent::writeClosePage( $string ); - $this->progress->reportPage(); - } - - function writeRevision( $rev, $string ) { - parent::writeRevision( $rev, $string ); - $this->progress->revCount(); - } -} +require_once( 'maintenance/backup.inc' ); $dumper = new BackupDumper( $argv ); @@ -259,10 +64,12 @@ if( isset( $options['end'] ) ) { $dumper->skipHeader = isset( $options['skip-header'] ); $dumper->skipFooter = isset( $options['skip-footer'] ); +$textMode = isset( $options['stub'] ) ? 
MW_EXPORT_STUB : MW_EXPORT_TEXT; + if( isset( $options['full'] ) ) { - $dumper->dump( MW_EXPORT_FULL ); + $dumper->dump( MW_EXPORT_FULL, $textMode ); } elseif( isset( $options['current'] ) ) { - $dumper->dump( MW_EXPORT_CURRENT ); + $dumper->dump( MW_EXPORT_CURRENT, $textMode ); } else { $dumper->progress( << header --skip-footer Don't output the footer + --stub Don't perform old_text lookups; for 2-pass dump END ); } diff --git a/maintenance/dumpTextPass.php b/maintenance/dumpTextPass.php new file mode 100644 index 0000000000..bb1b194b32 --- /dev/null +++ b/maintenance/dumpTextPass.php @@ -0,0 +1,202 @@ + + * http://www.mediawiki.org/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @package MediaWiki + * @subpackage SpecialPage + */ + +$originalDir = getcwd(); + +$optionsWithArgs = array( 'server', 'pagelist', 'start', 'end' ); + +require_once( 'commandLine.inc' ); +require_once( 'SpecialExport.php' ); +require_once( 'maintenance/backup.inc' ); + +class TextPassDumper extends BackupDumper { + var $prefetch = null; + + function dump() { + # This shouldn't happen if on console... ;) + header( 'Content-type: text/html; charset=UTF-8' ); + + # Notice messages will foul up your XML output even if they're + # relatively harmless. 
+// ini_set( 'display_errors', false ); + + $this->startTime = wfTime(); + + $this->db =& wfGetDB( DB_SLAVE ); + $this->maxCount = $this->db->selectField( 'page', 'MAX(page_id)', '', 'BackupDumper::dump' ); + $this->startTime = wfTime(); + + $this->egress = new ExportProgressFilter( $this->sink, $this ); + + $input = fopen( "php://stdin", "rt" ); + $result = $this->readDump( $input ); + + if( WikiError::isError( $result ) ) { + $this->progress( $result->getMessage() ); + } + + $this->report( true ); + } + + function processOption( $opt, $val, $param ) { + if( $opt == 'prefetch' ) { + require_once 'maintenance/backupPrefetch.inc'; + $this->prefetch = new BaseDump( $val ); + } + } + + function readDump( $input ) { + $this->buffer = ""; + $this->openElement = false; + $this->atStart = true; + $this->state = ""; + $this->lastName = ""; + $this->thisPage = 0; + $this->thisRev = 0; + + $parser = xml_parser_create( "UTF-8" ); + xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false ); + + xml_set_element_handler( $parser, array( &$this, 'startElement' ), array( &$this, 'endElement' ) ); + xml_set_character_data_handler( $parser, array( &$this, 'characterData' ) ); + + $offset = 0; // for context extraction on error reporting + $bufferSize = 512 * 1024; + do { + $chunk = fread( $input, $bufferSize ); + if( !xml_parse( $parser, $chunk, feof( $input ) ) ) { + wfDebug( "TextDumpPass::readDump encountered XML parsing error\n" ); + return new WikiXmlError( $parser, 'XML import parse failure', $chunk, $offset ); + } + $offset += strlen( $chunk ); + } while( $chunk !== false && !feof( $input ) ); + xml_parser_free( $parser ); + } + + function getText( $id ) { + if( isset( $this->prefetch ) ) { + $text = $this->prefetch->prefetch( $this->thisPage, $this->thisRev ); + if( !is_null( $text ) ) + return $text; + } + $id = intval( $id ); + $row = $this->db->selectRow( 'text', + array( 'old_text', 'old_flags' ), + array( 'old_id' => $id ), + 'TextPassDumper::getText' ); + return 
UtfNormal::cleanUp( strval( Revision::getRevisionText( $row ) ) ); + } + + function startElement( $parser, $name, $attribs ) { + $this->clearOpenElement( null ); + $this->lastName = $name; + + if( $name == 'revision' ) { + $this->state = $name; + $this->egress->writeOpenPage( null, $this->buffer ); + $this->buffer = ""; + } elseif( $name == 'page' ) { + $this->state = $name; + if( $this->atStart ) { + $this->egress->writeOpenStream( $this->buffer ); + $this->buffer = ""; + $this->atStart = false; + } + } + + if( $name == "text" && isset( $attribs['id'] ) ) { + $text = $this->getText( $attribs['id'] ); + $this->openElement = array( $name, array( 'xml:space' => 'preserve' ) ); + if( strlen( $text ) > 0 ) { + $this->characterData( $parser, $text ); + } + } else { + $this->openElement = array( $name, $attribs ); + } + } + + function endElement( $parser, $name ) { + if( $this->openElement ) { + $this->clearOpenElement( "" ); + } else { + $this->buffer .= ""; + } + + if( $name == 'revision' ) { + $this->egress->writeRevision( null, $this->buffer ); + $this->buffer = ""; + } elseif( $name == 'page' ) { + $this->egress->writeClosePage( $this->buffer ); + $this->buffer = ""; + } elseif( $name == 'mediawiki' ) { + $this->egress->writeCloseStream( $this->buffer ); + $this->buffer = ""; + } + } + + function characterData( $parser, $data ) { + $this->clearOpenElement( null ); + if( $this->lastName == "id" ) { + if( $this->state == "revision" ) { + $this->thisRev = intval( $data ); + } elseif( $this->state == "page" ) { + $this->thisPage = intval( $data ); + } + } + $this->buffer .= htmlspecialchars( $data ); + } + + function clearOpenElement( $style ) { + if( $this->openElement ) { + $this->buffer .= wfElement( $this->openElement[0], $this->openElement[1], $style ); + $this->openElement = false; + } + } +} + + +$dumper = new TextPassDumper( $argv ); + +if( true ) { + $dumper->dump(); +} else { + $dumper->progress( <<] +Options: + --prefetch Use a prior dump file as a text 
source where possible. + (Requires PHP 5.0+ and the XMLReader PECL extension) + --quiet Don't dump status reports to stderr. + --report=n Report position and speed after every n pages processed. + (Default: 100) +END +); +} + +?> -- 2.20.1