* Two-pass data dump for friendliness to the DB (--stub, then dumpTextPass.php)
author Brion Vibber <brion@users.mediawiki.org>
Sun, 16 Oct 2005 17:33:41 +0000 (17:33 +0000)
committer Brion Vibber <brion@users.mediawiki.org>
Sun, 16 Oct 2005 17:33:41 +0000 (17:33 +0000)
* Data dump 'prefetch' mode to read normalized text from a prior dump
  (requires PHP 5, XMLReader extension)
Maybe these will keep dammit from killing our dump runs...

RELEASE-NOTES
includes/Export.php
maintenance/backup.inc [new file with mode: 0644]
maintenance/backupPrefetch.inc [new file with mode: 0644]
maintenance/dumpBackup.php
maintenance/dumpTextPass.php [new file with mode: 0644]

index 97ecea5..10421f8 100644 (file)
@@ -152,6 +152,9 @@ fully support the editing toolbar, but was found to be too confusing.
 * Fix for hook callbacks on objects containing no fields
 * (bug 3711) Removed invisible unicode characters from LanguageHu
 * (bug 2330) Don't do funny thinks with "links" in MediaWiki:Undeletedtext
+* Two-pass data dump for friendliness to the DB (--stub, then dumpTextPass.php)
+* Data dump 'prefetch' mode to read normalized text from a prior dump
+  (requires PHP 5, XMLReader extension)
 
 
 === Caveats ===
index 7bbc667..c57831f 100644 (file)
@@ -31,6 +31,9 @@ define( 'MW_EXPORT_CURRENT',  1 );
 define( 'MW_EXPORT_BUFFER',   0 );
 define( 'MW_EXPORT_STREAM',   1 );
 
+define( 'MW_EXPORT_TEXT',     0 );
+define( 'MW_EXPORT_STUB',     1 );
+
 
 /**
  * @package MediaWiki
@@ -49,12 +52,13 @@ class WikiExporter {
         * @param int $buffer one of MW_EXPORT_BUFFER or MW_EXPORT_STREAM
         */
        function WikiExporter( &$db, $history = MW_EXPORT_CURRENT,
-                       $buffer = MW_EXPORT_BUFFER ) {
+                       $buffer = MW_EXPORT_BUFFER, $text = MW_EXPORT_TEXT ) {
                $this->db =& $db;
                $this->history = $history;
                $this->buffer  = $buffer;
                $this->writer  = new XmlDumpWriter();
                $this->sink    = new DumpOutput();
+               $this->text    = $text;
        }
        
        /**
@@ -158,13 +162,21 @@ class WikiExporter {
                        $pageindex = '';
                        $revindex = '';
                }
-               $result = $this->db->query(
-                       "SELECT * FROM
-                               $page $pageindex,
-                               $revision $revindex,
-                               $text
-                               WHERE $where $join AND rev_text_id=old_id
-                               ORDER BY page_id", $fname );
+               if( $this->text == MW_EXPORT_STUB ) {
+                       $sql = "SELECT * FROM
+                                       $page $pageindex,
+                                       $revision $revindex
+                                       WHERE $where $join
+                                       ORDER BY page_id";
+               } else {
+                       $sql = "SELECT * FROM
+                                       $page $pageindex,
+                                       $revision $revindex,
+                                       $text
+                                       WHERE $where $join AND rev_text_id=old_id
+                                       ORDER BY page_id";
+               }
+               $result = $this->db->query( $sql, $fname );
                $wrapper = $this->db->resultObject( $result );
                $this->outputStream( $wrapper );
                
@@ -363,11 +375,19 @@ class XmlDumpWriter {
                if( $row->rev_comment != '' ) {
                        $out .= "      " . wfElementClean( 'comment', null, strval( $row->rev_comment ) ) . "\n";
                }
-       
-               $text = strval( Revision::getRevisionText( $row ) );
-               $out .= "      " . wfElementClean( 'text',
-                       array( 'xml:space' => 'preserve' ),
-                       strval( $text ) ) . "\n";
+               
+               if( isset( $row->old_text ) ) {
+                       // Raw text from the database may have invalid chars
+                       $text = strval( Revision::getRevisionText( $row ) );
+                       $out .= "      " . wfElementClean( 'text',
+                               array( 'xml:space' => 'preserve' ),
+                               strval( $text ) ) . "\n";
+               } else {
+                       // Stub output
+                       $out .= "      " . wfElement( 'text',
+                               array( 'id' => $row->rev_text_id ),
+                               "" ) . "\n";
+               }
                
                $out .= "    </revision>\n";
                
diff --git a/maintenance/backup.inc b/maintenance/backup.inc
new file mode 100644 (file)
index 0000000..df98218
--- /dev/null
@@ -0,0 +1,227 @@
+<?php
+/**
+ * Copyright (C) 2005 Brion Vibber <brion@pobox.com>
+ * http://www.mediawiki.org/
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or 
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @package MediaWiki
+ * @subpackage SpecialPage
+ */
+
+
+class BackupDumper { // command-line dump driver: builds output sinks, runs WikiExporter, reports progress on stderr
+       var $reportingInterval = 100; // number of pages between progress reports
+       var $reporting = true; // when false, showReport() prints nothing
+       var $pageCount = 0; // pages finished so far; incremented by reportPage()
+       var $revCount  = 0; // revisions written so far; incremented by revCount()
+       var $server    = null; // use default
+       var $pages     = null; // all pages
+       var $skipHeader = false; // don't output <mediawiki> and <siteinfo>
+       var $skipFooter = false; // don't output </mediawiki>
+       var $startId    = 0; // lower bound handed to pagesByRange(); 0 means unset
+       var $endId      = 0; // upper bound handed to pagesByRange(); 0 means unset
+       var $sink       = null; // Output filters
+       var $stubText   = false; // include rev_text_id instead of text; for 2-pass dump. NOTE(review): not read in this class, dump() takes the mode as $text
+       
+       function BackupDumper( $args ) { // $args: list of command-line argument strings
+               $this->stderr = fopen( "php://stderr", "wt" ); // opened first: processArgs() may already call progress()
+               $this->sink = $this->processArgs( $args );
+       }
+       
+       /**
+        * Build the output sink chain from "--output=type:param" and "--filter=type:param" arguments.
+        * @param array $args command-line argument strings; other --options go to processOption()
+        * @return object the single sink, or a DumpMultiWriter when several output chains were given
+        */
+       function processArgs( $args ) {
+               $outputTypes = array(
+                       'file'  => 'DumpFileOutput',
+                       'gzip'  => 'DumpGZipOutput',
+                       'bzip2' => 'DumpBZip2Output',
+                       '7zip'  => 'Dump7ZipOutput' );
+               $filterTypes = array(
+                       'latest'    => 'DumpLatestFilter',
+                       'notalk'    => 'DumpNotalkFilter',
+                       'namespace' => 'DumpNamespaceFilter' );
+               $sink = null; // chain currently being built
+               $sinks = array(); // chains already completed by a later --output
+               foreach( $args as $arg ) {
+                       if( preg_match( '/^--(.+?)(?:=(.+?)(?::(.+?))?)?$/', $arg, $matches ) ) { // --opt, --opt=val or --opt=val:param
+                               @list( $full, $opt, $val, $param ) = $matches; // @: $val and $param are missing from $matches when not given
+                               switch( $opt ) {
+                               case "output":
+                                       if( !is_null( $sink ) ) { // a new --output closes the previous chain
+                                               $sinks[] = $sink;
+                                       }
+                                       if( !isset( $outputTypes[$val] ) ) {
+                                               die( "Unrecognized output sink type '$val'\n" );
+                                       }
+                                       $type = $outputTypes[$val];
+                                       $sink = new $type( $param );
+                                       break;
+                               case "filter":
+                                       if( is_null( $sink ) ) { // filter given before any --output
+                                               $this->progress( "Warning: assuming stdout for filter output\n" );
+                                               $sink = new DumpOutput();
+                                       }
+                                       if( !isset( $filterTypes[$val] ) ) {
+                                               die( "Unrecognized filter type '$val'\n" );
+                                       }
+                                       $type = $filterTypes[$val];
+                                       $filter = new $type( $sink, $param ); // the filter wraps the current sink
+                                       
+                                       // references are lame in php...
+                                       unset( $sink );
+                                       $sink = $filter; // the filter becomes the head of the chain
+                                       
+                                       break;
+                               default:
+                                       $this->processOption( $opt, $val, $param );
+                               }
+                       }
+               }
+               
+               if( is_null( $sink ) ) { // no --output or --filter at all
+                       $sink = new DumpOutput();
+               }
+               $sinks[] = $sink;
+               
+               if( count( $sinks ) > 1 ) {
+                       return new DumpMultiWriter( $sinks );
+               } else {
+                       return $sink;
+               }
+       }
+       
+       function processOption( $opt, $val, $param ) {
+               // extension point for subclasses to add options
+       }
+       
+       function dump( $history, $text = MW_EXPORT_TEXT ) { // both arguments are passed straight to the WikiExporter constructor
+               # This shouldn't happen if on console... ;)
+               header( 'Content-type: text/html; charset=UTF-8' );
+               
+               # Notice messages will foul up your XML output even if they're
+               # relatively harmless.
+               ini_set( 'display_errors', false );
+               
+               $this->startTime = wfTime();
+               
+               $dbr =& wfGetDB( DB_SLAVE );
+               $this->maxCount = $dbr->selectField( 'page', 'MAX(page_id)', '', 'BackupDumper::dump' ); // highest page_id; denominator of the ETA in showReport()
+               $this->startTime = wfTime(); // set again after the query above
+               
+               $db =& $this->backupDb();
+               $exporter = new WikiExporter( $db, $history, MW_EXPORT_STREAM, $text );
+               
+               $wrapper = new ExportProgressFilter( $this->sink, $this ); // counts pages and revisions on their way to the sink
+               $exporter->setOutputSink( $wrapper );
+               
+               if( !$this->skipHeader )
+                       $exporter->openStream();
+
+               if( is_null( $this->pages ) ) {
+                       if( $this->startId || $this->endId ) {
+                               $exporter->pagesByRange( $this->startId, $this->endId );
+                       } else {
+                               $exporter->allPages();
+                       }
+               } else {
+                       $exporter->pagesByName( $this->pages );
+               }
+
+               if( !$this->skipFooter )
+                       $exporter->closeStream();
+               
+               $this->report( true ); // final report
+       }
+       
+       function &backupDb() { // opens a new Database connection with the admin credentials and 24-hour network timeouts
+               global $wgDBadminuser, $wgDBadminpassword;
+               global $wgDBname;
+               $db =& new Database( $this->backupServer(), $wgDBadminuser, $wgDBadminpassword, $wgDBname );
+               $timeout = 3600 * 24; // seconds
+               $db->query( "SET net_read_timeout=$timeout" );
+               $db->query( "SET net_write_timeout=$timeout" );
+               return $db;
+       }
+       
+       function backupServer() { // $this->server when set, otherwise $wgDBserver
+               global $wgDBserver;
+               return $this->server
+                       ? $this->server
+                       : $wgDBserver;
+       }
+
+       function reportPage() { // called by ExportProgressFilter after each page is closed
+               $this->pageCount++;
+               $this->report();
+       }
+       
+       function revCount() { // called by ExportProgressFilter after each revision is written
+               $this->revCount++;
+       }
+       
+       function report( $final = false ) {
+               if( $final xor ( $this->pageCount % $this->reportingInterval == 0 ) ) { // non-final: every $reportingInterval pages; final: only if the count is not on such a boundary
+                       $this->showReport();
+               }
+       }
+       
+       function showReport() { // prints one progress line via progress() unless $reporting is false
+               if( $this->reporting ) {
+                       $delta = wfTime() - $this->startTime; // elapsed time since dump() started
+                       $now = wfTimestamp( TS_DB );
+                       if( $delta ) { // skip the rates when no time has elapsed (would divide by zero)
+                               $rate = $this->pageCount / $delta;
+                               $revrate = $this->revCount / $delta;
+                               $portion = $this->pageCount / $this->maxCount; // fraction done, estimated as pages written over highest page_id
+                               $eta = $this->startTime + $delta / $portion; // linear extrapolation of the finish time
+                               $etats = wfTimestamp( TS_DB, intval( $eta ) );
+                       } else {
+                               $rate = '-';
+                               $revrate = '-';
+                               $etats = '-';
+                       }
+                       global $wgDBname;
+                       $this->progress( "$now: $wgDBname $this->pageCount, ETA $etats ($rate pages/sec $revrate revs/sec)" );
+               }
+       }
+       
+       function progress( $string ) { // writes $string plus a newline to the stderr handle opened in the constructor
+               fwrite( $this->stderr, $string . "\n" );
+       }
+}
+
+class ExportProgressFilter extends DumpFilter { // DumpFilter that notifies a progress tracker as pages and revisions pass through
+       function ExportProgressFilter( &$sink, &$progress ) { // $progress must provide reportPage() and revCount()
+               parent::DumpFilter( $sink );
+               $this->progress =& $progress; // bind by reference: a plain "=" copies the object under PHP 4, so the caller's counters would never advance
+       }
+
+       function writeClosePage( $string ) { // delegate to the parent, then count one finished page
+               parent::writeClosePage( $string );
+               $this->progress->reportPage();
+       }
+       
+       function writeRevision( $rev, $string ) { // delegate to the parent, then count one revision
+               parent::writeRevision( $rev, $string );
+               $this->progress->revCount();
+       }
+}
+
+?>
diff --git a/maintenance/backupPrefetch.inc b/maintenance/backupPrefetch.inc
new file mode 100644 (file)
index 0000000..ddb0656
--- /dev/null
@@ -0,0 +1,141 @@
+<?php
+
+/**
+ * Readahead helper for making large MediaWiki data dumps;
+ * reads in a previous XML dump to sequentially prefetch text
+ * records already normalized and decompressed.
+ *
+ * This can save load on the external database servers, hopefully.
+ *
+ * Assumes that dumps will be recorded in the canonical order:
+ * - ascending by page_id
+ * - ascending by rev_id within each page
+ * - text contents are immutable and should not change once
+ *   recorded, so the previous dump is a reliable source
+ *
+ * Requires PHP 5 and the XMLReader PECL extension.
+ */
+class BaseDump {
+       var $reader = null; // XMLReader over the previous dump; null once close() has run
+       var $atEnd = false; // true after the input is exhausted; every prefetch() is then a miss
+       var $lastPage = 0; // id of the last <page> read from the old dump
+       var $lastRev = 0; // id of the last <revision> read; reset to 0 on each new page
+       
+       function BaseDump( $infile ) { // $infile: location of the previous XML dump, passed to XMLReader::open()
+               $this->reader = new XMLReader();
+               $this->reader->open( $infile );
+       }
+       
+       /**
+        * Attempts to fetch the text of a particular page revision
+        * from the dump stream. Returns null if the page or revision
+        * is not found going forward, or the old dump has ended.
+        *
+        * @param int $page ID number of page to read
+        * @param int $rev ID number of revision to read
+        * @return string or null
+        */
+       function prefetch( $page, $rev ) {
+               while( $this->lastPage < $page && !$this->atEnd ) { // the stream only moves forward
+                       $this->nextPage();
+               }
+               if( $this->lastPage > $page || $this->atEnd ) {
+                       $this->debug( "BaseDump::prefetch already past page $page looking for rev $rev\n" );
+                       return null;
+               }
+               while( $this->lastRev < $rev && !$this->atEnd ) {
+                       $this->nextRev();
+               }
+               if( $this->lastRev == $rev ) {
+                       $this->debug( "BaseDump::prefetch hit on $page, $rev\n" );
+                       return $this->nextText();
+               } else {
+                       $this->debug( "BaseDump::prefetch already past rev $rev on page $page\n" );
+                       return null;
+               }
+       }
+       
+       function debug( $str ) {
+               wfDebug( $str );
+               //global $dumper;
+               //$dumper->progress( $str );
+       }
+       
+       /**
+        * @access private
+        */
+       function nextPage() { // advance to the next <page> and record its id; state is left untouched at end of input
+               if( $this->skipTo( 'page' ) && $this->skipTo( 'id' ) ) { // the reader is gone once either skip hits EOF
+                       $this->lastPage = intval( $this->nodeContents() );
+                       $this->lastRev = 0;
+               }
+       }
+       
+       /**
+        * @access private
+        */
+       function nextRev() { // advance to the next <revision> and record its id; state is left untouched at end of input
+               if( $this->skipTo( 'revision' ) && $this->skipTo( 'id' ) ) {
+                       $this->lastRev = intval( $this->nodeContents() );
+               }
+       }
+       
+       /**
+        * @access private
+        */
+       function nextText() { // contents of the next <text> element, or null if the input ends first
+               if( !$this->skipTo( 'text' ) ) return null; // EOF: report a miss instead of reading a closed reader
+               return strval( $this->nodeContents() );
+       }
+       
+       /**
+        * @access private
+        */
+       function skipTo( $name ) { // true when positioned on the next <$name> start tag, false at end of input
+               while( !$this->atEnd && $this->reader->read() ) { // atEnd means the reader is already closed and null
+                       if( $this->reader->nodeType == XMLREADER_ELEMENT &&
+                               $this->reader->name == $name ) {
+                               return true;
+                       }
+               }
+               return $this->close();
+       }
+       
+       /**
+        * Shouldn't something like this be built-in to XMLReader?
+        * Fetches text contents of the current element, assuming
+        * no sub-elements or such scary things.
+        * @return string (false if the input ends inside the element)
+        * @access private
+        */
+       function nodeContents() {
+               if( $this->atEnd || $this->reader->isEmptyElement ) { // nothing to read after EOF or in <tag/>
+                       return "";
+               }
+               $buffer = "";
+               while( $this->reader->read() ) {
+                       switch( $this->reader->nodeType ) {
+                       case XMLREADER_TEXT:
+//                     case XMLREADER_WHITESPACE:
+                       case XMLREADER_SIGNIFICANT_WHITESPACE:
+                               $buffer .= $this->reader->value;
+                               break;
+                       case XMLREADER_END_ELEMENT:
+                               return $buffer;
+                       }
+               }
+               return $this->close();
+       }
+       
+       /**
+        * @access private
+        */
+       function close() { // idempotent; always returns false so callers can "return $this->close()"
+               if( !is_null( $this->reader ) ) $this->reader->close(); // may already have been closed by an earlier EOF
+               $this->reader = null;
+               $this->atEnd = true;
+               return false;
+       }
+}
+
+?>
index afa9dba..065eb7e 100644 (file)
@@ -28,202 +28,7 @@ $optionsWithArgs = array( 'server', 'pagelist', 'start', 'end' );
 
 require_once( 'commandLine.inc' );
 require_once( 'SpecialExport.php' );
-
-class BackupDumper {
-       var $reportingInterval = 100;
-       var $reporting = true;
-       var $pageCount = 0;
-       var $revCount  = 0;
-       var $server    = null; // use default
-       var $pages     = null; // all pages
-       var $skipHeader = false; // don't output <mediawiki> and <siteinfo>
-       var $skipFooter = false; // don't output </mediawiki>
-       var $startId    = 0;
-       var $endId      = 0;
-       var $sink       = null; // Output filters
-       
-       function BackupDumper( $args ) {
-               $this->stderr = fopen( "php://stderr", "wt" );
-               $this->sink = $this->processArgs( $args );
-       }
-       
-       /**
-        * @param array $args
-        * @return array
-        * @static
-        */
-       function processArgs( $args ) {
-               $outputTypes = array(
-                       'file'  => 'DumpFileOutput',
-                       'gzip'  => 'DumpGZipOutput',
-                       'bzip2' => 'DumpBZip2Output',
-                       '7zip'  => 'Dump7ZipOutput' );
-               $filterTypes = array(
-                       'latest'    => 'DumpLatestFilter',
-                       'notalk'    => 'DumpNotalkFilter',
-                       'namespace' => 'DumpNamespaceFilter' );
-               $sink = null;
-               $sinks = array();
-               foreach( $args as $arg ) {
-                       if( preg_match( '/^--(.+?)(?:=(.+?)(?::(.+?))?)?$/', $arg, $matches ) ) {
-                               @list( $full, $opt, $val, $param ) = $matches;
-                               switch( $opt ) {
-                               case "output":
-                                       if( !is_null( $sink ) ) {
-                                               $sinks[] = $sink;
-                                       }
-                                       if( !isset( $outputTypes[$val] ) ) {
-                                               die( "Unrecognized output sink type '$val'\n" );
-                                       }
-                                       $type = $outputTypes[$val];
-                                       $sink = new $type( $param );
-                                       break;
-                               case "filter":
-                                       if( is_null( $sink ) ) {
-                                               $this->progress( "Warning: assuming stdout for filter output\n" );
-                                               $sink = new DumpOutput();
-                                       }
-                                       if( !isset( $filterTypes[$val] ) ) {
-                                               die( "Unrecognized filter type '$val'\n" );
-                                       }
-                                       $type = $filterTypes[$val];
-                                       $filter = new $type( $sink, $param );
-                                       
-                                       // references are lame in php...
-                                       unset( $sink );
-                                       $sink = $filter;
-                                       
-                                       break;
-                               default:
-                                       //die( "Unrecognized dump option'$opt'\n" );
-                               }
-                       }
-               }
-               
-               if( is_null( $sink ) ) {
-                       $sink = new DumpOutput();
-               }
-               $sinks[] = $sink;
-               
-               if( count( $sinks ) > 1 ) {
-                       return new DumpMultiWriter( $sinks );
-               } else {
-                       return $sink;
-               }
-       }
-       
-       function dump( $history ) {
-               # This shouldn't happen if on console... ;)
-               header( 'Content-type: text/html; charset=UTF-8' );
-               
-               # Notice messages will foul up your XML output even if they're
-               # relatively harmless.
-               ini_set( 'display_errors', false );
-               
-               $this->startTime = wfTime();
-               
-               $dbr =& wfGetDB( DB_SLAVE );
-               $this->maxCount = $dbr->selectField( 'page', 'MAX(page_id)', '', 'BackupDumper::dump' );
-               $this->startTime = wfTime();
-               
-               $db =& $this->backupDb();
-               $exporter = new WikiExporter( $db, $history, MW_EXPORT_STREAM );
-               
-               $wrapper = new ExportProgressFilter( $this->sink, $this );
-               $exporter->setOutputSink( $wrapper );
-               
-               if( !$this->skipHeader )
-                       $exporter->openStream();
-
-               if( is_null( $this->pages ) ) {
-                       if( $this->startId || $this->endId ) {
-                               $exporter->pagesByRange( $this->startId, $this->endId );
-                       } else {
-                               $exporter->allPages();
-                       }
-               } else {
-                       $exporter->pagesByName( $this->pages );
-               }
-
-               if( !$this->skipFooter )
-                       $exporter->closeStream();
-               
-               $this->report( true );
-       }
-       
-       function &backupDb() {
-               global $wgDBadminuser, $wgDBadminpassword;
-               global $wgDBname;
-               $db =& new Database( $this->backupServer(), $wgDBadminuser, $wgDBadminpassword, $wgDBname );
-               $timeout = 3600 * 24;
-               $db->query( "SET net_read_timeout=$timeout" );
-               $db->query( "SET net_write_timeout=$timeout" );
-               return $db;
-       }
-       
-       function backupServer() {
-               global $wgDBserver;
-               return $this->server
-                       ? $this->server
-                       : $wgDBserver;
-       }
-
-       function reportPage() {
-               $this->pageCount++;
-               $this->report();
-       }
-       
-       function revCount() {
-               $this->revCount++;
-       }
-       
-       function report( $final = false ) {
-               if( $final xor ( $this->pageCount % $this->reportingInterval == 0 ) ) {
-                       $this->showReport();
-               }
-       }
-       
-       function showReport() {
-               if( $this->reporting ) {
-                       $delta = wfTime() - $this->startTime;
-                       $now = wfTimestamp( TS_DB );
-                       if( $delta ) {
-                               $rate = $this->pageCount / $delta;
-                               $revrate = $this->revCount / $delta;
-                               $portion = $this->pageCount / $this->maxCount;
-                               $eta = $this->startTime + $delta / $portion;
-                               $etats = wfTimestamp( TS_DB, intval( $eta ) );
-                       } else {
-                               $rate = '-';
-                               $revrate = '-';
-                               $etats = '-';
-                       }
-                       global $wgDBname;
-                       $this->progress( "$now: $wgDBname $this->pageCount, ETA $etats ($rate pages/sec $revrate revs/sec)" );
-               }
-       }
-       
-       function progress( $string ) {
-               fwrite( $this->stderr, $string . "\n" );
-       }
-}
-
-class ExportProgressFilter extends DumpFilter {
-       function ExportProgressFilter( &$sink, &$progress ) {
-               parent::DumpFilter( $sink );
-               $this->progress = $progress;
-       }
-
-       function writeClosePage( $string ) {
-               parent::writeClosePage( $string );
-               $this->progress->reportPage();
-       }
-       
-       function writeRevision( $rev, $string ) {
-               parent::writeRevision( $rev, $string );
-               $this->progress->revCount();
-       }
-}
+require_once( 'maintenance/backup.inc' );
 
 $dumper = new BackupDumper( $argv );
 
@@ -259,10 +64,12 @@ if( isset( $options['end'] ) ) {
 $dumper->skipHeader = isset( $options['skip-header'] );
 $dumper->skipFooter = isset( $options['skip-footer'] );
 
+$textMode = isset( $options['stub'] ) ? MW_EXPORT_STUB : MW_EXPORT_TEXT;
+
 if( isset( $options['full'] ) ) {
-       $dumper->dump( MW_EXPORT_FULL );
+       $dumper->dump( MW_EXPORT_FULL, $textMode );
 } elseif( isset( $options['current'] ) ) {
-       $dumper->dump( MW_EXPORT_CURRENT );
+       $dumper->dump( MW_EXPORT_CURRENT, $textMode );
 } else {
        $dumper->progress( <<<END
 This script dumps the wiki page database into an XML interchange wrapper
@@ -284,6 +91,7 @@ Options:
   --end=n     Stop before page_id n (exclusive)
   --skip-header Don't output the <mediawiki> header
   --skip-footer Don't output the </mediawiki> footer
+  --stub      Don't perform old_text lookups; for 2-pass dump
 END
 );
 }
diff --git a/maintenance/dumpTextPass.php b/maintenance/dumpTextPass.php
new file mode 100644 (file)
index 0000000..bb1b194
--- /dev/null
@@ -0,0 +1,202 @@
+<?php
+/**
+ * Copyright (C) 2005 Brion Vibber <brion@pobox.com>
+ * http://www.mediawiki.org/
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or 
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @package MediaWiki
+ * @subpackage Maintenance
+ */
+
+$originalDir = getcwd();
+
+$optionsWithArgs = array( 'server', 'pagelist', 'start', 'end' );
+
+require_once( 'commandLine.inc' );
+require_once( 'SpecialExport.php' );
+require_once( 'maintenance/backup.inc' );
+
+/**
+ * Second pass of a two-pass dump: parses a stub XML dump and re-emits it
+ * with every <text id="..."> placeholder filled in with the revision text
+ * looked up by that id.
+ */
+class TextPassDumper extends BackupDumper {
+       /** Prior-dump text source created for the 'prefetch' option; null when unused. */
+       var $prefetch = null;
+
+       /**
+        * Read the stub dump from stdin, run it through readDump() and write
+        * the completed XML to the progress-reporting output filter.
+        */
+       function dump() {
+               # This shouldn't happen if on console... ;)
+               header( 'Content-type: text/html; charset=UTF-8' );
+
+               # Notice messages will foul up your XML output even if they're
+               # relatively harmless.
+//             ini_set( 'display_errors', false );
+
+               $this->db =& wfGetDB( DB_SLAVE );
+               $this->maxCount = $this->db->selectField( 'page', 'MAX(page_id)', '', 'BackupDumper::dump' );
+               # Start the clock only once setup is done (it used to be set twice).
+               $this->startTime = wfTime();
+
+               $this->egress = new ExportProgressFilter( $this->sink, $this );
+
+               $input = fopen( "php://stdin", "rt" );
+               if( $input === false ) {
+                       $this->progress( "Unable to open php://stdin for reading" );
+               } else {
+                       $result = $this->readDump( $input );
+                       fclose( $input );
+
+                       if( WikiError::isError( $result ) ) {
+                               $this->progress( $result->getMessage() );
+                       }
+               }
+
+               $this->report( true );
+       }
+
+       /**
+        * Handle options specific to this dumper.
+        * NOTE(review): presumably invoked by BackupDumper while it parses
+        * the argument list -- confirm against backup.inc.
+        *
+        * @param string $opt   option name
+        * @param string $val   option value; for 'prefetch', passed to BaseDump
+        * @param string $param unused here
+        */
+       function processOption( $opt, $val, $param ) {
+               if( $opt == 'prefetch' ) {
+                       require_once 'maintenance/backupPrefetch.inc';
+                       $this->prefetch = new BaseDump( $val );
+               }
+       }
+
+       /**
+        * Stream-parse an XML dump from an open handle, feeding the SAX-style
+        * handlers of this object in 512 KiB chunks.
+        *
+        * @param resource $input readable stream positioned at the start of the dump
+        * @return mixed a WikiXmlError on parse failure, otherwise nothing
+        */
+       function readDump( $input ) {
+               $this->buffer = "";
+               $this->openElement = false;
+               $this->atStart = true;
+               $this->state = "";
+               $this->lastName = "";
+               $this->thisPage = 0;
+               $this->thisRev = 0;
+               # Names of the currently open elements, outermost first.
+               $this->elementStack = array();
+               # Character data collected for the <id> element being read.
+               $this->idText = "";
+
+               $parser = xml_parser_create( "UTF-8" );
+               xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );
+
+               xml_set_element_handler( $parser, array( &$this, 'startElement' ), array( &$this, 'endElement' ) );
+               xml_set_character_data_handler( $parser, array( &$this, 'characterData' ) );
+
+               $offset = 0; // for context extraction on error reporting
+               $bufferSize = 512 * 1024;
+               do {
+                       $chunk = fread( $input, $bufferSize );
+                       if( !xml_parse( $parser, $chunk, feof( $input ) ) ) {
+                               wfDebug( "TextPassDumper::readDump encountered XML parsing error\n" );
+                               return new WikiXmlError( $parser, 'XML import parse failure', $chunk, $offset );
+                       }
+                       $offset += strlen( $chunk );
+               } while( $chunk !== false && !feof( $input ) );
+               xml_parser_free( $parser );
+       }
+
+       /**
+        * Fetch the text for one revision. The prefetch source, if any, is
+        * asked first using the current page and revision ids; otherwise the
+        * text table row with the given old_id is loaded.
+        *
+        * @param mixed $id value of the stub's <text id="..."> attribute
+        * @return string revision text
+        */
+       function getText( $id ) {
+               if( isset( $this->prefetch ) ) {
+                       $text = $this->prefetch->prefetch( $this->thisPage, $this->thisRev );
+                       if( !is_null( $text ) )
+                               return $text;
+               }
+               $id = intval( $id );
+               $row = $this->db->selectRow( 'text',
+                       array( 'old_text', 'old_flags' ),
+                       array( 'old_id' => $id ),
+                       'TextPassDumper::getText' );
+               return UtfNormal::cleanUp( strval( Revision::getRevisionText( $row ) ) );
+       }
+
+       /**
+        * XML parser callback for an opening tag. Flushes buffered output at
+        * page/revision boundaries and replaces stub <text id="..."> elements
+        * with <text xml:space="preserve"> carrying the fetched text.
+        */
+       function startElement( $parser, $name, $attribs ) {
+               $this->clearOpenElement( null );
+               $this->lastName = $name;
+               $this->elementStack[] = $name;
+
+               if( $name == 'revision' ) {
+                       $this->state = $name;
+                       # Forget the previous revision's id until this one's <id> is read.
+                       $this->thisRev = 0;
+                       $this->egress->writeOpenPage( null, $this->buffer );
+                       $this->buffer = "";
+               } elseif( $name == 'page' ) {
+                       $this->state = $name;
+                       $this->thisPage = 0;
+                       if( $this->atStart ) {
+                               $this->egress->writeOpenStream( $this->buffer );
+                               $this->buffer = "";
+                               $this->atStart = false;
+                       }
+               } elseif( $name == 'id' ) {
+                       $this->idText = "";
+               }
+
+               if( $name == "text" && isset( $attribs['id'] ) ) {
+                       $text = $this->getText( $attribs['id'] );
+                       $this->openElement = array( $name, array( 'xml:space' => 'preserve' ) );
+                       if( strlen( $text ) > 0 ) {
+                               $this->characterData( $parser, $text );
+                       }
+               } else {
+                       $this->openElement = array( $name, $attribs );
+               }
+       }
+
+       /**
+        * XML parser callback for a closing tag. Emits the close tag (or the
+        * still-pending open tag as an empty element), records page/revision
+        * ids, and hands finished revisions/pages/streams to the output filter.
+        */
+       function endElement( $parser, $name ) {
+               if( $this->openElement ) {
+                       $this->clearOpenElement( "" );
+               } else {
+                       $this->buffer .= "</$name>";
+               }
+
+               array_pop( $this->elementStack );
+
+               if( $name == 'id' ) {
+                       # Only an <id> sitting directly inside <page> or <revision>
+                       # identifies it; an <id> nested deeper (e.g. under
+                       # <contributor>) must not overwrite the revision id.
+                       $depth = count( $this->elementStack );
+                       $parent = ( $depth > 0 ) ? $this->elementStack[$depth - 1] : '';
+                       if( $parent == 'revision' ) {
+                               $this->thisRev = intval( $this->idText );
+                       } elseif( $parent == 'page' ) {
+                               $this->thisPage = intval( $this->idText );
+                       }
+                       $this->idText = "";
+               }
+               # Character data after a closing tag belongs to the parent element,
+               # so it must not be attributed to the element that just ended.
+               $this->lastName = "";
+
+               if( $name == 'revision' ) {
+                       $this->egress->writeRevision( null, $this->buffer );
+                       $this->buffer = "";
+               } elseif( $name == 'page' ) {
+                       $this->egress->writeClosePage( $this->buffer );
+                       $this->buffer = "";
+               } elseif( $name == 'mediawiki' ) {
+                       $this->egress->writeCloseStream( $this->buffer );
+                       $this->buffer = "";
+               }
+       }
+
+       /**
+        * XML parser callback for character data; also called directly by
+        * startElement() to inject fetched revision text. The data is escaped
+        * and appended to the output buffer.
+        */
+       function characterData( $parser, $data ) {
+               $this->clearOpenElement( null );
+               if( $this->lastName == "id" ) {
+                       # The parser may deliver one text node in several pieces, so
+                       # collect them; endElement() turns the result into an id.
+                       $this->idText .= $data;
+               }
+               $this->buffer .= htmlspecialchars( $data );
+       }
+
+       /**
+        * Write out the pending opening tag, if any, to the buffer.
+        *
+        * @param mixed $style third argument for wfElement(): null when content
+        *                     follows, "" when the element is closed while empty
+        */
+       function clearOpenElement( $style ) {
+               if( $this->openElement ) {
+                       $this->buffer .= wfElement( $this->openElement[0], $this->openElement[1], $style );
+                       $this->openElement = false;
+               }
+       }
+}
+
+
+$dumper = new TextPassDumper( $argv );
+
+# The usage text used to sit behind "if( true ) ... else" and could never be
+# shown. Print it on --help; every other invocation runs the text pass.
+# NOTE(review): $options is presumably filled in by commandLine.inc, as in
+# dumpBackup.php -- confirm. If it is not set, isset() is false and the dump
+# runs exactly as before.
+if( isset( $options['help'] ) ) {
+       $dumper->progress( <<<END
+This script postprocesses XML dumps from dumpBackup.php to add
+page text which was stubbed out (using --stub).
+
+XML input is accepted on stdin.
+XML output is sent to stdout; progress reports are sent to stderr.
+
+Usage: php dumpTextPass.php [<options>]
+Options:
+  --prefetch <file>  Use a prior dump file as a text source where possible.
+              (Requires PHP 5.0+ and the XMLReader PECL extension)
+  --quiet     Don't dump status reports to stderr.
+  --report=n  Report position and speed after every n pages processed.
+              (Default: 100)
+  --help      Show this usage message.
+END
+);
+} else {
+       $dumper->dump();
+}
+
+?>