From 6adbb42bf147465145d0f817028176b0da58db35 Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Sun, 2 Oct 2005 04:05:40 +0000 Subject: [PATCH] * Added filter options, compression piping, and multiple output streams for dumpBackup.php --- RELEASE-NOTES | 2 + includes/Export.php | 675 +++++++++++++++++++++++++++++++++++++ includes/SpecialExport.php | 375 +-------------------- maintenance/dumpBackup.php | 98 +++++- 4 files changed, 770 insertions(+), 380 deletions(-) create mode 100644 includes/Export.php diff --git a/RELEASE-NOTES b/RELEASE-NOTES index 76cad8f3cb..fa797aae8a 100644 --- a/RELEASE-NOTES +++ b/RELEASE-NOTES @@ -124,6 +124,8 @@ fully support the editing toolbar, but was found to be too confusing. * (bug 3503) Update LanguageSq.php from sq.wikipedia.org messages * Added EditFilter hook, and output callback on EditPage::showEditForm() for a place to add in captcha-type extensions in the edit flow +* Added filter options, compression piping, and multiple output streams for + dumpBackup.php === Caveats === diff --git a/includes/Export.php b/includes/Export.php new file mode 100644 index 0000000000..7f34a80f0a --- /dev/null +++ b/includes/Export.php @@ -0,0 +1,675 @@ + +# http://www.mediawiki.org/ +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+# http://www.gnu.org/copyleft/gpl.html +/** + * + * @package MediaWiki + * @subpackage SpecialPage + */ + +/** */ +require_once( 'Revision.php' ); + +define( 'MW_EXPORT_FULL', 0 ); +define( 'MW_EXPORT_CURRENT', 1 ); + +define( 'MW_EXPORT_BUFFER', 0 ); +define( 'MW_EXPORT_STREAM', 1 ); + + +/** + * @package MediaWiki + * @subpackage SpecialPage + */ +class WikiExporter { + /** + * If using MW_EXPORT_STREAM to stream a large amount of data, + * provide a database connection which is not managed by + * LoadBalancer to read from: some history blob types will + * make additional queries to pull source data while the + * main query is still running. + * + * @param Database $db + * @param int $history one of MW_EXPORT_FULL or MW_EXPORT_CURRENT + * @param int $buffer one of MW_EXPORT_BUFFER or MW_EXPORT_STREAM + */ + function WikiExporter( &$db, $history = MW_EXPORT_CURRENT, + $buffer = MW_EXPORT_BUFFER ) { + $this->db =& $db; + $this->history = $history; + $this->buffer = $buffer; + $this->writer = new XmlDumpWriter(); + $this->sink = new DumpOutput(); + } + + /** + * Set the DumpOutput or DumpFilter object which will receive + * various row objects and XML output for filtering. Filters + * can be chained or used as callbacks. + * + * @param mixed $callback + */ + function setOutputSink( &$sink ) { + $this->sink =& $sink; + } + + function openStream() { + $output = $this->writer->openStream(); + $this->sink->writeOpenStream( $output ); + } + + function closeStream() { + $output = $this->writer->closeStream(); + $this->sink->writeCloseStream( $output ); + } + + /** + * Dumps a series of page and revision records for all pages + * in the database, either including complete history or only + * the most recent version. + */ + function allPages() { + return $this->dumpFrom( '' ); + } + + /** + * Dumps a series of page and revision records for those pages + * in the database falling within the page_id range given. 
+ * @param int $start Inclusive lower limit (this id is included) + * @param int $end Exclusive upper limit (this id is not included) + * If 0, no upper limit. + */ + function pagesByRange( $start, $end ) { + $condition = 'page_id >= ' . intval( $start ); + if( $end ) { + $condition .= ' AND page_id < ' . intval( $end ); + } + return $this->dumpFrom( $condition ); + } + + /** + * @param Title $title + */ + function pageByTitle( $title ) { + return $this->dumpFrom( + 'page_namespace=' . $title->getNamespace() . + ' AND page_title=' . $this->db->addQuotes( $title->getDbKey() ) ); + } + + function pageByName( $name ) { + $title = Title::newFromText( $name ); + if( is_null( $title ) ) { + return new WikiError( "Can't export invalid title" ); + } else { + return $this->pageByTitle( $title ); + } + } + + function pagesByName( $names ) { + foreach( $names as $name ) { + $this->pageByName( $name ); + } + } + + + // -------------------- private implementation below -------------------- + + function dumpFrom( $cond = '' ) { + $fname = 'WikiExporter::dumpFrom'; + wfProfileIn( $fname ); + + $page = $this->db->tableName( 'page' ); + $revision = $this->db->tableName( 'revision' ); + $text = $this->db->tableName( 'text' ); + + if( $this->history == MW_EXPORT_FULL ) { + $join = 'page_id=rev_page'; + } elseif( $this->history == MW_EXPORT_CURRENT ) { + $join = 'page_id=rev_page AND page_latest=rev_id'; + } else { + wfProfileOut( $fname ); + return new WikiError( "$fname given invalid history dump type." ); + } + $where = ( $cond == '' ) ? 
'' : "$cond AND"; + + if( $this->buffer == MW_EXPORT_STREAM ) { + $prev = $this->db->bufferResults( false ); + } + if( $cond == '' ) { + // Optimization hack for full-database dump + $pageindex = 'FORCE INDEX (PRIMARY)'; + $revindex = 'FORCE INDEX(page_timestamp)'; + } else { + $pageindex = ''; + $revindex = ''; + } + $result = $this->db->query( + "SELECT * FROM + $page $pageindex, + $revision $revindex, + $text + WHERE $where $join AND rev_text_id=old_id + ORDER BY page_id", $fname ); + $wrapper = $this->db->resultObject( $result ); + $this->outputStream( $wrapper ); + + if( $this->buffer == MW_EXPORT_STREAM ) { + $this->db->bufferResults( $prev ); + } + + wfProfileOut( $fname ); + } + + /** + * Runs through a query result set dumping page and revision records. + * The result set should be sorted/grouped by page to avoid duplicate + * page records in the output. + * + * The result set will be freed once complete. Should be safe for + * streaming (non-buffered) queries, as long as it was made on a + * separate database connection not managed by LoadBalancer; some + * blob storage types will make queries to pull source data. + * + * @param ResultWrapper $resultset + * @access private + */ + function outputStream( $resultset ) { + $last = null; + while( $row = $resultset->fetchObject() ) { + if( is_null( $last ) || + $last->page_namespace != $row->page_namespace || + $last->page_title != $row->page_title ) { + if( isset( $last ) ) { + $output = $this->writer->closePage(); + $this->sink->writeClosePage( $output ); + } + $output = $this->writer->openPage( $row ); + $this->sink->writeOpenPage( $row, $output ); + $last = $row; + } + $output = $this->writer->writeRevision( $row ); + $this->sink->writeRevision( $row, $output ); + } + if( isset( $last ) ) { + $output = $this->writer->closePage(); + $this->sink->writeClosePage( $output ); + } + $resultset->free(); + } +} + +class XmlDumpWriter { + + /** + * Returns the export schema version. 
+	 * @return string
+	 */
+	function schemaVersion() {
+		return "0.3";
+	}
+
+	/**
+	 * Opens the XML output stream's root element.
+	 * This does not include an xml directive, so is safe to include
+	 * as a subelement in a larger XML stream. Namespace and XML Schema
+	 * references are included.
+	 *
+	 * Output will be encoded in UTF-8.
+	 *
+	 * The returned string is the root element's opening tag followed
+	 * by the <siteinfo> block from siteInfo(); the root element is
+	 * closed later by closeStream().
+	 *
+	 * @return string
+	 */
+	function openStream() {
+		global $wgContLanguageCode;
+		$ver = $this->schemaVersion();
+		return wfElement( 'mediawiki', array(
+			'xmlns'              => "http://www.mediawiki.org/xml/export-$ver/",
+			'xmlns:xsi'          => "http://www.w3.org/2001/XMLSchema-instance",
+			'xsi:schemaLocation' => "http://www.mediawiki.org/xml/export-$ver/ " .
+			                        "http://www.mediawiki.org/xml/export-$ver.xsd",
+			'version'            => $ver,
+			'xml:lang'           => $wgContLanguageCode ),
+			null ) .
+			"\n" .
+			$this->siteInfo();
+	}
+
+	/**
+	 * Builds the <siteinfo> header block: site name, base URL,
+	 * generator, title case setting and the namespace list, one
+	 * item per line.
+	 *
+	 * @return string
+	 */
+	function siteInfo() {
+		$info = array(
+			$this->sitename(),
+			$this->homelink(),
+			$this->generator(),
+			$this->caseSetting(),
+			$this->namespaces() );
+		return " <siteinfo>\n " .
+			implode( "\n ", $info ) .
+			"\n </siteinfo>\n";
+	}
+
+	/**
+	 * @return string 'sitename' element holding $wgSitename
+	 */
+	function sitename() {
+		global $wgSitename;
+		return wfElement( 'sitename', array(), $wgSitename );
+	}
+
+	/**
+	 * @return string 'generator' element holding "MediaWiki $wgVersion"
+	 */
+	function generator() {
+		global $wgVersion;
+		return wfElement( 'generator', array(), "MediaWiki $wgVersion" );
+	}
+
+	/**
+	 * @return string 'base' element holding the full URL of the page
+	 *  named by the 'mainpage' content message
+	 */
+	function homelink() {
+		$page = Title::newFromText( wfMsgForContent( 'mainpage' ) );
+		return wfElement( 'base', array(), $page->getFullUrl() );
+	}
+
+	/**
+	 * @return string 'case' element: 'first-letter' when $wgCapitalLinks
+	 *  is set, otherwise 'case-sensitive'
+	 */
+	function caseSetting() {
+		global $wgCapitalLinks;
+		// "case-insensitive" option is reserved for future
+		$sensitivity = $wgCapitalLinks ? 'first-letter' : 'case-sensitive';
+		return wfElement( 'case', array(), $sensitivity );
+	}
+
+	/**
+	 * Builds the <namespaces> block with one 'namespace' element per
+	 * entry of the content language's formatted namespace list; the
+	 * namespace number goes in the 'key' attribute.
+	 *
+	 * @return string
+	 */
+	function namespaces() {
+		global $wgContLang;
+		$spaces = " <namespaces>\n";
+		foreach( $wgContLang->getFormattedNamespaces() as $ns => $title ) {
+			$spaces .= ' ' . wfElement( 'namespace', array( 'key' => $ns ), $title ) . "\n";
+		}
+		$spaces .= " </namespaces>";
+		return $spaces;
+	}
+
+	/**
+	 * Closes the output stream with the closing root element.
+	 * Call when finished dumping things.
+	 *
+	 * @return string
+	 */
+	function closeStream() {
+		// Must match the 'mediawiki' element opened in openStream().
+		return "</mediawiki>\n";
+	}
+
+
+	/**
+	 * Opens a <page> section on the output stream, with data
+	 * from the given database row.
+	 *
+	 * The <restrictions> child is only written when the row's
+	 * page_restrictions field is non-empty.
+	 *
+	 * @param object $row
+	 * @return string
+	 * @access private
+	 */
+	function openPage( $row ) {
+		$out = " <page>\n";
+		$title = Title::makeTitle( $row->page_namespace, $row->page_title );
+		$out .= ' ' . wfElementClean( 'title', array(), $title->getPrefixedText() ) . "\n";
+		$out .= ' ' . wfElement( 'id', array(), strval( $row->page_id ) ) . "\n";
+		if( '' != $row->page_restrictions ) {
+			$out .= ' ' . wfElement( 'restrictions', array(),
+				strval( $row->page_restrictions ) ) . "\n";
+		}
+		return $out;
+	}
+
+	/**
+	 * Closes a <page> section on the output stream.
+	 *
+	 * @return string
+	 * @access private
+	 */
+	function closePage() {
+		return " </page>\n";
+	}
+
+	/**
+	 * Dumps a <revision> section on the output stream, with
+	 * data filled in from the given database row.
+	 *
+	 * The <contributor> child holds username + id when rev_user is
+	 * non-zero and an ip element otherwise; <minor/> and the comment
+	 * element are only written when the row has them.
+	 *
+	 * @param object $row
+	 * @return string
+	 * @access private
+	 */
+	function writeRevision( $row ) {
+		// Profiling label kept as-is so existing profile data stays comparable.
+		$fname = 'WikiExporter::dumpRev';
+		wfProfileIn( $fname );
+
+		$out = " <revision>\n";
+		$out .= " " . wfElement( 'id', null, strval( $row->rev_id ) ) . "\n";
+
+		$ts = wfTimestamp2ISO8601( strval( $row->rev_timestamp ) );
+		$out .= " " . wfElement( 'timestamp', null, $ts ) . "\n";
+
+		$out .= " <contributor>\n";
+		if( $row->rev_user ) {
+			$out .= " " . wfElementClean( 'username', null, strval( $row->rev_user_text ) ) . "\n";
+			$out .= " " . wfElement( 'id', null, strval( $row->rev_user ) ) . "\n";
+		} else {
+			$out .= " " . wfElementClean( 'ip', null, strval( $row->rev_user_text ) ) . "\n";
+		}
+		$out .= " </contributor>\n";
+
+		if( $row->rev_minor_edit ) {
+			$out .= " <minor/>\n";
+		}
+		if( $row->rev_comment != '' ) {
+			$out .= " " . wfElementClean( 'comment', null, strval( $row->rev_comment ) ) . "\n";
+		}
+
+		$text = strval( Revision::getRevisionText( $row ) );
+		$out .= " " . wfElementClean( 'text',
+			array( 'xml:space' => 'preserve' ),
+			strval( $text ) ) . "\n";
+
+		$out .= " </revision>\n";
+
+		wfProfileOut( $fname );
+		return $out;
+	}
+
+}
+
+
+/**
+ * Base class for output stream; prints to stdout or buffer or wherever.
+ *
+ * Every write*() event simply forwards its pre-formatted XML string to
+ * write(); the row objects passed alongside are ignored at this level.
+ */
+class DumpOutput {
+	function writeOpenStream( $string ) {
+		$this->write( $string );
+	}
+
+	function writeCloseStream( $string ) {
+		$this->write( $string );
+	}
+
+	function writeOpenPage( $page, $string ) {
+		$this->write( $string );
+	}
+
+	function writeClosePage( $string ) {
+		$this->write( $string );
+	}
+
+	function writeRevision( $rev, $string ) {
+		$this->write( $string );
+	}
+
+	/**
+	 * Override to write to a different stream type.
+	 * The base implementation prints the string and returns nothing.
+	 *
+	 * @param string $string
+	 */
+	function write( $string ) {
+		print $string;
+	}
+}
+
+/**
+ * Stream outputter to send data to a file.
+ */
+class DumpFileOutput extends DumpOutput {
+	var $handle;
+
+	/**
+	 * @param string $file path opened for writing ("wt" mode)
+	 */
+	function DumpFileOutput( $file ) {
+		$this->handle = fopen( $file, "wt" );
+	}
+
+	function write( $string ) {
+		fputs( $this->handle, $string );
+	}
+}
+
+/**
+ * Stream outputter to send data to a file via some filter program.
+ * Even if compression is available in a library, using a separate
+ * program can allow us to make use of a multi-processor system.
+ */
+class DumpPipeOutput extends DumpFileOutput {
+	/**
+	 * @param string $command shell command that reads the dump on stdin
+	 * @param string $file if given, the command's stdout is redirected
+	 *  to this (shell-escaped) path
+	 */
+	function DumpPipeOutput( $command, $file = null ) {
+		if( !is_null( $file ) ) {
+			$command .= " > " . wfEscapeShellArg( $file );
+		}
+		$this->handle = popen( $command, "w" );
+	}
+}
+
+/**
+ * Sends dump output via the gzip compressor.
+ */
+class DumpGZipOutput extends DumpPipeOutput {
+	function DumpGZipOutput( $file ) {
+		parent::DumpPipeOutput( "gzip", $file );
+	}
+}
+
+/**
+ * Sends dump output via the bzip2 compressor.
+ */
+class DumpBZip2Output extends DumpPipeOutput {
+	function DumpBZip2Output( $file ) {
+		parent::DumpPipeOutput( "bzip2", $file );
+	}
+}
+
+/**
+ * Sends dump output via the p7zip compressor.
+ */
+class Dump7ZipOutput extends DumpPipeOutput {
+	/**
+	 * @param string $file archive path handed (shell-escaped) to 7za;
+	 *  the dump itself is fed to 7za on stdin ("-si")
+	 */
+	function Dump7ZipOutput( $file ) {
+		$command = "7za a -si " . wfEscapeShellArg( $file );
+		parent::DumpPipeOutput( $command );
+	}
+}
+
+
+
+/**
+ * Dump output filter class.
+ * This just does output filtering and streaming; XML formatting is done
+ * higher up, so be careful in what you do.
+ *
+ * A filter wraps another sink. Stream open/close events always pass
+ * through; page and revision events are only forwarded while the
+ * current page was accepted by pass().
+ */
+class DumpFilter {
+	// False until pass() accepts a page, so revision/close events that
+	// arrive with no accepted page open are dropped instead of reading
+	// an undefined property.
+	var $sendingThisPage = false;
+
+	/**
+	 * @param object $sink downstream DumpOutput or DumpFilter, held by reference
+	 */
+	function DumpFilter( &$sink ) {
+		$this->sink =& $sink;
+	}
+
+	function writeOpenStream( $string ) {
+		$this->sink->writeOpenStream( $string );
+	}
+
+	function writeCloseStream( $string ) {
+		$this->sink->writeCloseStream( $string );
+	}
+
+	function writeOpenPage( $page, $string ) {
+		$this->sendingThisPage = $this->pass( $page, $string );
+		if( $this->sendingThisPage ) {
+			$this->sink->writeOpenPage( $page, $string );
+		}
+	}
+
+	function writeClosePage( $string ) {
+		if( $this->sendingThisPage ) {
+			$this->sink->writeClosePage( $string );
+			$this->sendingThisPage = false;
+		}
+	}
+
+	function writeRevision( $rev, $string ) {
+		if( $this->sendingThisPage ) {
+			$this->sink->writeRevision( $rev, $string );
+		}
+	}
+
+	/**
+	 * Override for page-based filter types.
+	 *
+	 * @param object $page page row about to be opened
+	 * @param string $string pre-formatted XML for the page opening
+	 * @return bool true to forward this page and its revisions
+	 */
+	function pass( $page, $string ) {
+		return true;
+	}
+}
+
+/**
+ * Simple dump output filter to exclude all talk pages.
+ */
+class DumpNotalkFilter extends DumpFilter {
+	/**
+	 * @param object $page page row; only page_namespace is read
+	 * @return bool true for pages outside talk namespaces
+	 */
+	function pass( $page ) {
+		// pass() answers "keep this page?", so talk pages must yield false.
+		return !Namespace::isTalk( $page->page_namespace );
+	}
+}
+
+/**
+ * Dump output filter to include or exclude pages in a given set of namespaces.
+ */
+class DumpNamespaceFilter extends DumpFilter {
+	var $invert = false;
+	// Set of selected namespace numbers, stored as array keys.
+	var $namespaces = array();
+
+	/**
+	 * @param object $sink downstream sink, passed on to DumpFilter
+	 * @param string $param comma-separated namespace list; entries may
+	 *  be NS_* constant names or numeric ids, unrecognized entries are
+	 *  ignored. A leading "!" inverts the filter so the listed
+	 *  namespaces are excluded instead of included.
+	 */
+	function DumpNamespaceFilter( &$sink, $param ) {
+		parent::DumpFilter( $sink );
+
+		$constants = array(
+			"NS_MAIN"           => NS_MAIN,
+			"NS_TALK"           => NS_TALK,
+			"NS_USER"           => NS_USER,
+			"NS_USER_TALK"      => NS_USER_TALK,
+			"NS_PROJECT"        => NS_PROJECT,
+			"NS_PROJECT_TALK"   => NS_PROJECT_TALK,
+			"NS_IMAGE"          => NS_IMAGE,
+			"NS_IMAGE_TALK"     => NS_IMAGE_TALK,
+			"NS_MEDIAWIKI"      => NS_MEDIAWIKI,
+			"NS_MEDIAWIKI_TALK" => NS_MEDIAWIKI_TALK,
+			"NS_TEMPLATE"       => NS_TEMPLATE,
+			"NS_TEMPLATE_TALK"  => NS_TEMPLATE_TALK,
+			"NS_HELP"           => NS_HELP,
+			"NS_HELP_TALK"      => NS_HELP_TALK,
+			"NS_CATEGORY"       => NS_CATEGORY,
+			"NS_CATEGORY_TALK"  => NS_CATEGORY_TALK );
+
+		// The parameter is null when the option was given without a
+		// ":list" part; normalize so the prefix test below cannot index
+		// into an empty value.
+		$param = strval( $param );
+		if( substr( $param, 0, 1 ) == '!' ) {
+			$this->invert = true;
+			$param = substr( $param, 1 );
+		}
+
+		foreach( explode( ',', $param ) as $key ) {
+			$key = trim( $key );
+			if( isset( $constants[$key] ) ) {
+				$ns = $constants[$key];
+				$this->namespaces[$ns] = true;
+			} elseif( is_numeric( $key ) ) {
+				$ns = intval( $key );
+				$this->namespaces[$ns] = true;
+			}
+		}
+	}
+
+	/**
+	 * @param object $page page row; only page_namespace is read
+	 * @return bool true when the page's namespace is in the selected
+	 *  set, or when it is not and the filter is inverted
+	 */
+	function pass( $page ) {
+		$match = isset( $this->namespaces[$page->page_namespace] );
+		return $this->invert xor $match;
+	}
+}
+
+
+/**
+ * Dump output filter to include only the last revision in each page sequence.
+ */
+class DumpLatestFilter extends DumpFilter {
+	var $page, $pageString, $rev, $revString;
+
+	/**
+	 * Holds the page opening back instead of forwarding it; it is only
+	 * sent on from writeClosePage() once a matching revision was seen.
+	 */
+	function writeOpenPage( $page, $string ) {
+		$this->page = $page;
+		$this->pageString = $string;
+	}
+
+	/**
+	 * Emits the buffered page opening, the remembered revision and the
+	 * page closing to the sink if a revision was kept, then clears the
+	 * per-page state.
+	 */
+	function writeClosePage( $string ) {
+		if( $this->rev ) {
+			$this->sink->writeOpenPage( $this->page, $this->pageString );
+			$this->sink->writeRevision( $this->rev, $this->revString );
+			$this->sink->writeClosePage( $string );
+		}
+		$this->rev = null;
+		$this->revString = null;
+		$this->page = null;
+		$this->pageString = null;
+	}
+
+	/**
+	 * Remembers the revision whose rev_id equals the buffered page's
+	 * page_latest; every other revision is dropped.
+	 */
+	function writeRevision( $rev, $string ) {
+		if( $rev->rev_id == $this->page->page_latest ) {
+			$this->rev = $rev;
+			$this->revString = $string;
+		}
+	}
+}
+
+/**
+ * Output sink that fans out: every stream, page and revision event is
+ * forwarded, in order, to each sink in the list given at construction.
+ */
+class DumpMultiWriter {
+	/**
+	 * @param array $sinks list of sink objects, indexed from 0
+	 */
+	function DumpMultiWriter( $sinks ) {
+		$this->sinks = $sinks;
+		$this->count = count( $sinks );
+	}
+
+	function writeOpenStream( $string ) {
+		for( $i = 0; $i < $this->count; $i++ ) {
+			$this->sinks[$i]->writeOpenStream( $string );
+		}
+	}
+
+	function writeCloseStream( $string ) {
+		for( $i = 0; $i < $this->count; $i++ ) {
+			$this->sinks[$i]->writeCloseStream( $string );
+		}
+	}
+
+	function writeOpenPage( $page, $string ) {
+		for( $i = 0; $i < $this->count; $i++ ) {
+			$this->sinks[$i]->writeOpenPage( $page, $string );
+		}
+	}
+
+	function writeClosePage( $string ) {
+		for( $i = 0; $i < $this->count; $i++ ) {
+			$this->sinks[$i]->writeClosePage( $string );
+		}
+	}
+
+	function writeRevision( $rev, $string ) {
+		for( $i = 0; $i < $this->count; $i++ ) {
+			$this->sinks[$i]->writeRevision( $rev, $string );
+		}
+	}
+}
+
+
+
+/**
+ * Reformats a timestamp as ISO 8601 with a literal "Z" suffix.
+ * The input is first passed through wfTimestamp( TS_MW, ... ) and the
+ * resulting 14 characters are regrouped into date and time fields.
+ *
+ * @param string $ts timestamp in any form wfTimestamp() accepts
+ * @return string e.g. 2003-08-05T18:30:02Z
+ */
+function wfTimestamp2ISO8601( $ts ) {
+	#2003-08-05T18:30:02Z
+	return preg_replace( '/^(....)(..)(..)(..)(..)(..)$/', '$1-$2-$3T$4:$5:$6Z', wfTimestamp( TS_MW, $ts ) );
+}
+
+function xmlsafe( $string ) {
+	$fname = 'xmlsafe';
+	wfProfileIn( $fname );
+
+	/**
+	 * The page may contain old data which has not been properly normalized.
+ * Invalid UTF-8 sequences or forbidden control characters will make our + * XML output invalid, so be sure to strip them out. + */ + $string = UtfNormal::cleanUp( $string ); + + $string = htmlspecialchars( $string ); + wfProfileOut( $fname ); + return $string; +} + +?> diff --git a/includes/SpecialExport.php b/includes/SpecialExport.php index 861addcc6c..99d02c5866 100644 --- a/includes/SpecialExport.php +++ b/includes/SpecialExport.php @@ -24,6 +24,7 @@ /** */ require_once( 'Revision.php' ); +require_once( 'Export.php' ); /** * @@ -67,378 +68,4 @@ function wfSpecialExport( $page = '' ) { " ); } -define( 'MW_EXPORT_FULL', 0 ); -define( 'MW_EXPORT_CURRENT', 1 ); - -define( 'MW_EXPORT_BUFFER', 0 ); -define( 'MW_EXPORT_STREAM', 1 ); - -/** - * @package MediaWiki - * @subpackage SpecialPage - */ -class WikiExporter { - var $pageCallback = null; - var $revCallback = null; - - /** - * If using MW_EXPORT_STREAM to stream a large amount of data, - * provide a database connection which is not managed by - * LoadBalancer to read from: some history blob types will - * make additional queries to pull source data while the - * main query is still running. - * - * @param Database $db - * @param int $history one of MW_EXPORT_FULL or MW_EXPORT_CURRENT - * @param int $buffer one of MW_EXPORT_BUFFER or MW_EXPORT_STREAM - */ - function WikiExporter( &$db, $history = MW_EXPORT_CURRENT, - $buffer = MW_EXPORT_BUFFER ) { - $this->db =& $db; - $this->history = $history; - $this->buffer = $buffer; - } - - /** - * Set a callback to be called after each page in the output - * stream is closed. The callback will be passed a database row - * object with the last revision output. - * - * A set callback can be removed by passing null here. - * - * @param mixed $callback - */ - function setPageCallback( $callback ) { - $this->pageCallback = $callback; - } - - /** - * Set a callback to be called after each revision in the output - * stream is closed. 
The callback will be passed a database row - * object with the revision data. - * - * A set callback can be removed by passing null here. - * - * @param mixed $callback - */ - function setRevisionCallback( $callback ) { - $this->revCallback = $callback; - } - - /** - * Returns the export schema version. - * @return string - */ - function schemaVersion() { - return "0.3"; - } - - /** - * Opens the XML output stream's root element. - * This does not include an xml directive, so is safe to include - * as a subelement in a larger XML stream. Namespace and XML Schema - * references are included. - * - * To capture the stream to a string, use PHP's output buffering - * functions. Output will be encoded in UTF-8. - */ - function openStream() { - global $wgContLanguageCode; - $ver = $this->schemaVersion(); - print wfElement( 'mediawiki', array( - 'xmlns' => "http://www.mediawiki.org/xml/export-$ver/", - 'xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance", - 'xsi:schemaLocation' => "http://www.mediawiki.org/xml/export-$ver/ " . - "http://www.mediawiki.org/xml/export-$ver.xsd", - 'version' => $ver, - 'xml:lang' => $wgContLanguageCode ), - null ) . "\n"; - $this->siteInfo(); - } - - function siteInfo() { - $info = array( - $this->sitename(), - $this->homelink(), - $this->generator(), - $this->caseSetting(), - $this->namespaces() ); - print "\n"; - foreach( $info as $item ) { - print " $item\n"; - } - print "\n"; - } - - function sitename() { - global $wgSitename; - return wfElement( 'sitename', array(), $wgSitename ); - } - - function generator() { - global $wgVersion; - return wfElement( 'generator', array(), "MediaWiki $wgVersion" ); - } - - function homelink() { - $page = Title::newFromText( wfMsgForContent( 'mainpage' ) ); - return wfElement( 'base', array(), $page->getFullUrl() ); - } - - function caseSetting() { - global $wgCapitalLinks; - // "case-insensitive" option is reserved for future - $sensitivity = $wgCapitalLinks ? 
'first-letter' : 'case-sensitive'; - return wfElement( 'case', array(), $sensitivity ); - } - - function namespaces() { - global $wgContLang; - $spaces = "\n"; - foreach( $wgContLang->getFormattedNamespaces() as $ns => $title ) { - $spaces .= ' ' . wfElement( 'namespace', array( 'key' => $ns ), $title ) . "\n"; - } - $spaces .= " "; - return $spaces; - } - - /** - * Closes the output stream with the closing root element. - * Call when finished dumping things. - */ - function closeStream() { - print "\n"; - } - - /** - * Dumps a series of page and revision records for all pages - * in the database, either including complete history or only - * the most recent version. - */ - function allPages() { - return $this->dumpFrom( '' ); - } - - /** - * Dumps a series of page and revision records for those pages - * in the database falling within the page_id range given. - * @param int $start Inclusive lower limit (this id is included) - * @param int $end Exclusive upper limit (this id is not included) - * If 0, no upper limit. - */ - function pagesByRange( $start, $end ) { - $condition = 'page_id >= ' . intval( $start ); - if( $end ) { - $condition .= ' AND page_id < ' . intval( $end ); - } - return $this->dumpFrom( $condition ); - } - - /** - * @param Title $title - */ - function pageByTitle( $title ) { - return $this->dumpFrom( - 'page_namespace=' . $title->getNamespace() . - ' AND page_title=' . 
$this->db->addQuotes( $title->getDbKey() ) ); - } - - function pageByName( $name ) { - $title = Title::newFromText( $name ); - if( is_null( $title ) ) { - return new WikiError( "Can't export invalid title" ); - } else { - return $this->pageByTitle( $title ); - } - } - - function pagesByName( $names ) { - foreach( $names as $name ) { - $this->pageByName( $name ); - } - } - - - // -------------------- private implementation below -------------------- - - function dumpFrom( $cond = '' ) { - $fname = 'WikiExporter::dumpFrom'; - wfProfileIn( $fname ); - - $page = $this->db->tableName( 'page' ); - $revision = $this->db->tableName( 'revision' ); - $text = $this->db->tableName( 'text' ); - - if( $this->history == MW_EXPORT_FULL ) { - $join = 'page_id=rev_page'; - } elseif( $this->history == MW_EXPORT_CURRENT ) { - $join = 'page_id=rev_page AND page_latest=rev_id'; - } else { - wfProfileOut( $fname ); - return new WikiError( "$fname given invalid history dump type." ); - } - $where = ( $cond == '' ) ? '' : "$cond AND"; - - if( $this->buffer == MW_EXPORT_STREAM ) { - $prev = $this->db->bufferResults( false ); - } - if( $cond == '' ) { - // Optimization hack for full-database dump - $pageindex = 'FORCE INDEX (PRIMARY)'; - $revindex = 'FORCE INDEX(page_timestamp)'; - } else { - $pageindex = ''; - $revindex = ''; - } - $result = $this->db->query( - "SELECT * FROM - $page $pageindex, - $revision $revindex, - $text - WHERE $where $join AND rev_text_id=old_id - ORDER BY page_id", $fname ); - $wrapper = $this->db->resultObject( $result ); - $this->outputStream( $wrapper ); - - if( $this->buffer == MW_EXPORT_STREAM ) { - $this->db->bufferResults( $prev ); - } - - wfProfileOut( $fname ); - } - - /** - * Runs through a query result set dumping page and revision records. - * The result set should be sorted/grouped by page to avoid duplicate - * page records in the output. - * - * The result set will be freed once complete. 
Should be safe for - * streaming (non-buffered) queries, as long as it was made on a - * separate database connection not managed by LoadBalancer; some - * blob storage types will make queries to pull source data. - * - * @param ResultWrapper $resultset - * @access private - */ - function outputStream( $resultset ) { - $last = null; - while( $row = $resultset->fetchObject() ) { - if( is_null( $last ) || - $last->page_namespace != $row->page_namespace || - $last->page_title != $row->page_title ) { - if( isset( $last ) ) { - $this->closePage( $last ); - } - $this->openPage( $row ); - $last = $row; - } - $this->dumpRev( $row ); - } - if( isset( $last ) ) { - $this->closePage( $last ); - } - $resultset->free(); - } - - /** - * Opens a section on the output stream, with data - * from the given database row. - * - * @param object $row - * @access private - */ - function openPage( $row ) { - print "\n"; - $title = Title::makeTitle( $row->page_namespace, $row->page_title ); - print ' ' . wfElementClean( 'title', array(), $title->getPrefixedText() ) . "\n"; - print ' ' . wfElement( 'id', array(), strval( $row->page_id ) ) . "\n"; - if( '' != $row->page_restrictions ) { - print ' ' . wfElement( 'restrictions', array(), - strval( $row->page_restrictions ) ) . "\n"; - } - } - - /** - * Closes a section on the output stream. - * If a per-page callback has been set, it will be called - * and passed the last database row used for this page. - * - * @param object $row - * @access private - */ - function closePage( $row ) { - print "\n"; - if( isset( $this->pageCallback ) ) { - call_user_func( $this->pageCallback, $row ); - } - } - - /** - * Dumps a section on the output stream, with - * data filled in from the given database row. - * - * @param object $row - * @access private - */ - function dumpRev( $row ) { - $fname = 'WikiExporter::dumpRev'; - wfProfileIn( $fname ); - - print " \n"; - print " " . wfElement( 'id', null, strval( $row->rev_id ) ) . 
"\n"; - - $ts = wfTimestamp2ISO8601( strval( $row->rev_timestamp ) ); - print " " . wfElement( 'timestamp', null, $ts ) . "\n"; - - print " \n"; - if( $row->rev_user ) { - print " " . wfElementClean( 'username', null, strval( $row->rev_user_text ) ) . "\n"; - print " " . wfElement( 'id', null, strval( $row->rev_user ) ) . "\n"; - } else { - print " " . wfElementClean( 'ip', null, strval( $row->rev_user_text ) ) . "\n"; - } - print " \n"; - - if( $row->rev_minor_edit ) { - print " \n"; - } - if( $row->rev_comment != '' ) { - print " " . wfElementClean( 'comment', null, strval( $row->rev_comment ) ) . "\n"; - } - - $text = strval( Revision::getRevisionText( $row ) ); - print " " . wfElementClean( 'text', array( 'xml:space' => 'preserve' ), $text ) . "\n"; - - print " \n"; - - wfProfileOut( $fname ); - - if( isset( $this->revCallback ) ) { - call_user_func( $this->revCallback, $row ); - } - } - -} - -function wfTimestamp2ISO8601( $ts ) { - #2003-08-05T18:30:02Z - return preg_replace( '/^(....)(..)(..)(..)(..)(..)$/', '$1-$2-$3T$4:$5:$6Z', wfTimestamp( TS_MW, $ts ) ); -} - -function xmlsafe( $string ) { - $fname = 'xmlsafe'; - wfProfileIn( $fname ); - - /** - * The page may contain old data which has not been properly normalized. - * Invalid UTF-8 sequences or forbidden control characters will make our - * XML output invalid, so be sure to strip them out. 
- */ - $string = UtfNormal::cleanUp( $string ); - - $string = htmlspecialchars( $string ); - wfProfileOut( $fname ); - return $string; -} - ?> diff --git a/maintenance/dumpBackup.php b/maintenance/dumpBackup.php index c5b05b9bc6..afa9dba5a4 100644 --- a/maintenance/dumpBackup.php +++ b/maintenance/dumpBackup.php @@ -40,9 +40,76 @@ class BackupDumper { var $skipFooter = false; // don't output var $startId = 0; var $endId = 0; + var $sink = null; // Output filters - function BackupDumper() { + function BackupDumper( $args ) { $this->stderr = fopen( "php://stderr", "wt" ); + $this->sink = $this->processArgs( $args ); + } + + /** + * @param array $args + * @return array + * @static + */ + function processArgs( $args ) { + $outputTypes = array( + 'file' => 'DumpFileOutput', + 'gzip' => 'DumpGZipOutput', + 'bzip2' => 'DumpBZip2Output', + '7zip' => 'Dump7ZipOutput' ); + $filterTypes = array( + 'latest' => 'DumpLatestFilter', + 'notalk' => 'DumpNotalkFilter', + 'namespace' => 'DumpNamespaceFilter' ); + $sink = null; + $sinks = array(); + foreach( $args as $arg ) { + if( preg_match( '/^--(.+?)(?:=(.+?)(?::(.+?))?)?$/', $arg, $matches ) ) { + @list( $full, $opt, $val, $param ) = $matches; + switch( $opt ) { + case "output": + if( !is_null( $sink ) ) { + $sinks[] = $sink; + } + if( !isset( $outputTypes[$val] ) ) { + die( "Unrecognized output sink type '$val'\n" ); + } + $type = $outputTypes[$val]; + $sink = new $type( $param ); + break; + case "filter": + if( is_null( $sink ) ) { + $this->progress( "Warning: assuming stdout for filter output\n" ); + $sink = new DumpOutput(); + } + if( !isset( $filterTypes[$val] ) ) { + die( "Unrecognized filter type '$val'\n" ); + } + $type = $filterTypes[$val]; + $filter = new $type( $sink, $param ); + + // references are lame in php... 
+ unset( $sink ); + $sink = $filter; + + break; + default: + //die( "Unrecognized dump option'$opt'\n" ); + } + } + } + + if( is_null( $sink ) ) { + $sink = new DumpOutput(); + } + $sinks[] = $sink; + + if( count( $sinks ) > 1 ) { + return new DumpMultiWriter( $sinks ); + } else { + return $sink; + } } function dump( $history ) { @@ -61,8 +128,9 @@ class BackupDumper { $db =& $this->backupDb(); $exporter = new WikiExporter( $db, $history, MW_EXPORT_STREAM ); - $exporter->setPageCallback( array( &$this, 'reportPage' ) ); - $exporter->setRevisionCallback( array( &$this, 'revCount' ) ); + + $wrapper = new ExportProgressFilter( $this->sink, $this ); + $exporter->setOutputSink( $wrapper ); if( !$this->skipHeader ) $exporter->openStream(); @@ -100,12 +168,12 @@ class BackupDumper { : $wgDBserver; } - function reportPage( $page ) { + function reportPage() { $this->pageCount++; $this->report(); } - function revCount( $rev ) { + function revCount() { $this->revCount++; } @@ -140,7 +208,25 @@ class BackupDumper { } } -$dumper = new BackupDumper(); +class ExportProgressFilter extends DumpFilter { + function ExportProgressFilter( &$sink, &$progress ) { + parent::DumpFilter( $sink ); + $this->progress = $progress; + } + + function writeClosePage( $string ) { + parent::writeClosePage( $string ); + $this->progress->reportPage(); + } + + function writeRevision( $rev, $string ) { + parent::writeRevision( $rev, $string ); + $this->progress->revCount(); + } +} + +$dumper = new BackupDumper( $argv ); + if( isset( $options['quiet'] ) ) { $dumper->reporting = false; } -- 2.20.1