From aad99df7d42269c47b7788c7ab1167ce7e15d179 Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Sun, 8 May 2005 08:17:12 +0000 Subject: [PATCH] * HistoryBlobStub: the last-used HistoryBlob is kept open to speed up multiple-revision pulls * Special:Export now includes page, revision, and user id numbers by default (previously this was disabled for no particular reason) * dumpBackup.php can dump the full database to Export XML, with current revisions only or complete histories. --- RELEASE-NOTES | 6 + includes/Database.php | 3 +- includes/GlobalFunctions.php | 17 ++ includes/HistoryBlob.php | 62 +++++-- includes/SpecialExport.php | 346 ++++++++++++++++++++++++++--------- maintenance/dumpBackup.php | 133 ++++++++++++++ 6 files changed, 457 insertions(+), 110 deletions(-) create mode 100644 maintenance/dumpBackup.php diff --git a/RELEASE-NOTES b/RELEASE-NOTES index a77c52f42a..d236a640ee 100644 --- a/RELEASE-NOTES +++ b/RELEASE-NOTES @@ -174,6 +174,12 @@ Various bugfixes, small features, and a few experimental things: that the cur table be left in place until/unless such fields are migrated into the main text store. * (bug 1692) Fix margin on unwatch tab +* HistoryBlobStub: the last-used HistoryBlob is kept open to speed up + multiple-revision pulls +* Special:Export now includes page, revision, and user id numbers by + default (previously this was disabled for no particular reason) +* dumpBackup.php can dump the full database to Export XML, with current + revisions only or complete histories. === Caveats === diff --git a/includes/Database.php b/includes/Database.php index 73580316e0..e69e49b9e8 100644 --- a/includes/Database.php +++ b/includes/Database.php @@ -214,7 +214,8 @@ class Database { if ( $this->mFlags & DBO_PERSISTENT ) { @/**/$this->mConn = mysql_pconnect( $server, $user, $password ); } else { - @/**/$this->mConn = mysql_connect( $server, $user, $password ); + # Create a new connection... + @/**/$this->mConn = mysql_connect( $server, $user, $password, true ); } if ( $dbName != '' ) { diff --git a/includes/GlobalFunctions.php b/includes/GlobalFunctions.php index 55b8db401d..b52f91cbd4 100644 --- a/includes/GlobalFunctions.php +++ b/includes/GlobalFunctions.php @@ -1208,4 +1208,21 @@ function wfElement( $element, $attribs = array(), $contents = '') { return $out; } +/** + * Format an XML element as with wfElement(), but run text through the + * UtfNormal::cleanUp() validator first to ensure that no invalid UTF-8 + * is passed. + * + * @param string $element + * @param array $attribs Name=>value pairs. Values will be escaped. + * @param bool $contents NULL to make an open tag only; '' for a contentless closed tag (default) + * @return string + */ +function wfElementClean( $element, $attribs = array(), $contents = '') { + if( $attribs ) { + $attribs = array_map( array( 'UtfNormal', 'cleanUp' ), $attribs ); + } + return wfElement( $element, $attribs, UtfNormal::cleanUp( $contents ) ); +} + ?> diff --git a/includes/HistoryBlob.php b/includes/HistoryBlob.php index 5b1f453e84..a210f7c99b 100644 --- a/includes/HistoryBlob.php +++ b/includes/HistoryBlob.php @@ -167,6 +167,17 @@ class ConcatenatedGzipHistoryBlob extends HistoryBlob } } + +/** + * One-step cache variable to hold base blobs; operations that + * pull multiple revisions may often pull multiple times from + * the same blob. By keeping the last-used one open, we avoid + * redundant unserialization and decompression overhead. + */ +global $wgBlobCache; +$wgBlobCache = array(); + + /** * @package MediaWiki */ @@ -188,27 +199,37 @@ class HistoryBlobStub { /** @todo document */ function getText() { - $dbr =& wfGetDB( DB_SLAVE ); - $row = $dbr->selectRow( 'text', array( 'old_flags', 'old_text' ), array( 'old_id' => $this->mOldId ) ); - if( !$row ) { - return false; - } - $flags = explode( ',', $row->old_flags ); - if( !in_array( 'object', $flags ) ) { - return false; - } - - if( in_array( 'gzip', $flags ) ) { - // This shouldn't happen, but a bug in the compress script - // may at times gzip-compress a HistoryBlob object row. - $obj = unserialize( gzinflate( $row->old_text ) ); + global $wgBlobCache; + if( isset( $wgBlobCache[$this->mOldId] ) ) { + $obj = $wgBlobCache[$this->mOldId]; } else { - $obj = unserialize( $row->old_text ); - } - - if( !is_object( $obj ) ) { - // Correct for old double-serialization bug. - $obj = unserialize( $obj ); + $dbr =& wfGetDB( DB_SLAVE ); + $row = $dbr->selectRow( 'text', array( 'old_flags', 'old_text' ), array( 'old_id' => $this->mOldId ) ); + if( !$row ) { + return false; + } + $flags = explode( ',', $row->old_flags ); + if( !in_array( 'object', $flags ) ) { + return false; + } + + if( in_array( 'gzip', $flags ) ) { + // This shouldn't happen, but a bug in the compress script + // may at times gzip-compress a HistoryBlob object row. + $obj = unserialize( gzinflate( $row->old_text ) ); + } else { + $obj = unserialize( $row->old_text ); + } + + if( !is_object( $obj ) ) { + // Correct for old double-serialization bug. + $obj = unserialize( $obj ); + } + + // Save this item for reference; if pulling many + // items in a row we'll likely use it again. + $obj->uncompress(); + $wgBlobCache = array( $this->mOldId => $obj ); } return $obj->getItem( $this->mHash ); } @@ -257,4 +278,5 @@ class HistoryBlobCurStub { } } + ?> diff --git a/includes/SpecialExport.php b/includes/SpecialExport.php index fa4424fb80..722c1191e6 100644 --- a/includes/SpecialExport.php +++ b/includes/SpecialExport.php @@ -43,8 +43,13 @@ function wfSpecialExport( $page = '' ) { $wgOut->disable(); header( "Content-type: application/xml; charset=utf-8" ); $pages = explode( "\n", $page ); - $xml = pages2xml( $pages, $curonly ); - echo $xml; + + $db =& wfGetDB( DB_SLAVE ); + $history = $curonly ? MW_EXPORT_CURRENT : MW_EXPORT_FULL; + $exporter = new WikiExporter( $db, $history ); + $exporter->openStream(); + $exporter->pagesByName( $pages ); + $exporter->closeStream(); return; } @@ -62,114 +67,276 @@ function wfSpecialExport( $page = '' ) { " ); } -function pages2xml( $pages, $curonly = false ) { - $fname = 'pages2xml'; - wfProfileIn( $fname ); +define( 'MW_EXPORT_FULL', 0 ); +define( 'MW_EXPORT_CURRENT', 1 ); + +define( 'MW_EXPORT_BUFFER', 0 ); +define( 'MW_EXPORT_STREAM', 1 ); + +class WikiExporter { + var $pageCallback = null; + var $revCallback = null; - global $wgContLanguageCode, $wgInputEncoding, $wgContLang; - $xml = '' . "\n" . - '' . "\n"; - foreach( $pages as $page ) { - $xml .= page2xml( $page, $curonly ); + /** + * If using MW_EXPORT_STREAM to stream a large amount of data, + * provide a database connection which is not managed by + * LoadBalancer to read from: some history blob types will + * make additional queries to pull source data while the + * main query is still running. + * + * @param Database $db + * @param int $history one of MW_EXPORT_FULL or MW_EXPORT_CURRENT + * @param int $buffer one of MW_EXPORT_BUFFER or MW_EXPORT_STREAM + */ + function WikiExporter( &$db, $history = MW_EXPORT_CURRENT, + $buffer = MW_EXPORT_BUFFER ) { + $this->db =& $db; + $this->history = $history; + $this->buffer = $buffer; } - $xml .= "\n"; - if($wgInputEncoding != "utf-8") - $xml = $wgContLang->iconv( $wgInputEncoding, "utf-8", $xml ); - wfProfileOut( $fname ); - return $xml; -} - -function page2xml( $page, $curonly, $full = false ) { - global $wgLang; - $fname = 'page2xml'; - wfProfileIn( $fname ); + /** + * Set a callback to be called after each page in the output + * stream is closed. The callback will be passed a database row + * object with the last revision output. + * + * A set callback can be removed by passing null here. + * + * @param mixed $callback + */ + function setPageCallback( $callback ) { + $this->pageCallback = $callback; + } - $title = Title::NewFromText( $page ); - if( !$title ) { - wfProfileOut( $fname ); - return ""; + /** + * Set a callback to be called after each revision in the output + * stream is closed. The callback will be passed a database row + * object with the revision data. + * + * A set callback can be removed by passing null here. + * + * @param mixed $callback + */ + function setRevCallback( $callback ) { + $this->revCallback = $callback; } - - $dbr =& wfGetDB( DB_SLAVE ); - $s = $dbr->selectRow( 'page', - array( 'page_id', 'page_restrictions' ), - array( 'page_namespace' => $title->getNamespace(), - 'page_title' => $title->getDbkey() ) ); - if( $s ) { - $tl = xmlsafe( $title->getPrefixedText() ); - $xml = " \n"; - $xml .= " $tl\n"; - - if( $full ) { - $xml .= " $s->page_id\n"; + + /** + * Opens the XML output stream's root element. + * This does not include an xml directive, so is safe to include + * as a subelement in a larger XML stream. Namespace and XML Schema + * references are included. + * + * To capture the stream to a string, use PHP's output buffering + * functions. Output will be encoded in UTF-8. + */ + function openStream() { + global $wgContLanguageCode; + print wfElement( 'mediawiki', array( + 'xmlns' => 'http://www.mediawiki.org/xml/export-0.1/', + 'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance', + 'xsi:schemaLocation' => 'http://www.mediawiki.org/xml/export-0.1/ ' . + 'http://www.mediawiki.org/xml/export-0.1.xsd', + 'version' => '0.1', + 'xml:lang' => $wgContLanguageCode ), + null ) . "\n"; + } + + /** + * Closes the output stream with the closing root element. + * Call when finished dumping things. + */ + function closeStream() { + print "\n"; + } + + /** + * Dumps a series of page and revision records for all pages + * in the database, either including complete history or only + * the most recent version. + * + * + * @param Database $db + */ + function allPages() { + return $this->dumpFrom( '' ); + } + + /** + * @param Title $title + */ + function pageByTitle( $title ) { + return $this->dumpFrom( + 'page_namespace=' . $title->getNamespace() . + ' AND page_title=' . $this->db->addQuotes( $title->getDbKey() ) ); + } + + function pageByName( $name ) { + $title = Title::newFromText( $name ); + if( is_null( $title ) ) { + return WikiError( "Can't export invalid title" ); + } else { + return $this->pageByTitle( $title ); } - if( $s->page_restrictions ) { - $xml .= " " . xmlsafe( $s->page_restrictions ) . "\n"; + } + + function pagesByName( $names ) { + foreach( $names as $name ) { + $this->pageByName( $name ); } + } - if( $curonly ) { - $res = Revision::fetchRevision( $title ); + + // -------------------- private implementation below -------------------- + + function dumpFrom( $cond = '' ) { + $fname = 'WikiExporter::dumpFrom'; + wfProfileIn( $fname ); + + $page = $this->db->tableName( 'page' ); + $revision = $this->db->tableName( 'revision' ); + $text = $this->db->tableName( 'text' ); + + if( $this->history == MW_EXPORT_FULL ) { + $join = 'page_id=rev_page'; + } elseif( $this->history == MW_EXPORT_CURRENT ) { + $join = 'page_id=rev_page AND page_latest=rev_id'; } else { - $res = Revision::fetchAllRevisions( $title ); + wfProfileOut( $fname ); + return new WikiError( "$fname given invalid history dump type." ); } - if( $res ) { - while( $s = $res->fetchObject() ) { - $rev = new Revision( $s ); - $xml .= revision2xml( $rev, $full, false ); - } - $res->free(); + $where = ( $cond == '' ) ? '' : "$cond AND"; + + if( $this->buffer == MW_EXPORT_STREAM ) { + $prev = $this->db->bufferResults( false ); + } + $result = $this->db->query( + "SELECT * FROM + $page FORCE INDEX (PRIMARY), + $revision FORCE INDEX(page_timestamp), + $text + WHERE $where $join AND rev_text_id=old_id + ORDER BY page_id", $fname ); + $wrapper = $this->db->resultObject( $result ); + $this->outputStream( $wrapper ); + + if( $this->buffer == MW_EXPORT_STREAM ) { + $this->db->bufferResults( $prev ); } - $xml .= " \n"; - wfProfileOut( $fname ); - return $xml; - } else { wfProfileOut( $fname ); - return ""; } -} - -/** - * @return string - * @param Revision $rev - * @param bool $full - * @access private - */ -function revision2xml( $rev, $full ) { - $fname = 'revision2xml'; - wfProfileIn( $fname ); - - $xml = " \n"; - if( $full ) - $xml .= " " . $rev->getId() . "\n"; - $ts = wfTimestamp2ISO8601( $rev->getTimestamp() ); - $xml .= " $ts\n"; + /** + * Runs through a query result set dumping page and revision records. + * The result set should be sorted/grouped by page to avoid duplicate + * page records in the output. + * + * The result set will be freed once complete. Should be safe for + * streaming (non-buffered) queries, as long as it was made on a + * separate database connection not managed by LoadBalancer; some + * blob storage types will make queries to pull source data. + * + * @param ResultWrapper $resultset + * @access private + */ + function outputStream( $resultset ) { + $last = null; + while( $row = $resultset->fetchObject() ) { + if( is_null( $last ) || + $last->page_namespace != $row->page_namespace || + $last->page_title != $row->page_title ) { + if( isset( $last ) ) { + $this->closePage( $last ); + } + $this->openPage( $row ); + $last = $row; + } + $this->dumpRev( $row ); + } + if( isset( $last ) ) { + $this->closePage( $last ); + } + $resultset->free(); + } - if( $rev->getUser() ) { - $u = "" . xmlsafe( $rev->getUserText() ) . ""; - if( $full ) - $u .= "" . $rev->getUser() . ""; - } else { - $u = "" . xmlsafe( $rev->getUserText() ) . ""; + /** + * Opens a section on the output stream, with data + * from the given database row. + * + * @param object $row + * @access private + */ + function openPage( $row ) { + print "\n"; + $title = Title::makeTitle( $row->page_namespace, $row->page_title ); + print ' ' . wfElementClean( 'title', array(), $title->getPrefixedText() ) . "\n"; + print ' ' . wfElement( 'id', array(), $row->page_id ) . "\n"; + if( '' != $row->page_restrictions ) { + print ' ' . wfElement( 'restrictions', array(), + $row->page_restrictions ) . "\n"; + } } - $xml .= " $u\n"; - if( $rev->isMinor() ) { - $xml .= " \n"; + /** + * Closes a section on the output stream. + * If a per-page callback has been set, it will be called + * and passed the last database row used for this page. + * + * @param object $row + * @access private + */ + function closePage( $row ) { + print "\n"; + if( isset( $this->pageCallback ) ) { + call_user_func( $this->pageCallback, $row ); + } } - if($rev->getComment() != "") { - $c = xmlsafe( $rev->getComment() ); - $xml .= " $c\n"; + + /** + * Dumps a section on the output stream, with + * data filled in from the given database row. + * + * @param object $row + * @access private + */ + function dumpRev( $row ) { + $fname = 'WikiExporter::dumpRev'; + wfProfileIn( $fname ); + + print " \n"; + print " " . wfElement( 'id', null, $row->rev_id ) . "\n"; + + $ts = wfTimestamp2ISO8601( $row->rev_timestamp ); + print " " . wfElement( 'timestamp', null, $ts ) . "\n"; + + print " "; + if( $row->rev_user ) { + print wfElementClean( 'username', null, $row->rev_user_text ); + print wfElement( 'id', null, $row->rev_user ); + } else { + print wfElementClean( 'ip', null, $row->rev_user_text ); + } + print "\n"; + + if( $row->rev_minor_edit ) { + print " \n"; + } + if( $row->rev_comment != '' ) { + print " " . wfElementClean( 'comment', null, $row->rev_comment ) . "\n"; + } + + $text = Revision::getRevisionText( $row ); + print " " . wfElementClean( 'text', array(), $text ) . "\n"; + print " \n"; + + wfProfileOut( $fname ); + + if( isset( $this->revCallback ) ) { + call_user_func( $this->revCallback, $row ); + } } - $t = xmlsafe( $rev->getText() ); - - $xml .= " $t\n"; - $xml .= " \n"; - wfProfileOut( $fname ); - return $xml; } function wfTimestamp2ISO8601( $ts ) { @@ -192,4 +359,5 @@ function xmlsafe( $string ) { wfProfileOut( $fname ); return $string; } + ?> diff --git a/maintenance/dumpBackup.php b/maintenance/dumpBackup.php new file mode 100644 index 0000000000..bc4ce7a5f1 --- /dev/null +++ b/maintenance/dumpBackup.php @@ -0,0 +1,133 @@ + + * http://www.mediawiki.org/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @package MediaWiki + * @subpackage SpecialPage + */ + +$options = array( 'full', 'current' ); + +require_once( 'commandLine.inc' ); +require_once( 'SpecialExport.php' ); + +class BackupDumper { + var $reportingInterval = 100; + var $reporting = true; + var $pageCount = 0; + var $revCount = 0; + + function BackupDumper() { + $this->stderr = fopen( "php://stderr", "wt" ); + } + + function dump( $history ) { + # This shouldn't happen if on console... ;) + header( 'Content-type: text/html; charset=UTF-8' ); + + # Notice messages will foul up your XML output even if they're + # relatively harmless. + ini_set( 'display_errors', false ); + + $this->startTime = wfTime(); + + $db =& $this->backupDb(); + $exporter = new WikiExporter( $db, $history, MW_EXPORT_STREAM ); + $exporter->setPageCallback( array( &$this, 'reportPage' ) ); + $exporter->setRevCallback( array( &$this, 'revCount' ) ); + + $exporter->openStream(); + $exporter->allPages(); + $exporter->closeStream(); + + $this->report( true ); + } + + function &backupDb() { + global $wgDBadminuser, $wgDBadminpassword; + global $wgDBserver, $wgDBname; + $db =& new Database( $wgDBserver, $wgDBadminuser, $wgDBadminpassword, $wgDBname ); + return $db; + } + + function reportPage( $page ) { + $this->pageCount++; + $this->report(); + } + + function revCount( $rev ) { + $this->revCount++; + } + + function report( $final = false ) { + if( $final xor ( $this->pageCount % $this->reportingInterval == 0 ) ) { + $this->showReport(); + } + } + + function showReport() { + if( $this->reporting ) { + $delta = wfTime() - $this->startTime; + if( $delta ) { + $rate = $this->pageCount / $delta; + $revrate = $this->revCount / $delta; + } else { + $rate = '-'; + $revrate = '-'; + } + $this->progress( "$this->pageCount ($rate pages/sec $revrate revs/sec)" ); + } + } + + function progress( $string ) { + fwrite( $this->stderr, $string . "\n" ); + } +} + +$dumper = new BackupDumper(); +if( isset( $options['quiet'] ) ) { + $dumper->reporting = false; +} +if( isset( $options['report'] ) ) { + $dumper->reportingInterval = IntVal( $options['report'] ); +} +if( isset( $options['full'] ) ) { + $dumper->dump( MW_EXPORT_FULL ); +} elseif( isset( $options['current'] ) ) { + $dumper->dump( MW_EXPORT_CURRENT ); +} else { + $dumper->progress( << [] +Actions: + --full Dump complete history of every page. + --current Includes only the latest revision of each page. +Options: + --quiet Don't dump status reports to stderr. + --report=n Report position and speed after every n pages processed. + (Default: 100) +END +); +} + +?> \ No newline at end of file -- 2.20.1