3 * Script that postprocesses XML dumps from dumpBackup.php to add page text
5 * Copyright (C) 2005 Brion Vibber <brion@pobox.com>
6 * http://www.mediawiki.org/
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
24 * @ingroup Maintenance
27 $originalDir = getcwd();
29 require_once( dirname( __FILE__
) . '/commandLine.inc' );
30 require_once( 'backup.inc' );
33 * @ingroup Maintenance
35 class TextPassDumper
extends BackupDumper
{
// Stream URI the stub dump is read from; dump() opens it with fopen().
37 var $input = "php://stdin";
// Export mode; processOption() sets it to WikiExporter::CURRENT or WikiExporter::FULL.
38 var $history = WikiExporter
::FULL
;
// Number of texts accepted from the prefetch source (incremented in getText()).
40 var $prefetchCount = 0;
// Copies of the prefetch/fetch counters, stored at the end of showReport().
41 var $prefetchCountLast = 0;
42 var $fetchCountLast = 0;
// getText() throws once more than this many consecutive calls ended without a good text.
45 var $maxConsecutiveFailedTextRetrievals = 200;
46 var $failureTimeout = 5; // Seconds to sleep after db failure
// Handle returned by proc_open() in openSpawn() and the subprocess stream
// handles; closeSpawn() resets all four to false.
50 var $spawnProc = false;
51 var $spawnWrite = false;
52 var $spawnRead = false;
53 var $spawnErr = false;
// XmlDumpWriter created in dump(); endElement() calls closeStream()/openStream() on it.
55 var $xmlwriterobj = false;
57 // when we spend more than maxTimeAllowed seconds on this run, we continue
58 // processing until we write out the next complete page, then save output file(s),
59 // rename it/them and open new one(s)
60 var $maxTimeAllowed = 0; // 0 = no limit
// Set by setTimeExceeded(), cleared by endElement() after a checkpoint is written.
61 var $timeExceeded = false;
// Page ids (trimmed strings) of the first and last page written since the last
// checkpoint; false until a page has been closed.
62 var $firstPageWritten = false;
63 var $lastPageWritten = false;
// True right after endElement() rotated the output files; characterData() uses
// it to drop one leading newline.
64 var $checkpointJustWritten = false;
// Filename patterns collected from the 'checkpointfile' option.
65 var $checkpointFiles = array();
74 * Drop the database connection $this->db and try to get a new one.
76 * This function tries to get a /different/ connection if this is
77 * possible. Hence, (if this is possible) it switches to a different
78 * failover upon each call.
80 * This function resets $this->lb and closes all connections on it.
85 // Cleaning up old connections
86 if ( isset( $this->lb
) ) {
87 $this->lb
->closeAll();
91 if ( isset( $this->db
) && $this->db
->isOpen() )
93 throw new MWException( 'DB is set and has not been closed by the Load Balancer' );
99 // Trying to set up new connection.
100 // We do /not/ retry upon failure, but delegate to encapsulating logic, to avoid
101 // individually retrying at different layers of code.
103 // 1. The LoadBalancer.
105 $this->lb
= wfGetLBFactory()->newMainLB();
106 } catch (Exception
$e) {
107 throw new MWException( __METHOD__
. " rotating DB failed to obtain new load balancer (" . $e->getMessage() . ")" );
111 // 2. The Connection, through the load balancer.
113 $this->db
= $this->lb
->getConnection( DB_SLAVE
, 'backup' );
114 } catch (Exception
$e) {
115 throw new MWException( __METHOD__
. " rotating DB failed to obtain new database (" . $e->getMessage() . ")" );
/**
 * Initialise progress tracking and start the checkpoint timer.
 *
 * @param $history int Export mode (dump() passes $this->history, i.e.
 *   WikiExporter::FULL or WikiExporter::CURRENT)
 */
function initProgress( $history ) {
	// Forward the mode instead of silently dropping it.
	// NOTE(review): assumes BackupDumper::initProgress() takes the mode as its
	// first argument -- confirm against backup.inc. A surplus argument is
	// ignored by PHP, so this is harmless if it does not.
	parent::initProgress( $history );
	// checkIfTimeExceeded() measures elapsed time from this timestamp.
	$this->timeOfCheckpoint = $this->startTime;
}
// Entry point of the text pass: redirects error display to stderr, starts
// progress tracking, builds the output filter and XML writer, then streams the
// stub dump through readDump() and issues a final report.
// Neither $history nor $text is read in the lines visible here; the mode
// handed to initProgress() is $this->history.
// Throws MWException when readDump() returns an error object.
// NOTE(review): this extract skips original lines (gaps in the embedded
// numbering, e.g. 140-142 and 166-168); the opening of the try block and the
// body of the spawnProc branch are not visible.
125 function dump( $history, $text = WikiExporter
::TEXT
) {
126 // This shouldn't happen if on console... ;)
127 header( 'Content-type: text/html; charset=UTF-8' );
129 // Notice messages will foul up your XML output even if they're
130 // relatively harmless.
131 if ( ini_get( 'display_errors' ) )
132 ini_set( 'display_errors', 'stderr' );
134 $this->initProgress( $this->history
);
136 // We are trying to get an initial database connection to avoid that the
137 // first try of this request's first call to getText fails. However, if
138 // obtaining a good DB connection fails it's not a serious issue, as
139 // getText does retry upon failure and can start without having a working
143 } catch (Exception
$e) {
144 // We do not even count this as failure. Just let eventual
146 $this->progress( "Getting initial DB connection failed (" .
147 $e->getMessage() . ")" );
150 $this->egress
= new ExportProgressFilter( $this->sink
, $this );
152 // it would be nice to do it in the constructor, oh well. need egress set
153 $this->finalOptionCheck();
155 // we only want this so we know how to close a stream :-P
156 $this->xmlwriterobj
= new XmlDumpWriter();
// NOTE(review): the fopen() result goes straight to readDump() with no
// failure check in between.
158 $input = fopen( $this->input
, "rt" );
159 $result = $this->readDump( $input );
161 if ( WikiError
::isError( $result ) ) {
162 throw new MWException( $result->getMessage() );
165 if ( $this->spawnProc
) {
169 $this->report( true );
// Applies one command-line option to the dumper's state.
// NOTE(review): most of the switch is missing from this extract; only the
// 'checkpointfile' case label is visible, so which option triggers each of the
// other assignments cannot be confirmed from here.
172 function processOption( $opt, $val, $param ) {
174 $url = $this->processFileOpt( $val, $param );
// Loads the prefetch implementation on demand and wraps the URL in a BaseDump.
178 require_once "$IP/maintenance/backupPrefetch.inc";
179 $this->prefetch
= new BaseDump( $url );
// The value is multiplied by 60; the usage text gives maxtime in minutes, so
// the field holds seconds.
185 $this->maxTimeAllowed
= intval($val)*60;
187 case 'checkpointfile':
// Patterns accumulate, one entry per occurrence of the option.
188 $this->checkpointFiles
[] = $val;
191 $this->history
= WikiExporter
::CURRENT
;
194 $this->history
= WikiExporter
::FULL
;
// Splits $param on ';', turns each entry into a stream URI (the prefixes
// compress.zlib://, compress.bzip2:// and mediawiki.compress.7z:// are visible)
// and joins the results with ';' again.
// NOTE(review): the logic that picks a prefix, and the return statement, are
// not in this extract. $newFileURIs is not initialised in the visible lines.
205 function processFileOpt( $val, $param ) {
206 $fileURIs = explode(';',$param);
207 foreach ( $fileURIs as $URI ) {
213 $newURI = "compress.zlib://$URI";
216 $newURI = "compress.bzip2://$URI";
219 $newURI = "mediawiki.compress.7z://$URI";
224 $newFileURIs[] = $newURI;
226 $val = implode( ';', $newFileURIs );
231 * Overridden to include prefetch ratio if enabled.
// Progress line that adds prefetch hit rates to the page/revision rates and
// ETA. Without a prefetch source it defers to parent::showReport().
// NOTE(review): several original lines are missing from this extract
// (e.g. 241, 246-247, 253-255, 258-264); the conditions guarding the two rate
// sections and some fallback assignments are not visible.
233 function showReport() {
234 if ( !$this->prefetch
) {
235 parent
::showReport();
239 if ( $this->reporting
) {
240 $now = wfTimestamp( TS_DB
);
242 $deltaAll = wfTime() - $this->startTime
;
243 $deltaPart = wfTime() - $this->lastTime
;
244 $this->pageCountPart
= $this->pageCount
- $this->pageCountLast
;
245 $this->revCountPart
= $this->revCount
- $this->revCountLast
;
// NOTE(review): $portion is 0 while revCount is 0, and the ETA on the next
// line divides by it -- confirm a guard in the missing lines covers this.
248 $portion = $this->revCount
/ $this->maxCount
;
249 $eta = $this->startTime +
$deltaAll / $portion;
250 $etats = wfTimestamp( TS_DB
, intval( $eta ) );
251 if ( $this->fetchCount
) {
252 $fetchRate = 100.0 * $this->prefetchCount
/ $this->fetchCount
;
256 $pageRate = $this->pageCount
/ $deltaAll;
257 $revRate = $this->revCount
/ $deltaAll;
// NOTE(review): prefetchCountLast and fetchCountLast hold the cumulative
// totals stored at the end of the previous report (see below), so this ratio
// is the overall hit rate as of that report, not the rate for the current
// interval.
265 if ( $this->fetchCountLast
) {
266 $fetchRatePart = 100.0 * $this->prefetchCountLast
/ $this->fetchCountLast
;
268 $fetchRatePart = '-';
270 $pageRatePart = $this->pageCountPart
/ $deltaPart;
271 $revRatePart = $this->revCountPart
/ $deltaPart;
274 $fetchRatePart = '-';
// NOTE(review): the '-' placeholders assigned above are passed to %0.1f
// conversions here, which format a non-numeric string as 0.0.
278 $this->progress( sprintf( "%s: %s (ID %d) %d pages (%0.1f|%0.1f/sec all|curr), %d revs (%0.1f|%0.1f/sec all|curr), %0.1f%%|%0.1f%% prefetched (all|curr), ETA %s [max %d]",
279 $now, wfWikiID(), $this->ID
, $this->pageCount
, $pageRate, $pageRatePart, $this->revCount
, $revRate, $revRatePart, $fetchRate, $fetchRatePart, $etats, $this->maxCount
) );
// NOTE(review): $nowts is not assigned in any line visible here (original
// line 241 is absent). Also, pageCountLast is not refreshed alongside
// revCountLast although pageCountPart above is computed from it -- confirm
// something else updates it.
280 $this->lastTime
= $nowts;
281 $this->revCountLast
= $this->revCount
;
282 $this->prefetchCountLast
= $this->prefetchCount
;
283 $this->fetchCountLast
= $this->fetchCount
;
/**
 * Flag that the time budget for the current output file has run out.
 * endElement() reads the flag when a page closes and clears it after
 * rotating the output files.
 */
function setTimeExceeded() {
	$this->timeExceeded = true;
}
/**
 * Tell whether more than maxTimeAllowed seconds lie between the last
 * checkpoint and the last recorded report time.
 *
 * @return bool Always false when no limit is configured (maxTimeAllowed is 0)
 */
function checkIfTimeExceeded() {
	if ( !$this->maxTimeAllowed ) {
		return false;
	}
	$elapsed = $this->lastTime - $this->timeOfCheckpoint;
	return $elapsed > $this->maxTimeAllowed;
}
// Validates the checkpoint-related options once the output filter exists:
//  - checkpointfile and maxtime must be given together,
//  - each pattern is checked for its number of '%s' placeholders,
//  - there must be exactly one pattern per output filename.
// Throws MWException on any violation.
// NOTE(review): the condition guarding the '%s' count exception (original line
// 305) is not visible. The exception text requires two '%s', while the usage
// text at the end of the file describes the second one as optional.
298 function finalOptionCheck() {
299 if ( ( $this->checkpointFiles
&& ! $this->maxTimeAllowed
) ||
300 ( $this->maxTimeAllowed
&& !$this->checkpointFiles
) ) {
301 throw new MWException("Options checkpointfile and maxtime must be specified together.\n");
303 foreach ($this->checkpointFiles
as $checkpointFile) {
304 $count = substr_count ( $checkpointFile,"%s" );
306 throw new MWException("Option checkpointfile must contain two '%s' for substitution of first and last pageids, count is $count instead, file is $checkpointFile.\n");
310 if ( $this->checkpointFiles
) {
311 $filenameList = (array)$this->egress
->getFilenames();
312 if ( count( $filenameList ) != count( $this->checkpointFiles
) ) {
313 throw new MWException("One checkpointfile must be specified for each output option, if maxtime is used.\n");
// Feeds $input to an expat parser in 512 KiB chunks, with this object's
// startElement/endElement/characterData as handlers. Before each chunk the
// time budget is checked. Once input is exhausted and maxTimeAllowed is set,
// the output file(s) written since the last checkpoint are renamed to their
// checkpoint names.
// Returns a WikiXmlError on a parse failure; the success return is not visible
// in this extract (original lines 373-374 are absent).
318 function readDump( $input ) {
320 $this->openElement
= false;
321 $this->atStart
= true;
323 $this->lastName
= "";
327 $parser = xml_parser_create( "UTF-8" );
// Keep element names as written instead of upper-casing them.
328 xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING
, false );
330 xml_set_element_handler( $parser, array( &$this, 'startElement' ), array( &$this, 'endElement' ) );
331 xml_set_character_data_handler( $parser, array( &$this, 'characterData' ) );
333 $offset = 0; // for context extraction on error reporting
334 $bufferSize = 512 * 1024;
336 if ($this->checkIfTimeExceeded()) {
337 $this->setTimeExceeded();
339 $chunk = fread( $input, $bufferSize );
// feof() doubles as the "final chunk" flag for xml_parse().
340 if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) {
// NOTE(review): the debug text names "TextDumpPass" although the class is
// TextPassDumper, and this return happens before xml_parser_free() below.
341 wfDebug( "TextDumpPass::readDump encountered XML parsing error\n" );
342 return new WikiXmlError( $parser, 'XML import parse failure', $chunk, $offset );
344 $offset +
= strlen( $chunk );
// NOTE(review): $chunk !== false is only tested here, after the chunk has
// already been handed to xml_parse() and strlen().
345 } while ( $chunk !== false && !feof( $input ) );
346 if ($this->maxTimeAllowed
) {
347 $filenameList = (array)$this->egress
->getFilenames();
348 // we wrote some stuff after last checkpoint that needs to be renamed
349 if (file_exists($filenameList[0])) {
350 $newFilenames = array();
351 # we might have just written the header and footer and had no
352 # pages or revisions written... perhaps they were all deleted
353 # there's no pageID 0 so we use that. the caller is responsible
354 # for deciding what to do with a file containing only the
355 # siteinfo information and the mw tags.
356 if (! $this->firstPageWritten
) {
357 $firstPageID = str_pad(0,9,"0",STR_PAD_LEFT
);
358 $lastPageID = str_pad(0,9,"0",STR_PAD_LEFT
);
// Page ids are left-padded with zeros to nine characters for the filename.
361 $firstPageID = str_pad($this->firstPageWritten
,9,"0",STR_PAD_LEFT
);
362 $lastPageID = str_pad($this->lastPageWritten
,9,"0",STR_PAD_LEFT
);
// Each checkpoint name is placed in the directory of the corresponding
// output file.
364 for ( $i = 0; $i < count( $filenameList ); $i++
) {
365 $checkpointNameFilledIn = sprintf( $this->checkpointFiles
[$i], $firstPageID, $lastPageID );
366 $fileinfo = pathinfo($filenameList[$i]);
367 $newFilenames[] = $fileinfo['dirname'] . '/' . $checkpointNameFilledIn;
369 $this->egress
->closeAndRename( $newFilenames );
372 xml_parser_free( $parser );
378 * Tries to get the revision text for a revision id.
380 * Upon errors, retries (Up to $this->maxFailures tries each call).
381 * If no good revision text could be obtained even after these retries, "" is returned.
382 * If no good revision text could be returned for
383 * $this->maxConsecutiveFailedTextRetrievals consecutive calls to getText, MWException
386 * @param $id string The revision id to get the text for
388 * @return string The revision text for $id, or ""
389 * @throws MWException
// Fetches the text for text id $id, retrying up to $this->maxFailures times.
// Each attempt first tries the prefetch source (once per call), then the
// spawned subprocess or the database, and accepts the text only when its byte
// length equals the rev_len stored for the current revision. After each failed
// attempt it logs, sleeps $this->failureTimeout seconds and tries to reset its
// infrastructure.
// Throws MWException once more than maxConsecutiveFailedTextRetrievals
// consecutive calls have ended without a good text.
// NOTE(review): many original lines are missing from this extract (try
// openings, return statements, the reset calls), so the exact control flow
// cannot be confirmed from here.
391 function getText( $id ) {
392 $prefetchNotTried = true; // Whether or not we already tried to get the text via prefetch.
393 $text = false; // The candidate for a good text. false if no proper value.
394 $failures = 0; // The number of times this invocation of getText has already failed.
// A function-level static: the counter persists across calls.
396 static $consecutiveFailedTextRetrievals = 0; // The number of times getText failed without
397 // yielding a good text in between.
401 // To be able to simply return on success without having to worry about bookkeeping,
402 // we assume this fetch works (possibly after some retries). Nevertheless, we keep
403 // the old value, so we can restore it if problems occur (see after the while loop).
404 $oldConsecutiveFailedTextRetrievals = $consecutiveFailedTextRetrievals;
405 $consecutiveFailedTextRetrievals = 0;
407 while ( $failures < $this->maxFailures
) {
409 // As soon as we found a good text for the $id, we will return immediately.
410 // Hence, if we make it past the try catch block, we know that we did not
414 // Step 1: Get some text (or reuse from previous iteration if checking
415 // for plausibility failed)
417 // Trying to get prefetch, if it has not been tried before
418 if ( $text === false && isset( $this->prefetch
) && $prefetchNotTried ) {
419 $prefetchNotTried = false;
420 $tryIsPrefetch = true;
421 $text = $this->prefetch
->prefetch( $this->thisPage
, $this->thisRev
);
422 if ( $text === null ) {
427 if ( $text === false ) {
428 // Fallback to asking the database
429 $tryIsPrefetch = false;
430 if ( $this->spawn
) {
431 $text = $this->getTextSpawned( $id );
433 $text = $this->getTextDb( $id );
437 if ( $text === false ) {
438 throw new MWException( "Generic error while obtaining text for id " . $id );
441 // We received a good candidate for the text of $id via some method
443 // Step 2: Checking for plausibility and return the text if it is
445 $revID = intval( $this->thisRev
);
446 if ( ! isset( $this->db
) ) {
447 throw new MWException( "No database available" );
// NOTE(review): the comparison below is loose (==); presumably selectField()
// can return a non-integer when no row matches -- verify how that interacts
// with an empty text.
449 $revLength = $this->db
->selectField( 'revision', 'rev_len', array( 'rev_id' => $revID ) );
450 if( strlen( $text ) == $revLength ) {
451 if ( $tryIsPrefetch ) {
452 $this->prefetchCount++
;
458 throw new MWException( "Received text is unplausible for id " . $id );
460 } catch (Exception
$e) {
461 $msg = "getting/checking text " . $id . " failed (".$e->getMessage().")";
462 if ( $failures +
1 < $this->maxFailures
) {
463 $msg .= " (Will retry " . ( $this->maxFailures
- $failures - 1) . " more times)";
465 $this->progress( $msg );
468 // Something went wrong; we did not get a text that was plausible :(
472 // After backing off for some time, we try to reboot the whole process as
473 // much as possible to not carry over failures from one part to the other
475 sleep( $this->failureTimeout
);
478 if ( $this->spawn
) {
482 } catch (Exception
$e) {
483 $this->progress( "Rebooting getText infrastructure failed (".$e->getMessage().")" .
484 " Trying to continue anyways" );
488 // Retrieving a good text for $id failed (at least) maxFailures times.
489 // We abort for this $id.
491 // Restoring the consecutive failures, and maybe aborting, if the dump
493 $consecutiveFailedTextRetrievals = $oldConsecutiveFailedTextRetrievals +
1;
494 if ( $consecutiveFailedTextRetrievals > $this->maxConsecutiveFailedTextRetrievals
) {
495 throw new MWException( "Graceful storage failure" );
503 * May throw a database error if, say, the server dies during query.
505 * @return bool|string
506 * @throws MWException
// Loads the row for $id from the 'text' table through $this->db, decodes it
// with Revision::getRevisionText(), strips carriage returns and runs the result
// through $wgContLang->normalize().
// Throws MWException when no database handle is set.
// NOTE(review): original lines 509, 516, 519-520 and 523-524 are missing from
// this extract, so the global declaration for $wgContLang, the last selectRow()
// argument and the return statements are not visible.
508 private function getTextDb( $id ) {
510 if ( ! isset( $this->db
) ) {
// NOTE(review): __METHOD__ is concatenated straight onto the message with no
// separator, so the text reads "...getTextDbNo database available".
511 throw new MWException( __METHOD__
. "No database available" );
513 $row = $this->db
->selectRow( 'text',
514 array( 'old_text', 'old_flags' ),
515 array( 'old_id' => $id ),
517 $text = Revision
::getRevisionText( $row );
518 if ( $text === false ) {
521 $stripped = str_replace( "\r", "", $text );
522 $normalized = $wgContLang->normalize( $stripped );
// Fetches the text for $id through the helper subprocess, with PHP warnings
// suppressed around the call to getTextSpawnedOnce().
// NOTE(review): the body of the "no subprocess yet" branch and everything
// after the fetch (original lines 529-531, 533-535) are missing from this
// extract.
526 private function getTextSpawned( $id ) {
527 wfSuppressWarnings();
528 if ( !$this->spawnProc
) {
532 $text = $this->getTextSpawnedOnce( $id );
// Starts the text-fetching subprocess with proc_open() and stores the process
// handle in $this->spawnProc. The command is built from shell-escaped
// arguments and runs fetchText.php, going through multiversion/MWScript.php
// when that file exists next to $IP.
// NOTE(review): large parts of the command construction, the failure return
// and the pipe assignment are missing from this extract.
537 function openSpawn() {
540 if ( file_exists( "$IP/../multiversion/MWScript.php" ) ) {
542 array_map( 'wfEscapeShellArg',
545 "$IP/../multiversion/MWScript.php",
547 '--wiki', wfWikiID() ) ) );
551 array_map( 'wfEscapeShellArg',
554 "$IP/maintenance/fetchText.php",
555 '--wiki', wfWikiID() ) ) );
// Descriptor spec: child stdin and stdout are pipes, child stderr is appended
// to /dev/null.
558 0 => array( "pipe", "r" ),
559 1 => array( "pipe", "w" ),
560 2 => array( "file", "/dev/null", "a" ) );
563 $this->progress( "Spawning database subprocess: $cmd" );
564 $this->spawnProc
= proc_open( $cmd, $spec, $pipes );
565 if ( !$this->spawnProc
) {
567 $this->progress( "Subprocess spawn failed." );
571 $this->spawnWrite
, // -> stdin
572 $this->spawnRead
, // <- stdout
/**
 * Shut down the text-fetching subprocess and release its stream handles.
 *
 * Every handle is reset to false afterwards, so the method can be called
 * when nothing, or only part of the set, is open.
 */
private function closeSpawn() {
	// Closing an already-dead pipe may emit warnings; keep them out of the output.
	wfSuppressWarnings();
	// Close the stream handles before the process handle, in the original order.
	foreach ( array( 'spawnRead', 'spawnWrite', 'spawnErr' ) as $handle ) {
		if ( $this->$handle ) {
			fclose( $this->$handle );
		}
		$this->$handle = false;
	}
	if ( $this->spawnProc ) {
		// The handle comes from proc_open(), so it has to be released with
		// proc_close(); pclose() only accepts streams created by popen().
		proc_close( $this->spawnProc );
	}
	$this->spawnProc = false;
	// Counterpart of the wfSuppressWarnings() call above.
	wfRestoreWarnings();
}
// One request/response round trip with the helper subprocess:
//   1. write "$id\n" to its stdin and flush,
//   2. read back a line holding the id it is answering,
//   3. read a line holding the byte length (negative means error),
//   4. read that many bytes of text, looping over partial reads.
// The text then has carriage returns stripped and is normalized through
// $wgContLang. Returns false on the visible write, flush, read and length
// failures.
// NOTE(review): original lines 596-597, 610-611, 613-615, 623-625, 630-632,
// 636-638 and 642-643 are missing from this extract, so the $wgContLang global
// declaration, the initialisation of and appending to $text, the bodies of the
// mismatch branches and the final return are not visible.
595 private function getTextSpawnedOnce( $id ) {
598 $ok = fwrite( $this->spawnWrite
, "$id\n" );
599 // $this->progress( ">> $id" );
600 if ( !$ok ) return false;
602 $ok = fflush( $this->spawnWrite
);
603 // $this->progress( ">> [flush]" );
604 if ( !$ok ) return false;
606 // check that the text id they are sending is the one we asked for
607 // this avoids out of sync revision text errors we have encountered in the past
608 $newId = fgets( $this->spawnRead
);
609 if ( $newId === false ) {
612 if ( $id != intval( $newId ) ) {
616 $len = fgets( $this->spawnRead
);
617 // $this->progress( "<< " . trim( $len ) );
618 if ( $len === false ) return false;
620 $nbytes = intval( $len );
621 // actual error, not zero-length text
622 if ($nbytes < 0 ) return false;
626 // Subprocess may not send everything at once, we have to loop.
// NOTE(review): only a false result from fread() breaks the loop; confirm how
// an empty-string read (stream at EOF) is handled in the missing lines.
627 while ( $nbytes > strlen( $text ) ) {
628 $buffer = fread( $this->spawnRead
, $nbytes - strlen( $text ) );
629 if ( $buffer === false ) break;
633 $gotbytes = strlen( $text );
634 if ( $gotbytes != $nbytes ) {
635 $this->progress( "Expected $nbytes bytes from database subprocess, got $gotbytes " );
639 // Do normalization in the dump thread...
640 $stripped = str_replace( "\r", "", $text );
641 $normalized = $wgContLang->normalize( $stripped );
// Expat start-tag handler. Flushes any pending open element, records the tag
// name, and:
//  - on <revision>: writes the buffered page opening through the output filter,
//  - on <page>: writes the stream header first if this is the first page,
//  - on <text id="...">: fetches the text with getText() and emits it as
//    character data of a <text xml:space="preserve"> element,
//  - otherwise: remembers the element so it can be written out later.
// NOTE(review): buffer resets and some branch boundaries (original lines 647,
// 650, 654, 659, 661-663, 669-670, 672-673) are missing from this extract.
645 function startElement( $parser, $name, $attribs ) {
646 $this->checkpointJustWritten
= false;
648 $this->clearOpenElement( null );
649 $this->lastName
= $name;
651 if ( $name == 'revision' ) {
652 $this->state
= $name;
653 $this->egress
->writeOpenPage( null, $this->buffer
);
655 } elseif ( $name == 'page' ) {
656 $this->state
= $name;
657 if ( $this->atStart
) {
658 $this->egress
->writeOpenStream( $this->buffer
);
660 $this->atStart
= false;
664 if ( $name == "text" && isset( $attribs['id'] ) ) {
665 $text = $this->getText( $attribs['id'] );
// The stub's id attribute is replaced by xml:space="preserve".
666 $this->openElement
= array( $name, array( 'xml:space' => 'preserve' ) );
667 if ( strlen( $text ) > 0 ) {
668 $this->characterData( $parser, $text );
671 $this->openElement
= array( $name, $attribs );
// Expat end-tag handler. Closes the current element in the buffer, then:
//  - on </revision>: hands the buffer to the output filter as a revision,
//  - on </page>: records first/last page ids; if the time budget was exceeded
//    it closes the page and the stream, renames the output file(s) to their
//    checkpoint names, reopens them, starts a new stream header and resets the
//    checkpoint state; otherwise it just closes the page,
//  - on </mediawiki>: writes the stream footer.
// NOTE(review): else branches, buffer resets and closing braces (e.g. original
// lines 680, 682-683, 686-687, 698, 701-702, 705, 714, 721-722, 724, 726-727,
// 730-732) are missing from this extract.
675 function endElement( $parser, $name ) {
676 $this->checkpointJustWritten
= false;
678 if ( $this->openElement
) {
679 $this->clearOpenElement( "" );
681 $this->buffer
.= "</$name>";
684 if ( $name == 'revision' ) {
685 $this->egress
->writeRevision( null, $this->buffer
);
688 } elseif ( $name == 'page' ) {
689 if (! $this->firstPageWritten
) {
690 $this->firstPageWritten
= trim($this->thisPage
);
692 $this->lastPageWritten
= trim($this->thisPage
);
693 if ($this->timeExceeded
) {
694 $this->egress
->writeClosePage( $this->buffer
);
695 // nasty hack, we can't just write the chardata after the
696 // page tag, it will include leading blanks from the next line
697 $this->egress
->sink
->write("\n");
699 $this->buffer
= $this->xmlwriterobj
->closeStream();
700 $this->egress
->writeCloseStream( $this->buffer
);
703 $this->thisPage
= "";
704 // this could be more than one file if we had more than one output arg
706 $filenameList = (array)$this->egress
->getFilenames();
707 $newFilenames = array();
// Same naming scheme as at the end of readDump(): ids zero-padded to nine
// characters, file placed in the directory of the matching output file.
708 $firstPageID = str_pad($this->firstPageWritten
,9,"0",STR_PAD_LEFT
);
709 $lastPageID = str_pad($this->lastPageWritten
,9,"0",STR_PAD_LEFT
);
710 for ( $i = 0; $i < count( $filenameList ); $i++
) {
711 $checkpointNameFilledIn = sprintf( $this->checkpointFiles
[$i], $firstPageID, $lastPageID );
712 $fileinfo = pathinfo($filenameList[$i]);
713 $newFilenames[] = $fileinfo['dirname'] . '/' . $checkpointNameFilledIn;
715 $this->egress
->closeRenameAndReopen( $newFilenames );
716 $this->buffer
= $this->xmlwriterobj
->openStream();
// Reset checkpoint state: the next interval is timed from lastTime.
717 $this->timeExceeded
= false;
718 $this->timeOfCheckpoint
= $this->lastTime
;
719 $this->firstPageWritten
= false;
720 $this->checkpointJustWritten
= true;
723 $this->egress
->writeClosePage( $this->buffer
);
725 $this->thisPage
= "";
728 } elseif ( $name == 'mediawiki' ) {
729 $this->egress
->writeCloseStream( $this->buffer
);
/**
 * Expat character-data handler; also called directly by startElement() to
 * emit fetched revision text.
 *
 * Collects the text of <id> elements into the current revision or page id,
 * then appends the HTML-escaped data to the output buffer.
 *
 * @param $parser resource The XML parser (unused)
 * @param $data string Raw character data
 */
function characterData( $parser, $data ) {
	$this->clearOpenElement( null );
	if ( $this->lastName == "id" ) {
		// Ids may arrive in several chunks, hence the concatenation.
		if ( $this->state == "revision" ) {
			$this->thisRev .= $data;
		} elseif ( $this->state == "page" ) {
			$this->thisPage .= $data;
		}
	}
	// have to skip the newline left over from closepagetag line of
	// end of checkpoint files. nasty hack!!
	if ( $this->checkpointJustWritten ) {
		// isset() guards the offset access so an empty chunk cannot raise an
		// "uninitialized string offset" error.
		if ( isset( $data[0] ) && $data[0] == "\n" ) {
			// substr() yields false on older PHP when nothing is left;
			// keep $data a string.
			$data = (string)substr( $data, 1 );
		}
		$this->checkpointJustWritten = false;
	}
	$this->buffer .= htmlspecialchars( $data );
}
/**
 * Write out the element remembered by startElement(), if there is one, and
 * forget it.
 *
 * @param $style mixed Third argument for Xml::element(): callers in this
 *   class pass null from startElement()/characterData() and "" from
 *   endElement()
 */
function clearOpenElement( $style ) {
	if ( !$this->openElement ) {
		return;
	}
	// openElement is stored as array( tag name, attributes ).
	list( $tag, $attributes ) = $this->openElement;
	$this->buffer .= Xml::element( $tag, $attributes, $style );
	$this->openElement = false;
}
// Script entry point: build the dumper from the command-line arguments, then
// either run the dump or, when 'help' is among the options, print the usage
// text through progress().
// NOTE(review): the argument true lands in dump()'s $history parameter, which
// dump() does not read in the lines visible in this extract.
763 $dumper = new TextPassDumper( $argv );
765 if ( !isset( $options['help'] ) ) {
766 $dumper->dump( true );
768 $dumper->progress( <<<ENDS
769 This script postprocesses XML dumps from dumpBackup.php to add
770 page text which was stubbed out (using --stub).
772 XML input is accepted on stdin.
773 XML output is sent to stdout; progress reports are sent to stderr.
775 Usage: php dumpTextPass.php [<options>]
777 --stub=<type>:<file> To load a compressed stub dump instead of stdin
778 --prefetch=<type>:<file> Use a prior dump file as a text source, to save
779 pressure on the database.
780 (Requires the XMLReader extension)
781 --maxtime=<minutes> Write out checkpoint file after this many minutes (writing
782 out complete page, closing xml file properly, and opening new one
783 with header). This option requires the checkpointfile option.
784 --checkpointfile=<filenamepattern> Use this string for checkpoint filenames,
785 substituting first pageid written for the first %s (required) and the
786 last pageid written for the second %s if it exists.
787 --quiet Don't dump status reports to stderr.
788 --report=n Report position and speed after every n pages processed.
790 --server=h Force reading from MySQL server h
791 --current Base ETA on number of pages in database instead of all revisions
792 --spawn Spawn a subprocess for loading text records
793 --help Display this help message