*/
class TextPassDumper extends BackupDumper {
public $prefetch = null;
+
+ // when we spend more than maxTimeAllowed seconds on this run, we continue
+ // processing until we write out the next complete page, then save output file(s),
+ // rename it/them and open new one(s)
+ public $maxTimeAllowed = 0; // 0 = no limit
+
protected $input = "php://stdin";
protected $history = WikiExporter::FULL;
protected $fetchCount = 0;
protected $xmlwriterobj = false;
- // when we spend more than maxTimeAllowed seconds on this run, we continue
- // processing until we write out the next complete page, then save output file(s),
- // rename it/them and open new one(s)
- protected $maxTimeAllowed = 0; // 0 = no limit
protected $timeExceeded = false;
protected $firstPageWritten = false;
protected $lastPageWritten = false;
if ( $this->forcedDb !== null ) {
$this->db = $this->forcedDb;
+
return;
}
$url = $this->processFileOpt( $val, $param );
switch ( $opt ) {
- case 'prefetch':
- require_once "$IP/maintenance/backupPrefetch.inc";
- $this->prefetch = new BaseDump( $url );
- break;
- case 'stub':
- $this->input = $url;
- break;
- case 'maxtime':
- $this->maxTimeAllowed = intval( $val ) * 60;
- break;
- case 'checkpointfile':
- $this->checkpointFiles[] = $val;
- break;
- case 'current':
- $this->history = WikiExporter::CURRENT;
- break;
- case 'full':
- $this->history = WikiExporter::FULL;
- break;
- case 'spawn':
- $this->spawn = true;
- if ( $val ) {
- $this->php = $val;
- }
- break;
+ case 'prefetch':
+ require_once "$IP/maintenance/backupPrefetch.inc";
+ $this->prefetch = new BaseDump( $url );
+ break;
+ case 'stub':
+ $this->input = $url;
+ break;
+ case 'maxtime':
+ $this->maxTimeAllowed = intval( $val ) * 60;
+ break;
+ case 'checkpointfile':
+ $this->checkpointFiles[] = $val;
+ break;
+ case 'current':
+ $this->history = WikiExporter::CURRENT;
+ break;
+ case 'full':
+ $this->history = WikiExporter::FULL;
+ break;
+ case 'spawn':
+ $this->spawn = true;
+ if ( $val ) {
+ $this->php = $val;
+ }
+ break;
}
}
$newFileURIs[] = $newURI;
}
$val = implode( ';', $newFileURIs );
+
return $val;
}
function showReport() {
if ( !$this->prefetch ) {
parent::showReport();
+
return;
}
}
$pageRatePart = $this->pageCountPart / $deltaPart;
$revRatePart = $this->revCountPart / $deltaPart;
-
} else {
$fetchRatePart = '-';
$pageRatePart = '-';
}
function finalOptionCheck() {
- if ( ( $this->checkpointFiles && ! $this->maxTimeAllowed ) ||
- ( $this->maxTimeAllowed && !$this->checkpointFiles ) ) {
+ if ( ( $this->checkpointFiles && !$this->maxTimeAllowed )
+ || ( $this->maxTimeAllowed && !$this->checkpointFiles )
+ ) {
throw new MWException( "Options checkpointfile and maxtime must be specified together.\n" );
}
foreach ( $this->checkpointFiles as $checkpointFile ) {
- $count = substr_count ( $checkpointFile, "%s" );
+ $count = substr_count( $checkpointFile, "%s" );
if ( $count != 2 ) {
throw new MWException( "Option checkpointfile must contain two '%s' "
. "for substitution of first and last pageids, count is $count instead, "
/**
* @throws MWException Failure to parse XML input
+ * @param string $input
* @return bool
*/
function readDump( $input ) {
$this->lastName = "";
$this->thisPage = 0;
$this->thisRev = 0;
+ $this->thisRevModel = null;
+ $this->thisRevFormat = null;
$parser = xml_parser_create( "UTF-8" );
xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );
'XML import parse failure',
xml_get_current_line_number( $parser ),
xml_get_current_column_number( $parser ),
- $byte . ( is_null( $chunk ) ? null : ( '; "' . substr( $chunk, $byte -$offset, 16 ) . '"' ) ),
+ $byte . ( is_null( $chunk ) ? null : ( '; "' . substr( $chunk, $byte - $offset, 16 ) . '"' ) ),
xml_error_string( xml_get_error_code( $parser ) ) )->escaped();
xml_parser_free( $parser );
# there's no pageID 0 so we use that. the caller is responsible
# for deciding what to do with a file containing only the
# siteinfo information and the mw tags.
- if ( ! $this->firstPageWritten ) {
+ if ( !$this->firstPageWritten ) {
$firstPageID = str_pad( 0, 9, "0", STR_PAD_LEFT );
$lastPageID = str_pad( 0, 9, "0", STR_PAD_LEFT );
} else {
return true;
}
+ /**
+ * Applies applicable export transformations to $text.
+ *
+ * Best-effort: if no ContentHandler can be obtained for $model, the
+ * error is reported via progress() and $text is returned unchanged.
+ *
+ * @param string $text The revision text to transform
+ * @param string $model Content model ID used to look up the ContentHandler
+ * @param string|null $format Content format passed to the handler's exportTransform
+ *
+ * @return string The transformed text, or the original $text if the handler lookup failed
+ */
+ private function exportTransform( $text, $model, $format = null ) {
+ try {
+ $handler = ContentHandler::getForModelID( $model );
+ $text = $handler->exportTransform( $text, $format );
+ }
+ catch ( MWException $ex ) {
+ // Log and fall through with the untransformed text rather than aborting the dump.
+ $this->progress(
+ "Unable to apply export transformation for content model '$model': " .
+ $ex->getMessage()
+ );
+ }
+
+ return $text;
+ }
+
/**
* Tries to get the revision text for a revision id.
+ * Export transformations are applied if the content model is given or can be
+ * determined from the database.
*
* Upon errors, retries (Up to $this->maxFailures tries each call).
* If still no good revision get could be found even after this retrying, "" is returned.
* is thrown.
*
* @param string $id The revision id to get the text for
+ * @param string|bool|null $model The content model used to determine applicable export transformations.
+ * If $model is null, it will be determined from the database.
+ * @param string|null $format The content format used when applying export transformations.
*
- * @return string The revision text for $id, or ""
* @throws MWException
+ * @return string The revision text for $id, or ""
*/
- function getText( $id ) {
+ function getText( $id, $model = null, $format = null ) {
global $wgContentHandlerUseDB;
$prefetchNotTried = true; // Whether or not we already tried to get the text via prefetch.
$oldConsecutiveFailedTextRetrievals = $consecutiveFailedTextRetrievals;
$consecutiveFailedTextRetrievals = 0;
+ if ( $model === null && $wgContentHandlerUseDB ) {
+ $row = $this->db->selectRow(
+ 'revision',
+ array( 'rev_content_model', 'rev_content_format' ),
+ array( 'rev_id' => $this->thisRev ),
+ __METHOD__
+ );
+
+ if ( $row ) {
+ $model = $row->rev_content_model;
+ $format = $row->rev_content_format;
+ }
+ }
+
+ if ( $model === null || $model === '' ) {
+ $model = false;
+ }
+
while ( $failures < $this->maxFailures ) {
// As soon as we found a good text for the $id, we will return immediately.
$tryIsPrefetch = true;
$text = $this->prefetch->prefetch( intval( $this->thisPage ),
intval( $this->thisRev ) );
+
if ( $text === null ) {
$text = false;
}
+
+ if ( is_string( $text ) && $model !== false ) {
+ // Apply export transformation to text coming from an old dump.
+ // The purpose of this transformation is to convert up from legacy
+ // formats, which may still be used in the older dump that is used
+ // for pre-fetching. Applying the transformation again should not
+ // interfere with content that is already in the correct form.
+ $text = $this->exportTransform( $text, $model, $format );
+ }
}
if ( $text === false ) {
$text = $this->getTextDb( $id );
}
+ if ( $text !== false && $model !== false ) {
+ // Apply export transformation to text coming from the database.
+ // Prefetched text should already have transformations applied.
+ $text = $this->exportTransform( $text, $model, $format );
+ }
+
// No more checks for texts from DB for now.
// If we received something that is not false,
// We treat it as good text, regardless of whether it actually is or is not
// Step 2: Checking for plausibility and return the text if it is
// plausible
$revID = intval( $this->thisRev );
- if ( ! isset( $this->db ) ) {
+ if ( !isset( $this->db ) ) {
throw new MWException( "No database available" );
}
- $revLength = strlen( $text );
- if ( $wgContentHandlerUseDB ) {
- $row = $this->db->selectRow(
- 'revision',
- array( 'rev_len', 'rev_content_model' ),
- array( 'rev_id' => $revID ),
- __METHOD__
- );
- if ( $row ) {
- // only check the length for the wikitext content handler,
- // it's a wasted (and failed) check otherwise
- if ( $row->rev_content_model == CONTENT_MODEL_WIKITEXT ) {
- $revLength = $row->rev_len;
- }
- }
-
- }
- else {
+ if ( $model !== CONTENT_MODEL_WIKITEXT ) {
+ $revLength = strlen( $text );
+ } else {
$revLength = $this->db->selectField( 'revision', 'rev_len', array( 'rev_id' => $revID ) );
}
if ( $tryIsPrefetch ) {
$this->prefetchCount++;
}
+
return $text;
}
$text = false;
throw new MWException( "Received text is unplausible for id " . $id );
-
} catch ( Exception $e ) {
$msg = "getting/checking text " . $id . " failed (" . $e->getMessage() . ")";
if ( $failures + 1 < $this->maxFailures ) {
$failures++;
// A failure in a prefetch hit does not warrant resetting db connection etc.
- if ( ! $tryIsPrefetch ) {
+ if ( !$tryIsPrefetch ) {
// After backing off for some time, we try to reboot the whole process as
// much as possible to not carry over failures from one part to the other
// parts
*/
private function getTextDb( $id ) {
global $wgContLang;
- if ( ! isset( $this->db ) ) {
+ if ( !isset( $this->db ) ) {
throw new MWException( __METHOD__ . "No database available" );
}
$row = $this->db->selectRow( 'text',
}
$stripped = str_replace( "\r", "", $text );
$normalized = $wgContLang->normalize( $stripped );
+
return $normalized;
}
}
$text = $this->getTextSpawnedOnce( $id );
wfRestoreWarnings();
+
return $text;
}
"$IP/../multiversion/MWScript.php",
"fetchText.php",
'--wiki', wfWikiID() ) ) );
- }
- else {
+ } else {
$cmd = implode( " ",
array_map( 'wfEscapeShellArg',
array(
if ( !$this->spawnProc ) {
// shit
$this->progress( "Subprocess spawn failed." );
+
return false;
}
list(
$this->spawnWrite, // -> stdin
- $this->spawnRead, // <- stdout
+ $this->spawnRead, // <- stdout
) = $pipes;
return true;
$gotbytes = strlen( $text );
if ( $gotbytes != $nbytes ) {
$this->progress( "Expected $nbytes bytes from database subprocess, got $gotbytes " );
+
return false;
}
// Do normalization in the dump thread...
$stripped = str_replace( "\r", "", $text );
$normalized = $wgContLang->normalize( $stripped );
+
return $normalized;
}
}
if ( $name == "text" && isset( $attribs['id'] ) ) {
- $text = $this->getText( $attribs['id'] );
+ $id = $attribs['id'];
+ $model = trim( $this->thisRevModel );
+ $format = trim( $this->thisRevFormat );
+
+ $model = $model === '' ? null : $model;
+ $format = $format === '' ? null : $format;
+
+ $text = $this->getText( $id, $model, $format );
$this->openElement = array( $name, array( 'xml:space' => 'preserve' ) );
if ( strlen( $text ) > 0 ) {
$this->characterData( $parser, $text );
$this->egress->writeRevision( null, $this->buffer );
$this->buffer = "";
$this->thisRev = "";
+ $this->thisRevModel = null;
+ $this->thisRevFormat = null;
} elseif ( $name == 'page' ) {
- if ( ! $this->firstPageWritten ) {
+ if ( !$this->firstPageWritten ) {
$this->firstPageWritten = trim( $this->thisPage );
}
$this->lastPageWritten = trim( $this->thisPage );
$this->buffer = "";
$this->thisPage = "";
}
-
} elseif ( $name == 'mediawiki' ) {
$this->egress->writeCloseStream( $this->buffer );
$this->buffer = "";
$this->thisPage .= $data;
}
}
+ elseif ( $this->lastName == "model" ) {
+ $this->thisRevModel .= $data;
+ }
+ elseif ( $this->lastName == "format" ) {
+ $this->thisRevFormat .= $data;
+ }
+
// have to skip the newline left over from closepagetag line of
// end of checkpoint files. nasty hack!!
if ( $this->checkpointJustWritten ) {