From b1bfcaeca42cdcf19fc9b334053cf212d13cbc76 Mon Sep 17 00:00:00 2001 From: Chad Horohoe Date: Wed, 20 Jul 2011 23:06:24 +0000 Subject: [PATCH] Revert r85034, r81186, r77638: per CR on r77638: XMLReader and XMLWriter are memory-hungry beasts and this script OOMs constantly. This was already reverted in 1.17wmf1 (r82930) by Ariel. Doing the same in trunk now. I'm not opposed to using the new pretty XML* classes, but somebody needs to debug why they eat memory for breakfast, lunch and dinner. --- maintenance/dumpTextPass.php | 180 +++++++++++++++++++++-------------- 1 file changed, 110 insertions(+), 70 deletions(-) diff --git a/maintenance/dumpTextPass.php b/maintenance/dumpTextPass.php index 109584cc13..187a38c49d 100644 --- a/maintenance/dumpTextPass.php +++ b/maintenance/dumpTextPass.php @@ -2,7 +2,7 @@ /** * Script that postprocesses XML dumps from dumpBackup.php to add page text * - * Copyright © 2005 Brion Vibber , 2010 Alexandre Emsenhuber + * Copyright (C) 2005 Brion Vibber * http://www.mediawiki.org/ * * This program is free software; you can redistribute it and/or modify @@ -35,6 +35,7 @@ require_once( 'backup.inc' ); class TextPassDumper extends BackupDumper { var $prefetch = null; var $input = "php://stdin"; + var $history = WikiExporter::FULL; var $fetchCount = 0; var $prefetchCount = 0; var $lastTime = 0; @@ -73,11 +74,18 @@ class TextPassDumper extends BackupDumper { if ( ini_get( 'display_errors' ) ) ini_set( 'display_errors', 'stderr' ); - $this->initProgress( $history ); + $this->initProgress( $this->history ); $this->db = $this->backupDb(); - $this->readDump(); + $this->egress = new ExportProgressFilter( $this->sink, $this ); + + $input = fopen( $this->input, "rt" ); + $result = $this->readDump( $input ); + + if ( WikiError::isError( $result ) ) { + wfDie( $result->getMessage() ); + } if ( $this->spawnProc ) { $this->closeSpawn(); @@ -98,6 +106,12 @@ class TextPassDumper extends BackupDumper { case 'stub': $this->input = $url; break; + case 
'current': + $this->history = WikiExporter::CURRENT; + break; + case 'full': + $this->history = WikiExporter::FULL; + break; case 'spawn': $this->spawn = true; if ( $val ) { @@ -190,76 +204,34 @@ class TextPassDumper extends BackupDumper { } } - function readDump() { - $state = ''; - $lastName = ''; + function readDump( $input ) { + $this->buffer = ""; + $this->openElement = false; + $this->atStart = true; + $this->state = ""; + $this->lastName = ""; $this->thisPage = 0; $this->thisRev = 0; - $reader = new XMLReader(); - $reader->open( $this->input ); - $writer = new XMLWriter(); - $writer->openMemory(); - - - while ( $reader->read() ) { - $tag = $reader->name; - $type = $reader->nodeType; - - if ( $type == XmlReader::END_ELEMENT ) { - $writer->endElement(); - - if ( $tag == 'revision' ) { - $this->revCount(); - $this->thisRev = ''; - } elseif ( $tag == 'page' ) { - $this->reportPage(); - $this->thisPage = ''; - } - } elseif ( $type == XmlReader::ELEMENT ) { - $attribs = array(); - if ( $reader->hasAttributes ) { - for ( $i = 0; $reader->moveToAttributeNo( $i ); $i++ ) { - $attribs[$reader->name] = $reader->value; - } - } + $parser = xml_parser_create( "UTF-8" ); + xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false ); - if ( $reader->isEmptyElement && $tag == 'text' && isset( $attribs['id'] ) ) { - $writer->startElement( 'text' ); - $writer->writeAttribute( 'xml:space', 'preserve' ); - $text = $this->getText( $attribs['id'] ); - if ( strlen( $text ) ) { - $writer->text( $text ); - } - $writer->endElement(); - } else { - $writer->startElement( $tag ); - foreach( $attribs as $name => $val ) { - $writer->writeAttribute( $name, $val ); - } - if ( $reader->isEmptyElement ) { - $writer->endElement(); - } - } + xml_set_element_handler( $parser, array( &$this, 'startElement' ), array( &$this, 'endElement' ) ); + xml_set_character_data_handler( $parser, array( &$this, 'characterData' ) ); - $lastName = $tag; - if ( $tag == 'revision' ) { - $state = 'revision'; - 
} elseif ( $tag == 'page' ) { - $state = 'page'; - } - } elseif ( $type == XMLReader::SIGNIFICANT_WHITESPACE || $type == XMLReader::TEXT ) { - if ( $lastName == 'id' ) { - if ( $state == 'revision' ) { - $this->thisRev .= $reader->value; - } elseif ( $state == 'page' ) { - $this->thisPage .= $reader->value; - } - } - $writer->text( $reader->value ); + $offset = 0; // for context extraction on error reporting + $bufferSize = 512 * 1024; + do { + $chunk = fread( $input, $bufferSize ); + if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) { + wfDebug( "TextDumpPass::readDump encountered XML parsing error\n" ); + return new WikiXmlError( $parser, 'XML import parse failure', $chunk, $offset ); } - $this->sink->write( $writer->outputMemory() ); - } + $offset += strlen( $chunk ); + } while ( $chunk !== false && !feof( $input ) ); + xml_parser_free( $parser ); + + return true; } function getText( $id ) { @@ -282,6 +254,7 @@ class TextPassDumper extends BackupDumper { } private function doGetText( $id ) { + $id = intval( $id ); $this->failures = 0; $ex = new MWException( "Graceful storage failure" ); @@ -469,13 +442,81 @@ class TextPassDumper extends BackupDumper { $normalized = $wgContLang->normalize( $stripped ); return $normalized; } + + function startElement( $parser, $name, $attribs ) { + $this->clearOpenElement( null ); + $this->lastName = $name; + + if ( $name == 'revision' ) { + $this->state = $name; + $this->egress->writeOpenPage( null, $this->buffer ); + $this->buffer = ""; + } elseif ( $name == 'page' ) { + $this->state = $name; + if ( $this->atStart ) { + $this->egress->writeOpenStream( $this->buffer ); + $this->buffer = ""; + $this->atStart = false; + } + } + + if ( $name == "text" && isset( $attribs['id'] ) ) { + $text = $this->getText( $attribs['id'] ); + $this->openElement = array( $name, array( 'xml:space' => 'preserve' ) ); + if ( strlen( $text ) > 0 ) { + $this->characterData( $parser, $text ); + } + } else { + $this->openElement = array( $name, $attribs 
); + } + } + + function endElement( $parser, $name ) { + if ( $this->openElement ) { + $this->clearOpenElement( "" ); + } else { + $this->buffer .= "</$name>"; + } + + if ( $name == 'revision' ) { + $this->egress->writeRevision( null, $this->buffer ); + $this->buffer = ""; + $this->thisRev = ""; + } elseif ( $name == 'page' ) { + $this->egress->writeClosePage( $this->buffer ); + $this->buffer = ""; + $this->thisPage = ""; + } elseif ( $name == 'mediawiki' ) { + $this->egress->writeCloseStream( $this->buffer ); + $this->buffer = ""; + } + } + + function characterData( $parser, $data ) { + $this->clearOpenElement( null ); + if ( $this->lastName == "id" ) { + if ( $this->state == "revision" ) { + $this->thisRev .= $data; + } elseif ( $this->state == "page" ) { + $this->thisPage .= $data; + } + } + $this->buffer .= htmlspecialchars( $data ); + } + + function clearOpenElement( $style ) { + if ( $this->openElement ) { + $this->buffer .= Xml::element( $this->openElement[0], $this->openElement[1], $style ); + $this->openElement = false; + } + } } $dumper = new TextPassDumper( $argv ); if ( !isset( $options['help'] ) ) { - $dumper->dump( WikiExporter::FULL ); + $dumper->dump( true ); } else { $dumper->progress( <<: To load a compressed stub dump instead of stdin --prefetch=<type>:<file> Use a prior dump file as a text source, to save pressure on the database. + (Requires the XMLReader extension) --quiet Don't dump status reports to stderr. --report=n Report position and speed after every n pages processed. (Default: 100) --server=h Force reading from MySQL server h - --output=<type>:<file> Write to a file instead of stdout - <type>s: file, gzip, bzip2, 7zip --current Base ETA on number of pages in database instead of all revisions --spawn Spawn a subprocess for loading text records --help Display this help message -- 2.20.1