From: Alexandre Emsenhuber Date: Fri, 3 Dec 2010 10:05:01 +0000 (+0000) Subject: Modified dumpTextPass.php to use XMLReader and XMLWriter rather than xml_* functions X-Git-Tag: 1.31.0-rc.0~33621 X-Git-Url: http://git.cyclocoop.org/%7B%24admin_url%7Dmes_infos.php?a=commitdiff_plain;h=6b190f772eeaddcc97256356d0307ed431dc73d0;p=lhc%2Fweb%2Fwiklou.git Modified dumpTextPass.php to use XMLReader and XMLWriter rather than xml_* functions --- diff --git a/maintenance/dumpTextPass.php b/maintenance/dumpTextPass.php index 3a06cce56b..6dcf6608eb 100644 --- a/maintenance/dumpTextPass.php +++ b/maintenance/dumpTextPass.php @@ -2,7 +2,7 @@ /** * Script that postprocesses XML dumps from dumpBackup.php to add page text * - * Copyright (C) 2005 Brion Vibber + * Copyright © 2005 Brion Vibber , 2010 Alexandre Emsenhuber * http://www.mediawiki.org/ * * This program is free software; you can redistribute it and/or modify @@ -35,7 +35,6 @@ require_once( 'backup.inc' ); class TextPassDumper extends BackupDumper { var $prefetch = null; var $input = "php://stdin"; - var $history = WikiExporter::FULL; var $fetchCount = 0; var $prefetchCount = 0; @@ -61,18 +60,11 @@ class TextPassDumper extends BackupDumper { if ( ini_get( 'display_errors' ) ) ini_set( 'display_errors', 'stderr' ); - $this->initProgress( $this->history ); + $this->initProgress( $history ); $this->db = $this->backupDb(); - $this->egress = new ExportProgressFilter( $this->sink, $this ); - - $input = fopen( $this->input, "rt" ); - $result = $this->readDump( $input ); - - if ( WikiError::isError( $result ) ) { - wfDie( $result->getMessage() ); - } + $this->readDump(); if ( $this->spawnProc ) { $this->closeSpawn(); @@ -93,12 +85,6 @@ class TextPassDumper extends BackupDumper { case 'stub': $this->input = $url; break; - case 'current': - $this->history = WikiExporter::CURRENT; - break; - case 'full': - $this->history = WikiExporter::FULL; - break; case 'spawn': $this->spawn = true; if ( $val ) { @@ -152,34 +138,76 @@ class
TextPassDumper extends BackupDumper { } } - function readDump( $input ) { - $this->buffer = ""; - $this->openElement = false; - $this->atStart = true; - $this->state = ""; - $this->lastName = ""; + function readDump() { + $state = ''; + $lastName = ''; $this->thisPage = 0; $this->thisRev = 0; - $parser = xml_parser_create( "UTF-8" ); - xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false ); + $reader = new XMLReader(); + $reader->open( $this->input ); + $writer = new XMLWriter(); + $writer->openURI( 'php://stdout' ); - xml_set_element_handler( $parser, array( &$this, 'startElement' ), array( &$this, 'endElement' ) ); - xml_set_character_data_handler( $parser, array( &$this, 'characterData' ) ); - $offset = 0; // for context extraction on error reporting - $bufferSize = 512 * 1024; - do { - $chunk = fread( $input, $bufferSize ); - if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) { - wfDebug( "TextDumpPass::readDump encountered XML parsing error\n" ); - return new WikiXmlError( $parser, 'XML import parse failure', $chunk, $offset ); - } - $offset += strlen( $chunk ); - } while ( $chunk !== false && !feof( $input ) ); - xml_parser_free( $parser ); + while ( $reader->read() ) { + $tag = $reader->name; + $type = $reader->nodeType; - return true; + if ( $type == XmlReader::END_ELEMENT ) { + $writer->endElement(); + + if ( $tag == 'revision' ) { + $this->revCount(); + $this->thisRev = ''; + } elseif ( $tag == 'page' ) { + $this->reportPage(); + $this->thisPage = ''; + } + } elseif ( $type == XmlReader::ELEMENT ) { + $attribs = array(); + if ( $reader->hasAttributes ) { + for ( $i = 0; $reader->moveToAttributeNo( $i ); $i++ ) { + $attribs[$reader->name] = $reader->value; + } + } + + if ( $reader->isEmptyElement && $tag == 'text' && isset( $attribs['id'] ) ) { + $writer->startElement( 'text' ); + $writer->writeAttribute( 'xml:space', 'preserve' ); + $text = $this->getText( $attribs['id'] ); + if ( strlen( $text ) ) { + $writer->text( $text ); + } + 
$writer->endElement(); + } else { + $writer->startElement( $tag ); + foreach( $attribs as $name => $val ) { + $writer->writeAttribute( $name, $val ); + } + if ( $reader->isEmptyElement ) { + $writer->endElement(); + } + } + + $lastName = $tag; + if ( $tag == 'revision' ) { + $state = 'revision'; + } elseif ( $tag == 'page' ) { + $state = 'page'; + } + } elseif ( $type == XMLReader::SIGNIFICANT_WHITESPACE || $type == XMLReader::TEXT ) { + if ( $lastName == 'id' ) { + if ( $state == 'revision' ) { + $this->thisRev .= $reader->value; + } elseif ( $state == 'page' ) { + $this->thisPage .= $reader->value; + } + } + $writer->text( $reader->value ); + } + } + $writer->flush(); } function getText( $id ) { @@ -207,7 +235,6 @@ class TextPassDumper extends BackupDumper { } private function doGetText( $id ) { - $id = intval( $id ); $this->failures = 0; $ex = new MWException( "Graceful storage failure" ); @@ -395,81 +422,13 @@ class TextPassDumper extends BackupDumper { $normalized = $wgContLang->normalize( $stripped ); return $normalized; } - - function startElement( $parser, $name, $attribs ) { - $this->clearOpenElement( null ); - $this->lastName = $name; - - if ( $name == 'revision' ) { - $this->state = $name; - $this->egress->writeOpenPage( null, $this->buffer ); - $this->buffer = ""; - } elseif ( $name == 'page' ) { - $this->state = $name; - if ( $this->atStart ) { - $this->egress->writeOpenStream( $this->buffer ); - $this->buffer = ""; - $this->atStart = false; - } - } - - if ( $name == "text" && isset( $attribs['id'] ) ) { - $text = $this->getText( $attribs['id'] ); - $this->openElement = array( $name, array( 'xml:space' => 'preserve' ) ); - if ( strlen( $text ) > 0 ) { - $this->characterData( $parser, $text ); - } - } else { - $this->openElement = array( $name, $attribs ); - } - } - - function endElement( $parser, $name ) { - if ( $this->openElement ) { - $this->clearOpenElement( "" ); - } else { - $this->buffer .= ""; - } - - if ( $name == 'revision' ) { -
$this->egress->writeRevision( null, $this->buffer ); - $this->buffer = ""; - $this->thisRev = ""; - } elseif ( $name == 'page' ) { - $this->egress->writeClosePage( $this->buffer ); - $this->buffer = ""; - $this->thisPage = ""; - } elseif ( $name == 'mediawiki' ) { - $this->egress->writeCloseStream( $this->buffer ); - $this->buffer = ""; - } - } - - function characterData( $parser, $data ) { - $this->clearOpenElement( null ); - if ( $this->lastName == "id" ) { - if ( $this->state == "revision" ) { - $this->thisRev .= $data; - } elseif ( $this->state == "page" ) { - $this->thisPage .= $data; - } - } - $this->buffer .= htmlspecialchars( $data ); - } - - function clearOpenElement( $style ) { - if ( $this->openElement ) { - $this->buffer .= Xml::element( $this->openElement[0], $this->openElement[1], $style ); - $this->openElement = false; - } - } } $dumper = new TextPassDumper( $argv ); if ( !isset( $options['help'] ) ) { - $dumper->dump( true ); + $dumper->dump( WikiExporter::FULL ); } else { $dumper->progress( <<: To load a compressed stub dump instead of stdin --prefetch=: Use a prior dump file as a text source, to save pressure on the database. - (Requires the XMLReader extension) --quiet Don't dump status reports to stderr. --report=n Report position and speed after every n pages processed. (Default: 100)