3 * Helper class for the --prefetch option of dumpTextPass.php
10 * Readahead helper for making large MediaWiki data dumps;
11 * reads in a previous XML dump to sequentially prefetch text
12 * records already normalized and decompressed.
14 * This can save load on the external database servers, hopefully.
16 * Assumes that dumps will be recorded in the canonical order:
17 * - ascending by page_id
18 * - ascending by rev_id within each page
19 * - text contents are immutable and should not change once
20 * recorded, so the previous dump is a reliable source
22 * @ingroup Maintenance
27 var $atPageEnd = false;
/**
 * Opens a previous XML dump for sequential reading.
 *
 * XMLReader::open() reports failure through its return value. When the
 * dump cannot be opened, the stream is flagged as finished via
 * $this->atEnd, which the reading methods of this class test before
 * touching the reader.
 *
 * @param $infile String: path or URI of the XML dump to read
 */
function __construct( $infile ) {
	$this->reader = new XMLReader();
	if ( !$this->reader->open( $infile ) ) {
		// Nothing can be read from an unopened reader.
		$this->atEnd = true;
	}
}

/**
 * PHP 4 style constructor name. PHP 8 no longer treats a method named
 * after its class as a constructor, so the real work lives in
 * __construct(); this method remains so that explicit calls by the old
 * name keep working.
 *
 * @param $infile String: path or URI of the XML dump to read
 */
function BaseDump( $infile ) {
	$this->__construct( $infile );
}
37 * Attempts to fetch the text of a particular page revision
38 * from the dump stream. May return null if the page or revision is not present in the dump.
41 * @param $page Integer: ID number of page to read
42 * @param $rev Integer: ID number of revision to read
43 * @return string or null
45 function prefetch( $page, $rev ) {
46 $page = intval( $page );
47 $rev = intval( $rev );
48 while ( $this->lastPage < $page && !$this->atEnd ) {
49 $this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
52 if ( $this->lastPage > $page || $this->atEnd ) {
53 $this->debug( "BaseDump::prefetch already past page $page looking for rev $rev [$this->lastPage, $this->lastRev]" );
56 while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
57 $this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, looking for $page, $rev" );
60 if ( $this->lastRev == $rev && !$this->atEnd ) {
61 $this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );
62 return $this->nextText();
64 $this->debug( "BaseDump::prefetch already past rev $rev on page $page [$this->lastPage, $this->lastRev]" );
/**
 * Sends a diagnostic message to wfDebug(), terminated with a newline.
 *
 * @param $str String: message text, without a trailing newline
 */
function debug( $str ) {
	$line = $str . "\n";
	wfDebug( $line );
	// Reporting through the dumper's progress output is disabled:
	// $dumper->progress( $str );
}
79 if ( $this->skipTo( 'page', 'mediawiki' ) ) {
80 if ( $this->skipTo( 'id' ) ) {
81 $this->lastPage = intval( $this->nodeContents() );
83 $this->atPageEnd = false;
94 if ( $this->skipTo( 'revision' ) ) {
95 if ( $this->skipTo( 'id' ) ) {
96 $this->lastRev = intval( $this->nodeContents() );
99 $this->atPageEnd = true;
/**
 * Seeks forward to the next <text> element and returns its contents.
 *
 * The search uses skipTo()'s default parent element ('page'). The result
 * of skipTo() is not checked; whatever nodeContents() yields afterwards
 * is converted to a string.
 *
 * @return string
 */
function nextText() {
	$this->skipTo( 'text' );
	$contents = $this->nodeContents();
	return (string)$contents;
}
114 function skipTo( $name, $parent = 'page' ) {
115 if ( $this->atEnd ) {
118 while ( $this->reader->read() ) {
119 if ( $this->reader->nodeType == XMLReader::ELEMENT &&
120 $this->reader->name == $name ) {
123 if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
124 $this->reader->name == $parent ) {
125 $this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );
129 return $this->close();
133 * Shouldn't something like this be built-in to XMLReader?
134 * Fetches text contents of the current element, assuming
135 * no sub-elements or such scary things.
140 function nodeContents() {
141 if ( $this->atEnd ) {
144 if ( $this->reader->isEmptyElement ) {
148 while ( $this->reader->read() ) {
149 switch( $this->reader->nodeType ) {
150 case XMLReader::TEXT:
151 // case XMLReader::WHITESPACE:
152 case XMLReader::SIGNIFICANT_WHITESPACE:
153 $buffer .= $this->reader->value;
155 case XMLReader::END_ELEMENT:
159 return $this->close();
166 $this->reader->close();