Followup r77748, throw NEW MWException
[lhc/web/wiklou.git] / maintenance / backupPrefetch.inc
1 <?php
2 /**
3 * Helper class for the --prefetch option of dumpTextPass.php
4 *
5 * @file
6 * @ingroup Maintenance
7 */
8
9 /**
10 * Readahead helper for making large MediaWiki data dumps;
11 * reads in a previous XML dump to sequentially prefetch text
12 * records already normalized and decompressed.
13 *
14 * This can save load on the external database servers, hopefully.
15 *
16 * Assumes that dumps will be recorded in the canonical order:
17 * - ascending by page_id
18 * - ascending by rev_id within each page
19 * - text contents are immutable and should not change once
20 * recorded, so the previous dump is a reliable source
21 *
22 * @ingroup Maintenance
23 */
class BaseDump {
	/** @var XMLReader|null Streaming reader over the previous XML dump */
	public $reader = null;

	/** @var bool True once the dump is exhausted or could not be opened */
	public $atEnd = false;

	/** @var bool True once the closing tag of the current page has been consumed */
	public $atPageEnd = false;

	/** @var int ID of the page the reader is currently positioned in (0 before the first page) */
	public $lastPage = 0;

	/** @var int ID of the last revision read in the current page (0 if none yet) */
	public $lastRev = 0;

	/**
	 * Opens the previous dump for sequential reading.
	 *
	 * If the file cannot be opened the object is marked as exhausted, so
	 * every prefetch() call returns null instead of reading from an
	 * unopened reader.
	 *
	 * @param $infile String: path or URI of the previous XML dump
	 */
	public function __construct( $infile ) {
		$this->reader = new XMLReader();
		// XMLReader::open() reports failure through a false return value.
		if ( !$this->reader->open( $infile ) ) {
			$this->debug( "BaseDump::__construct could not open $infile" );
			$this->atEnd = true;
		}
	}

	/**
	 * Old-style constructor name, kept so that existing explicit calls
	 * such as parent::BaseDump( $infile ) keep working. PHP 8 no longer
	 * treats a method named after its class as a constructor, so the real
	 * initialisation lives in __construct().
	 *
	 * @deprecated Use __construct()
	 * @param $infile String: path or URI of the previous XML dump
	 */
	public function BaseDump( $infile ) {
		$this->__construct( $infile );
	}

	/**
	 * Attempts to fetch the text of a particular page revision
	 * from the dump stream. May return null if the page is
	 * unavailable.
	 *
	 * The dump is read strictly forwards: requests must arrive in
	 * ascending page/revision order, and each revision's text can be
	 * fetched at most once.
	 *
	 * @param $page Integer: ID number of page to read
	 * @param $rev Integer: ID number of revision to read
	 * @return string or null
	 */
	public function prefetch( $page, $rev ) {
		$page = intval( $page );
		$rev = intval( $rev );
		while ( $this->lastPage < $page && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
			$this->nextPage();
		}
		if ( $this->lastPage > $page || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past page $page looking for rev $rev [$this->lastPage, $this->lastRev]" );
			return null;
		}
		while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, looking for $page, $rev" );
			$this->nextRev();
		}
		// Once the page's closing tag has been consumed the reader is no
		// longer inside the matching revision, so reading on would return
		// text belonging to a later page.
		if ( $this->lastRev == $rev && !$this->atEnd && !$this->atPageEnd ) {
			$this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );
			return $this->nextText();
		} else {
			$this->debug( "BaseDump::prefetch already past rev $rev on page $page [$this->lastPage, $this->lastRev]" );
			return null;
		}
	}

	/**
	 * Sends a diagnostic message to the debug log.
	 *
	 * @param $str String: message, without trailing newline
	 */
	public function debug( $str ) {
		wfDebug( $str . "\n" );
		// global $dumper;
		// $dumper->progress( $str );
	}

	/**
	 * Advances the reader to the next <page> element and records its ID.
	 * Marks the dump as exhausted (and releases the reader) when no
	 * further page exists.
	 *
	 * @access private
	 */
	public function nextPage() {
		if ( $this->skipTo( 'page', 'mediawiki' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastPage = intval( $this->nodeContents() );
				$this->lastRev = 0;
				$this->atPageEnd = false;
			}
		} else {
			// Also closes the underlying stream instead of leaving it open.
			$this->close();
		}
	}

	/**
	 * Advances the reader to the next <revision> element of the current
	 * page and records its ID. Sets atPageEnd when the page has no further
	 * revision, or when the revision's <id> cannot be found before the
	 * page ends.
	 *
	 * @access private
	 */
	public function nextRev() {
		if ( $this->skipTo( 'revision' ) && $this->skipTo( 'id' ) ) {
			$this->lastRev = intval( $this->nodeContents() );
		} else {
			$this->atPageEnd = true;
		}
	}

	/**
	 * Reads the <text> element of the revision the reader is positioned in.
	 *
	 * The search stops at the revision's own closing tag, so a revision
	 * without a <text> element yields null rather than the text of the
	 * following revision.
	 *
	 * @return string or null if the revision has no readable text
	 * @access private
	 */
	public function nextText() {
		if ( !$this->skipTo( 'text', 'revision' ) ) {
			return null;
		}
		$text = $this->nodeContents();
		// nodeContents() gives null when the stream ends mid-element; pass
		// that on as "unavailable" instead of disguising it as empty text.
		return $text === null ? null : strval( $text );
	}

	/**
	 * Reads forward until an opening element named $name is found.
	 *
	 * @param $name String: element name to stop at
	 * @param $parent String: element whose closing tag aborts the search
	 * @return bool true if positioned on <$name>; false if </$parent> or
	 *   the end of the stream was reached first
	 * @access private
	 */
	public function skipTo( $name, $parent = 'page' ) {
		if ( $this->atEnd ) {
			return false;
		}
		while ( $this->reader->read() ) {
			if ( $this->reader->nodeType == XMLReader::ELEMENT &&
				$this->reader->name == $name ) {
				return true;
			}
			if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
				$this->reader->name == $parent ) {
				$this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );
				return false;
			}
		}
		// Stream exhausted (or unreadable): release it and report "not found".
		$this->close();
		return false;
	}

	/**
	 * Shouldn't something like this be built-in to XMLReader?
	 * Fetches text contents of the current element, assuming
	 * no sub-elements or such scary things.
	 *
	 * @return String, or null if the stream ended before the element closed
	 * @access private
	 */
	public function nodeContents() {
		if ( $this->atEnd ) {
			return null;
		}
		if ( $this->reader->isEmptyElement ) {
			return "";
		}
		$buffer = "";
		while ( $this->reader->read() ) {
			switch( $this->reader->nodeType ) {
			case XMLReader::TEXT:
			// CDATA sections are character data too; skipping them would
			// silently drop part of the element's content.
			case XMLReader::CDATA:
			// case XMLReader::WHITESPACE:
			case XMLReader::SIGNIFICANT_WHITESPACE:
				$buffer .= $this->reader->value;
				break;
			case XMLReader::END_ELEMENT:
				return $buffer;
			}
		}
		return $this->close();
	}

	/**
	 * Closes the reader and marks the dump as exhausted.
	 *
	 * @return null
	 * @access private
	 */
	public function close() {
		$this->reader->close();
		$this->atEnd = true;
		return null;
	}
}