Don't explode in dump text prefetch when we fall off the end of the file :D
[lhc/web/wiklou.git] / maintenance / backupPrefetch.inc
1 <?php
2
/**
 * Readahead helper for making large MediaWiki data dumps;
 * reads in a previous XML dump to sequentially prefetch text
 * records already normalized and decompressed.
 *
 * This can save load on the external database servers, hopefully.
 *
 * Assumes that dumps will be recorded in the canonical order:
 * - ascending by page_id
 * - ascending by rev_id within each page
 * - text contents are immutable and should not change once
 *   recorded, so the previous dump is a reliable source
 *
 * The reader only ever moves forward, so requests must arrive in that
 * same order; anything the stream has already passed is reported as a
 * miss (null) rather than an error.
 *
 * Requires PHP 5 with the XMLReader extension.
 */
class BaseDump {
	/** XMLReader positioned somewhere inside the previous dump. */
	public $reader = null;

	/** True once the dump is exhausted (or could not be opened). */
	public $atEnd = false;

	/** True once the closing tag of the current page has been reached. */
	public $atPageEnd = false;

	/** ID of the page the reader is currently inside (0 before the first page). */
	public $lastPage = 0;

	/** ID of the last revision read on the current page (0 if none yet). */
	public $lastRev = 0;

	/**
	 * Opens the previous dump for sequential reading.
	 *
	 * @param string $infile Path or URI of the previous XML dump
	 */
	public function __construct( $infile ) {
		$this->reader = new XMLReader();
		if ( !$this->reader->open( $infile ) ) {
			// Prefetching is only an optimization: an unreadable dump is
			// treated as an empty one, so every lookup is simply a miss.
			$this->atEnd = true;
		}
	}

	/**
	 * Attempts to fetch the text of a particular page revision
	 * from the dump stream. May return null if the page is
	 * unavailable.
	 *
	 * @param int $page ID number of page to read
	 * @param int $rev ID number of revision to read
	 * @return string or null
	 */
	public function prefetch( $page, $rev ) {
		$page = intval( $page );
		$rev = intval( $rev );

		// Advance page by page until we reach (or pass) the requested one.
		while ( $this->lastPage < $page && !$this->atEnd ) {
			$this->nextPage();
		}
		if ( $this->lastPage > $page || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past page $page looking for rev $rev\n" );
			return null;
		}

		// Advance revision by revision, but never beyond the end of this
		// page: reading on would consume the following page's revisions
		// while lastPage still names the current one.
		while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
			$this->nextRev();
		}
		if ( $this->lastRev == $rev && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch hit on $page, $rev\n" );
			return $this->nextText();
		}

		$this->debug( "BaseDump::prefetch already past rev $rev on page $page\n" );
		return null;
	}

	/**
	 * Sends a diagnostic message to the debug log via wfDebug().
	 *
	 * @param string $str Message text
	 */
	public function debug( $str ) {
		wfDebug( $str );
	}

	/**
	 * Moves to the next <page> element and records its ID.
	 * Marks the dump as finished when no further page exists.
	 * Internal helper.
	 *
	 * @access private
	 */
	public function nextPage() {
		if ( !$this->skipTo( 'page', 'mediawiki' ) ) {
			// Either the file ran out or </mediawiki> was reached;
			// both mean there are no more pages to offer.
			if ( !$this->atEnd ) {
				$this->close();
			}
			return;
		}
		$this->lastRev = 0;
		$this->atPageEnd = false;
		if ( $this->skipTo( 'id', 'page' ) ) {
			$id = $this->nodeContents();
			// Only trust the ID if it was read completely.
			if ( $id !== false ) {
				$this->lastPage = intval( $id );
			}
		} else {
			// </page> arrived before any <id>: nothing more to read here.
			$this->atPageEnd = true;
		}
	}

	/**
	 * Moves to the next <revision> element of the current page and
	 * records its ID. Sets atPageEnd when the page has no further
	 * revisions. Internal helper.
	 *
	 * @access private
	 */
	public function nextRev() {
		if ( !$this->skipTo( 'revision', 'page' ) ) {
			$this->atPageEnd = true;
			return;
		}
		if ( $this->skipTo( 'id', 'revision' ) ) {
			$id = $this->nodeContents();
			if ( $id !== false ) {
				$this->lastRev = intval( $id );
			}
		}
	}

	/**
	 * Reads the <text> element of the current revision.
	 * Internal helper.
	 *
	 * @return string or null if the revision has no readable text
	 * @access private
	 */
	public function nextText() {
		// Stop at </revision> so a revision lacking <text> cannot be
		// answered with the text of the revision that follows it.
		if ( !$this->skipTo( 'text', 'revision' ) ) {
			return null;
		}
		$text = $this->nodeContents();
		if ( $text === false ) {
			return null;
		}
		return strval( $text );
	}

	/**
	 * Reads forward until an opening tag called $name is found.
	 * Gives up early when the closing tag of $parent is seen, leaving
	 * the reader on that closing tag. Running out of input closes the
	 * reader and sets atEnd. Internal helper.
	 *
	 * @param string $name Element name to look for
	 * @param string $parent Element whose closing tag ends the search
	 * @return bool true if positioned on the requested element
	 * @access private
	 */
	public function skipTo( $name, $parent = 'page' ) {
		if ( $this->atEnd ) {
			return false;
		}
		while ( $this->reader->read() ) {
			$type = $this->reader->nodeType;
			if ( $type == XMLReader::ELEMENT && $this->reader->name === $name ) {
				return true;
			}
			if ( $type == XMLReader::END_ELEMENT && $this->reader->name === $parent ) {
				return false;
			}
		}
		return $this->close();
	}

	/**
	 * Shouldn't something like this be built-in to XMLReader?
	 * Fetches text contents of the current element, assuming
	 * no sub-elements or such scary things.
	 *
	 * @return string, or false if the input ended inside the element
	 * @access private
	 */
	public function nodeContents() {
		if ( $this->atEnd ) {
			return false;
		}
		if ( $this->reader->isEmptyElement ) {
			return "";
		}
		$buffer = "";
		while ( $this->reader->read() ) {
			switch ( $this->reader->nodeType ) {
				case XMLReader::TEXT:
				case XMLReader::SIGNIFICANT_WHITESPACE:
					$buffer .= $this->reader->value;
					break;
				case XMLReader::END_ELEMENT:
					// The first closing tag is taken to be our own.
					return $buffer;
			}
		}
		return $this->close();
	}

	/**
	 * Releases the reader and marks the dump as exhausted.
	 * Internal helper.
	 *
	 * @return bool always false, so callers can "return $this->close()"
	 * @access private
	 */
	public function close() {
		$this->reader->close();
		$this->atEnd = true;
		return false;
	}
}
145
146 ?>