Resolve stubs which may be broken by running RCT. Added a --start option. Not tested...
[lhc/web/wiklou.git] / maintenance / storage / fixBug20757.php
1 <?php
2
3 require_once( dirname( __FILE__ ) . '/../Maintenance.php' );
4
/**
 * Maintenance script to fix bug 20757, assuming that blob_tracking is intact.
 *
 * The script walks the text table in batches, looking at rows whose old_flags
 * contain "object" and whose old_text unserializes to a HistoryBlobStub. For
 * each stub it looks up the stub's secondary text row (mOldId) in
 * blob_tracking (rows with bt_moved = 1). If a tracked blob is found and the
 * text stored under the stub's hash can still be fetched and matches that
 * hash, the stub row is rewritten as a plain "external" reference to the
 * tracked blob and a new blob_tracking row is inserted for it.
 *
 * Options:
 *   --dry-run  Report only; no database writes are performed.
 *   --start    Only rows with old_id greater than this value are examined.
 */
class FixBug20757 extends Maintenance {
	/** Maximum number of text rows fetched per batch query. */
	public $batchSize = 10000;

	/** Cache of page_id => ( rev_text_id => rev_id ) maps, see getRevTextMap(). */
	public $mapCache = array();

	/** Total number of rev_text_id entries currently held in $mapCache. */
	public $mapCacheSize = 0;

	/** Entry count above which old maps are evicted from $mapCache. */
	public $maxMapCacheSize = 1000000;

	/**
	 * Register the script description and its command line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = 'Script to fix bug 20757 assuming that blob_tracking is intact';
		$this->addOption( 'dry-run', 'Report only' );
		$this->addOption( 'start', 'old_id to start at', false, true );
	}

	/**
	 * Scan the text table for stub rows and repair those that can be repaired.
	 *
	 * Prints one line per repaired or unrecoverable row, a progress indicator
	 * per batch, and a summary (fixed / unrecoverable / good) at the end.
	 */
	public function execute() {
		$dbr = wfGetDB( DB_SLAVE );
		$dbw = wfGetDB( DB_MASTER );

		$dryRun = $this->getOption( 'dry-run' );
		if ( $dryRun ) {
			print "Dry run only.\n";
		}

		// Note: the batch query below uses "old_id > $startId", so the row whose
		// old_id equals the --start value itself is not examined.
		$startId = $this->getOption( 'start', 0 );
		$numGood = 0;
		$numFixed = 0;
		$numBad = 0;

		// Only used for the progress display
		$totalRevs = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );

		while ( true ) {
			print "ID: $startId / $totalRevs\r";

			$res = $dbr->select(
				'text',
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id > ' . intval( $startId ),
					// anyString() must be called on both sides so that the pattern
					// is %object% and not just a prefix wildcard
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'object', $dbr->anyString() )
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize,
				)
			);

			if ( !$res->numRows() ) {
				break;
			}

			$secondaryIds = array();
			$stubs = array();

			foreach ( $res as $row ) {
				// Remember the last ID seen so that the next batch continues after it
				$startId = $row->old_id;

				// Basic sanity checks
				$obj = unserialize( $row->old_text );
				if ( $obj === false ) {
					print "{$row->old_id}: unrecoverable: cannot unserialize\n";
					++$numBad;
					continue;
				}

				if ( !is_object( $obj ) ) {
					print "{$row->old_id}: unrecoverable: unserialized to type " .
						gettype( $obj ) . ", possible double-serialization\n";
					++$numBad;
					continue;
				}

				if ( strtolower( get_class( $obj ) ) !== 'historyblobstub' ) {
					print "{$row->old_id}: unrecoverable: unexpected object class " .
						get_class( $obj ) . "\n";
					++$numBad;
					continue;
				}

				// Queue the stub for future batch processing
				$id = intval( $obj->mOldId );
				$secondaryIds[] = $id;
				$stubs[$row->old_id] = array(
					'secondaryId' => $id,
					'hash' => $obj->mHash,
				);
			}

			$secondaryIds = array_unique( $secondaryIds );

			if ( !count( $secondaryIds ) ) {
				// Nothing usable in this batch, go on with the next one
				continue;
			}

			// Run the batch query on blob_tracking
			$res = $dbr->select(
				'blob_tracking',
				'*',
				array(
					'bt_text_id' => $secondaryIds,
					'bt_moved' => 1,
				),
				__METHOD__
			);
			$trackedBlobs = array();
			foreach ( $res as $row ) {
				$trackedBlobs[$row->bt_text_id] = $row;
			}

			// Process the stubs
			foreach ( $stubs as $primaryId => $stub ) {
				$secondaryId = $stub['secondaryId'];
				if ( !isset( $trackedBlobs[$secondaryId] ) ) {
					// No tracked blob. Work out what went wrong
					$secondaryRow = $dbr->selectRow(
						'text',
						array( 'old_flags', 'old_text' ),
						array( 'old_id' => $secondaryId ),
						__METHOD__
					);
					if ( !$secondaryRow ) {
						print "$primaryId: unrecoverable: secondary row is missing\n";
						++$numBad;
					} elseif ( $this->isUnbrokenStub( $stub, $secondaryRow ) ) {
						// Not broken yet, and not in the tracked clusters so it won't get
						// broken by the current RCT run.
						++$numGood;
					} elseif ( strpos( $secondaryRow->old_flags, 'external' ) !== false ) {
						// isUnbrokenStub() leaves the row untouched, so old_text is
						// still the external URL here
						print "$primaryId: unrecoverable: secondary gone to {$secondaryRow->old_text}\n";
						++$numBad;
					} else {
						print "$primaryId: unrecoverable: miscellaneous corruption of secondary row\n";
						++$numBad;
					}
					unset( $stubs[$primaryId] );
					continue;
				}
				$trackRow = $trackedBlobs[$secondaryId];

				// Check that the specified text really is available in the tracked source row
				$url = "DB://{$trackRow->bt_cluster}/{$trackRow->bt_blob_id}/{$stub['hash']}";
				$text = ExternalStore::fetchFromURL( $url );
				if ( $text === false ) {
					print "$primaryId: unrecoverable: source text missing\n";
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}
				if ( md5( $text ) !== $stub['hash'] ) {
					print "$primaryId: unrecoverable: content hashes do not match\n";
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}

				// Find the page_id and rev_id
				// The page is probably the same as the page of the secondary row
				$pageId = intval( $trackRow->bt_page );
				if ( !$pageId ) {
					$revId = $pageId = 0;
				} else {
					$revId = $this->findTextIdInPage( $pageId, $primaryId );
					if ( !$revId ) {
						// Actually an orphan
						$pageId = $revId = 0;
					}
				}

				if ( !$dryRun ) {
					// Reset the text row to point to the original copy
					$dbw->begin();
					$dbw->update(
						'text',
						// SET
						array(
							'old_flags' => 'external', // use legacy encoding
							'old_text' => $url
						),
						// WHERE
						array( 'old_id' => $primaryId ),
						__METHOD__
					);

					// Add a blob_tracking row so that the new reference can be recompressed
					// without needing to run trackBlobs.php again
					$dbw->insert( 'blob_tracking',
						array(
							'bt_page' => $pageId,
							'bt_rev_id' => $revId,
							'bt_text_id' => $primaryId,
							'bt_cluster' => $trackRow->bt_cluster,
							'bt_blob_id' => $trackRow->bt_blob_id,
							'bt_cgz_hash' => $stub['hash'],
							'bt_new_url' => null,
							'bt_moved' => 0,
						),
						__METHOD__
					);
					$dbw->commit();
					$this->waitForSlaves();
				}

				print "$primaryId: resolved to $url\n";
				++$numFixed;
			}
		}

		print "\n";
		print "Fixed: $numFixed\n";
		print "Unrecoverable: $numBad\n";
		print "Good stubs: $numGood\n";
	}

	/**
	 * Throttle writes: call wfWaitForSlaves( 5 ) once every 50 calls.
	 *
	 * The call counter is a function-static variable, so it is shared by all
	 * calls made during the run.
	 */
	public function waitForSlaves() {
		static $iteration = 0;
		if ( ++$iteration >= 50 ) {
			wfWaitForSlaves( 5 );
			$iteration = 0;
		}
	}

	/**
	 * Find the revision of a page which uses the given text row.
	 *
	 * @param $pageId Integer: page ID to search in
	 * @param $textId Integer: text row ID (rev_text_id) to look for
	 * @return The rev_id of the matching revision, or null if the page has no
	 *   revision with that rev_text_id
	 */
	public function findTextIdInPage( $pageId, $textId ) {
		$ids = $this->getRevTextMap( $pageId );
		if ( !isset( $ids[$textId] ) ) {
			return null;
		} else {
			return $ids[$textId];
		}
	}

	/**
	 * Get the rev_text_id => rev_id map for all revisions of a page.
	 *
	 * Results are cached per page in $mapCache. Before a new map is loaded,
	 * the oldest cached maps are dropped until the cached entry count is no
	 * longer above $maxMapCacheSize.
	 *
	 * @param $pageId Integer: page ID
	 * @return Array mapping rev_text_id to rev_id
	 */
	public function getRevTextMap( $pageId ) {
		if ( !isset( $this->mapCache[$pageId] ) ) {
			// Limit cache size
			while ( $this->mapCacheSize > $this->maxMapCacheSize ) {
				// Always evict from the front of the array, i.e. the oldest entry,
				// regardless of where the internal array pointer currently is
				reset( $this->mapCache );
				$key = key( $this->mapCache );
				if ( $key === null ) {
					// Cache is empty, so the counter must be stale; resynchronise it
					$this->mapCacheSize = 0;
					break;
				}
				$this->mapCacheSize -= count( $this->mapCache[$key] );
				unset( $this->mapCache[$key] );
			}

			$dbr = wfGetDB( DB_SLAVE );
			$map = array();
			$res = $dbr->select( 'revision',
				array( 'rev_id', 'rev_text_id' ),
				array( 'rev_page' => $pageId ),
				__METHOD__
			);
			foreach ( $res as $row ) {
				$map[$row->rev_text_id] = $row->rev_id;
			}
			$this->mapCache[$pageId] = $map;
			$this->mapCacheSize += count( $map );
		}
		return $this->mapCache[$pageId];
	}

	/**
	 * This is based on part of HistoryBlobStub::getText().
	 * Determine if the text can be retrieved from the row in the normal way.
	 *
	 * The row object is only read, never modified, so the caller can keep
	 * using its old_text (for example an external URL) afterwards.
	 *
	 * @param $stub Array with a 'hash' key identifying the item inside the blob
	 * @param $secondaryRow Object: text row with old_flags and old_text fields
	 * @return Boolean: true if the item for the stub's hash could be loaded
	 */
	public function isUnbrokenStub( $stub, $secondaryRow ) {
		$flags = explode( ',', $secondaryRow->old_flags );
		// Work on a copy so that the caller's row keeps its original old_text
		$blob = $secondaryRow->old_text;

		if ( in_array( 'external', $flags ) ) {
			$url = $blob;
			$parts = explode( '://', $url, 2 );
			if ( count( $parts ) < 2 || $parts[1] == "" ) {
				// Not a usable external store URL
				return false;
			}
			$blob = ExternalStore::fetchFromURL( $url );
			if ( $blob === false ) {
				return false;
			}
		}
		if ( !in_array( 'object', $flags ) ) {
			return false;
		}

		if ( in_array( 'gzip', $flags ) ) {
			$blob = gzinflate( $blob );
			if ( $blob === false ) {
				return false;
			}
		}
		$obj = unserialize( $blob );

		if ( is_string( $obj ) ) {
			// Correct for old double-serialization bug.
			$obj = unserialize( $obj );
		}

		if ( !is_object( $obj ) ) {
			return false;
		}

		// An object of an unexpected (or unloadable) class cannot provide the
		// text; report it as broken instead of dying with a fatal error
		if ( !method_exists( $obj, 'uncompress' ) || !method_exists( $obj, 'getItem' ) ) {
			return false;
		}

		$obj->uncompress();
		$text = $obj->getItem( $stub['hash'] );
		return $text !== false;
	}
}
299
// Script entry point: record this script's class name in $maintClass, then
// include the file named by the DO_MAINTENANCE constant (presumably the
// runner which instantiates and executes that class -- defined outside this
// file, confirm there).
$maintClass = 'FixBug20757';
require_once DO_MAINTENANCE;
302