Whitespace fixup under the maint directory.
[lhc/web/wiklou.git] / maintenance / storage / trackBlobs.php
<?php

require( dirname( __FILE__ ) . '/../commandLine.inc' );

// At least one cluster name must be supplied; otherwise print the usage
// text in a single write and stop with a non-zero exit status.
if ( count( $args ) < 1 ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n" .
		"Adds blobs from a given ES cluster to the blob_tracking table\n" .
		"Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}

// Every command-line argument is handed to the tracker as a cluster name,
// then all tracking passes are executed in sequence.
$tracker = new TrackBlobs( $args );
$tracker->run();

echo "All done.\n";
16
/**
 * Rebuilds the blob_tracking and blob_orphans tables for a set of external
 * storage (ES) clusters.
 *
 * The passes, in the order run() executes them:
 *  - checkIntegrity():     refuse to run if the text/archive tables contain
 *                          rows this script could damage.
 *  - initTrackingTable():  drop and recreate the tracking tables.
 *  - trackRevisions():     record text rows that are referenced by revisions.
 *  - trackOrphanText():    record text rows not yet present in blob_tracking.
 *  - findOrphanBlobs():    (GMP only) record blob rows that nothing points to.
 */
class TrackBlobs {
	/** Cluster names to scan, exactly as passed to the constructor. */
	public $clusters;

	/** Lazily built SQL condition matching old_text pointers into $clusters. */
	public $textClause;

	/** True when the GMP extension is loaded and orphan blobs can be searched for. */
	public $doBlobOrphans;

	/** Map of cluster name => GMP bitfield with one bit set per tracked blob ID. */
	public $trackedBlobs = array();

	/** Maximum number of rows fetched per SELECT and written per INSERT. */
	public $batchSize = 1000;

	/** Number of batches between progress reports. */
	public $reportingInterval = 10;

	/**
	 * @param $clusters Array of ES cluster names to track
	 */
	public function __construct( $clusters ) {
		$this->clusters = $clusters;

		// Always store a real boolean so callers can test the flag strictly.
		$this->doBlobOrphans = extension_loaded( 'gmp' );
		if ( $this->doBlobOrphans ) {
			foreach ( $clusters as $cluster ) {
				$this->trackedBlobs[$cluster] = gmp_init( 0 );
			}
		} else {
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
		}
	}

	/**
	 * Run every pass in order. The orphan blob search is skipped when GMP
	 * is not available.
	 */
	public function run() {
		$this->checkIntegrity();
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
	}

	/**
	 * Abort the script (exit status 1) if the text or archive table contains
	 * rows that must be fixed by another maintenance script first.
	 */
	public function checkIntegrity() {
		echo "Doing integrity check...\n";
		$dbr = wfGetDB( DB_SLAVE );

		// Scan for HistoryBlobStub objects in the text table (bug 20757)

		$exists = $dbr->selectField( 'text', 1,
			'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
			'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
			__METHOD__
		);

		if ( $exists ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
				"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
				"to fix this.\n";
			exit( 1 );
		}

		// Scan the archive table for HistoryBlobStub objects or external flags (bug 22624)
		$flags = $dbr->selectField( 'archive', 'ar_flags',
			'ar_flags LIKE \'%external%\' OR (' .
			'ar_flags LIKE \'%object%\' ' .
			'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
			__METHOD__
		);

		// The query matches either kind of bad row; the returned flags tell
		// us which of the two error messages applies.
		if ( strpos( $flags, 'external' ) !== false ) {
			echo "Integrity check failed: found external storage pointers in your archive table.\n" .
				"Run normaliseArchiveTable.php to fix this.\n";
			exit( 1 );
		} elseif ( $flags ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
				"These objects are probably already broken, continuing would make them\n" .
				"unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
			exit( 1 );
		}

		echo "Integrity check OK\n";
	}

	/**
	 * Drop any existing tracking tables and recreate them from
	 * blob_tracking.sql.
	 *
	 * Each table is checked on its own: after an interrupted or partial run
	 * only one of the two may exist, and dropping a missing table (or leaving
	 * a stale one behind) would make the following statements fail.
	 * NOTE(review): this assumes blob_tracking.sql creates both tables -- confirm.
	 */
	public function initTrackingTable() {
		$dbw = wfGetDB( DB_MASTER );
		foreach ( array( 'blob_tracking', 'blob_orphans' ) as $table ) {
			if ( $dbw->tableExists( $table ) ) {
				$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ) );
			}
		}
		$dbw->sourceFile( dirname( __FILE__ ) . '/blob_tracking.sql' );
	}

	/**
	 * Build (once) and return the SQL condition that matches old_text values
	 * of the form "DB://<cluster>/..." for any of the configured clusters.
	 *
	 * @return String: LIKE conditions joined with OR
	 */
	public function getTextClause() {
		if ( !$this->textClause ) {
			$dbr = wfGetDB( DB_SLAVE );
			$conds = array();
			foreach ( $this->clusters as $cluster ) {
				$conds[] = 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
			}
			$this->textClause = implode( ' OR ', $conds );
		}
		return $this->textClause;
	}

	/**
	 * Parse an external storage pointer of the form
	 * "DB://<cluster>/<id>" or "DB://<cluster>/<id>/<hex hash>".
	 *
	 * @param $text String: the pointer as stored in old_text
	 * @return Array with keys 'cluster', 'id' (integer) and 'hash' (string,
	 *   or null when the pointer has no hash part), or false if $text is not
	 *   a valid pointer
	 */
	public function interpretPointer( $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}
		return array(
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			// The third group is absent from $m when no hash was given
			'hash' => isset( $m[3] ) ? $m[3] : null
		);
	}

	/**
	 * Scan the revision table for rows stored in the specified clusters
	 */
	public function trackRevisions() {
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		// Walk the revision table in rev_id order, one batch at a time,
		// resuming after the last rev_id seen in the previous batch.
		while ( true ) {
			$res = $dbr->select( array( 'revision', 'text' ),
				array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
				array(
					'rev_id > ' . $dbr->addQuotes( $startId ),
					'rev_text_id=old_id',
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
				),
				__METHOD__,
				array(
					'ORDER BY' => 'rev_id',
					'LIMIT' => $this->batchSize
				)
			);
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that are skipped below
				$startId = $row->rev_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
					continue;
				}
				// Strict comparison: cluster names are strings and must match exactly
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}
				$insertBatch[] = array(
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					// Remember the blob as referenced, for findOrphanBlobs()
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
			$rowsInserted += count( $insertBatch );

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves( 5 );
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table for orphan text
	 * Orphan text here does not imply DB corruption -- deleted text tracked by the
	 * archive table counts as orphan for our purposes.
	 */
	public function trackOrphanText() {
		# Wait until the blob_tracking table is available in the slave
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );
		$pos = $dbw->getMasterPos();
		$dbr->masterPosWait( $pos, 100000 );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
		$rowsInserted = 0;
		$batchesDone = 0;

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			// The LEFT JOIN plus "bt_text_id IS NULL" keeps only text rows
			// that have no blob_tracking row yet.
			$res = $dbr->select( array( 'text', 'blob_tracking' ),
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id>' . $dbr->addQuotes( $startId ),
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
					'bt_text_id IS NULL'
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize
				),
				array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
			);

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that are skipped below
				$startId = $row->old_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in old_id {$row->old_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}

				// Orphan text has no page or revision, hence the zeroes
				$insertBatch[] = array(
					'bt_page' => 0,
					'bt_rev_id' => 0,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );

			$rowsInserted += count( $insertBatch );
			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves( 5 );
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * Scan the blobs table for rows not registered in blob_tracking (and thus not
	 * registered in the text table).
	 *
	 * Orphan blobs are indicative of DB corruption. They are inaccessible and
	 * should probably be deleted.
	 */
	public function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
			return;
		}

		$dbw = wfGetDB( DB_MASTER );

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = wfGetLBFactory()->getExternalLB( $cluster );
			try {
				$extDB = $lb->getConnection( DB_SLAVE );
			} catch ( DBConnectionError $e ) {
				// A cluster that cannot be reached is reported and skipped
				if ( strpos( $e->error, 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				continue;
			}
			// Fall back to the default table name when none is configured
			$table = $extDB->getLBInfo( 'blobs table' );
			if ( is_null( $table ) ) {
				$table = 'blobs';
			}
			if ( !$extDB->tableExists( $table ) ) {
				echo "No blobs table on cluster $cluster\n";
				continue;
			}
			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			$endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

			// Build a bitmap of actual blob rows
			while ( true ) {
				$res = $extDB->select( $table,
					array( 'blob_id' ),
					array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
					__METHOD__,
					array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
				);

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					gmp_setbit( $actualBlobs, $row->blob_id );
					// Rows arrive in blob_id order, so the last one seen is
					// where the next batch resumes
					$startId = $row->blob_id;
				}

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			// Find actual blobs that weren't tracked by the previous passes
			// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			// Traverse the orphan list
			$insertBatch = array();
			$id = 0;
			$numOrphans = 0;
			while ( true ) {
				// Next set bit at or after $id; -1 means there are none left
				$id = gmp_scan1( $orphans, $id );
				if ( $id == -1 ) {
					break;
				}
				$insertBatch[] = array(
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				);
				// Flush as soon as the batch is full, so that no INSERT
				// carries more than batchSize rows
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
					$insertBatch = array();
				}

				++$id;
				++$numOrphans;
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
			}
			echo "Found $numOrphans orphan(s) in $cluster\n";
		}
	}
}