3 * Adds blobs from a given external storage cluster to the blob_tracking table.
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
21 * @ingroup Maintenance
22 * @see wfWaitForSlaves()
25 use MediaWiki\MediaWikiServices
;
26 use MediaWiki\Revision\SlotRecord
;
27 use Wikimedia\Rdbms\DBConnectionError
;
29 require __DIR__
. '/../commandLine.inc';
// Command-line entry point. When no cluster name was given on the command
// line, print the usage text below.
// NOTE(review): the lines that follow the usage text and close this `if`
// are not part of this listing -- presumably the script exits there; confirm
// against the full file. $args is not defined in the visible lines; it is
// presumably provided by the commandLine.inc file required above.
31 if ( count( $args ) < 1 ) {
32 echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n";
33 echo "Adds blobs from a given ES cluster to the blob_tracking table\n";
34 echo "Automatically deletes the tracking table and starts from the start again when restarted.\n";
// Every command-line argument is treated as a cluster name and handed to the tracker.
38 $tracker = new TrackBlobs( $args );
// $clusters: the cluster names passed to the constructor.
// $textClause: SQL condition built on first use by getTextClause() and cached here.
43 public $clusters, $textClause;
// Set to true by the constructor when the gmp extension is loaded; gates the orphan-blob pass.
44 public $doBlobOrphans;
// One GMP number per cluster name, used as a bitfield: bit N is set once blob id N
// of that cluster has been recorded by trackRevisions() or trackOrphanText().
45 public $trackedBlobs = [];
// Value used for the LIMIT option of each paged select.
47 public $batchSize = 1000;
// Compared against $batchesDone before a "<current id> / <max id>" progress line is printed.
48 public $reportingInterval = 10;
/**
 * Remember the clusters to scan and, when GMP is available, prepare one
 * empty bitfield per cluster for the orphan-blob pass.
 *
 * @param string[] $clusters Names of the external storage clusters to track
 */
function __construct( $clusters ) {
	$this->clusters = $clusters;

	// Without GMP there is no bitfield support: warn and leave
	// $doBlobOrphans unset so the orphan-blob pass is skipped.
	if ( !extension_loaded( 'gmp' ) ) {
		echo "Warning: the gmp extension is needed to find orphan blobs\n";
		return;
	}

	$this->doBlobOrphans = true;
	// Each cluster gets its own GMP object; gmp_setbit() later mutates
	// these in place, so they must not be shared between clusters.
	foreach ( $clusters as $name ) {
		$this->trackedBlobs[$name] = gmp_init( 0 );
	}
}
// Runs the passes in order: integrity check, (re)creation of the tracking
// table, the revision scan, the orphan-text scan and -- only when
// $doBlobOrphans is set -- the orphan-blob scan.
// NOTE(review): the header of the enclosing function and its closing braces
// are not part of this listing.
63 $this->checkIntegrity();
64 $this->initTrackingTable();
65 $this->trackRevisions();
66 $this->trackOrphanText();
67 if ( $this->doBlobOrphans
) {
68 $this->findOrphanBlobs();
// Pre-flight check; progress and result are printed to stdout.
// Queries the replica for a text row whose old_flags contain "object" but not
// "external" and whose first 22 characters of old_text, lower-cased, equal
// o:15:"historyblobstub" -- i.e. a HistoryBlobStub serialized directly in the
// text table.
// NOTE(review): several original lines of this function are absent from this
// listing (the remaining selectField() arguments, the branch around the
// failure message, the end of that message, the closing braces). Presumably
// the failure branch aborts the script -- confirm against the full file.
72 function checkIntegrity() {
73 echo "Doing integrity check...\n";
74 $dbr = wfGetDB( DB_REPLICA
);
76 // Scan for HistoryBlobStub objects in the text table (T22757)
// The LOWER(CONVERT(... USING latin1)) wrapper makes the comparison with the
// lower-case literal independent of the case of the stored class name.
78 $exists = $dbr->selectField( 'text', 1,
79 'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
80 'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
// Failure message: tells the operator to run resolveStubs.php.
85 echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
86 "This script could destroy these objects if it continued. Run resolveStubs.php\n" .
91 echo "Integrity check OK\n";
/**
 * Drop any tracking tables left over from a previous run and recreate them
 * by sourcing blob_tracking.sql from this script's directory.
 *
 * Each table is checked and dropped on its own. Previously blob_orphans was
 * dropped unconditionally whenever blob_tracking existed, so the DROP TABLE
 * failed if blob_orphans was missing, and a leftover blob_orphans was never
 * removed when blob_tracking was missing.
 *
 * NOTE(review): blob_tracking.sql is not visible here; it is presumed to
 * create both blob_tracking and blob_orphans.
 */
function initTrackingTable() {
	$dbw = wfGetDB( DB_MASTER );

	foreach ( [ 'blob_tracking', 'blob_orphans' ] as $table ) {
		if ( $dbw->tableExists( $table ) ) {
			$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ), __METHOD__ );
		}
	}

	$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
}
/**
 * Return the SQL condition that matches text rows whose old_text is a
 * pointer into one of the tracked clusters ("DB://<cluster>/...").
 *
 * The condition is built on the first call and cached in $this->textClause;
 * it is one LIKE test per cluster, joined with OR.
 *
 * @return string
 */
function getTextClause() {
	// Reuse the cached clause when one has already been built.
	if ( $this->textClause ) {
		return $this->textClause;
	}

	$dbr = wfGetDB( DB_REPLICA );

	$likes = [];
	foreach ( $this->clusters as $cluster ) {
		$likes[] = 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
	}
	$this->textClause = implode( ' OR ', $likes );

	return $this->textClause;
}
// Parses an external-storage pointer of the form DB://<cluster>/<id> or
// DB://<cluster>/<id>/<hex hash>: the cluster is a run of word characters,
// the id a run of digits, and the optional third path component hex digits.
// The visible part of the result maps 'id' to the integer value of the id
// component and 'hash' to the hash component, or null when there is none.
// NOTE(review): the lines giving the return value for a non-matching string
// and the first entry of the result array are absent from this listing;
// callers in this file read $info['cluster'], so the result presumably also
// carries the cluster name -- confirm against the full file.
118 function interpretPointer( $text ) {
119 if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
125 'id' => intval( $m[2] ),
126 'hash' => $m[3] ??
null
131 * Scan the revision table for rows stored in the specified clusters
// Revision pass: reads revisions joined to their text rows from the replica
// in rev_id order, $batchSize rows per query, and inserts one blob_tracking
// row per result (page, revision id, text id, cluster, blob id, hash) on the
// master. When $doBlobOrphans is set, each recorded blob id is also marked in
// the per-cluster bitfield $trackedBlobs. Progress is printed as
// "<last rev_id> / <max rev_id>"; the total is printed at the end.
// NOTE(review): the loop header, the initialisation of $startId,
// $rowsInserted, $batchesDone and $insertBatch, the place where $textClause
// is added to the conditions, and all closing braces are absent from this
// listing.
133 function trackRevisions() {
134 global $wgMultiContentRevisionSchemaMigrationStage;
136 $dbw = wfGetDB( DB_MASTER
);
137 $dbr = wfGetDB( DB_REPLICA
);
139 $textClause = $this->getTextClause();
// Highest rev_id; only used for the progress output below.
141 $endId = $dbr->selectField( 'revision', 'MAX(rev_id)', '', __METHOD__
);
145 echo "Finding revisions...\n";
147 $fields = [ 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ];
149 'ORDER BY' => 'rev_id',
150 'LIMIT' => $this->batchSize
// Only text rows whose flags contain "external" are of interest.
154 'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
// Old schema readable: join revision directly to text through rev_text_id.
156 if ( $wgMultiContentRevisionSchemaMigrationStage & SCHEMA_COMPAT_READ_OLD
) {
157 $tables = [ 'revision', 'text' ];
158 $conds = array_merge( [
159 'rev_text_id=old_id',
// Otherwise reach the text row through the revision's main slot: the slot's
// content row must have an address of the form "tt:<old_id>".
162 $slotRoleStore = MediaWikiServices
::getInstance()->getSlotRoleStore();
163 $tables = [ 'revision', 'slots', 'content', 'text' ];
164 $conds = array_merge( [
165 'rev_id=slot_revision_id',
166 'slot_role_id=' . $slotRoleStore->getId( SlotRecord
::MAIN
),
167 'content_id=slot_content_id',
168 'SUBSTRING(content_address, 1, 3)=' . $dbr->addQuotes( 'tt:' ),
169 'SUBSTRING(content_address, 4)=old_id',
// Paging: each query continues after the last rev_id seen so far.
174 $res = $dbr->select( $tables,
177 'rev_id > ' . $dbr->addQuotes( $startId ),
182 if ( !$res->numRows() ) {
187 foreach ( $res as $row ) {
188 $startId = $row->rev_id
;
189 $info = $this->interpretPointer( $row->old_text
);
191 echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
// Sanity check: the pointer must name one of the clusters being tracked.
194 if ( !in_array( $info['cluster'], $this->clusters
) ) {
195 echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
199 'bt_page' => $row->rev_page
,
200 'bt_rev_id' => $row->rev_id
,
201 'bt_text_id' => $row->old_id
,
202 'bt_cluster' => $info['cluster'],
203 'bt_blob_id' => $info['id'],
204 'bt_cgz_hash' => $info['hash']
// Mark this blob id as referenced so findOrphanBlobs() can subtract it later.
206 if ( $this->doBlobOrphans
) {
207 gmp_setbit( $this->trackedBlobs
[$info['cluster']], $info['id'] );
210 $dbw->insert( 'blob_tracking', $insertBatch, __METHOD__
);
211 $rowsInserted +
= count( $insertBatch );
// Progress report once enough batches have been processed.
214 if ( $batchesDone >= $this->reportingInterval
) {
216 echo "$startId / $endId\n";
220 echo "Found $rowsInserted revisions\n";
224 * Scan the text table for orphan text
225 * Orphan text here does not imply DB corruption -- deleted text tracked by the
226 * archive table counts as orphan for our purposes.
// Orphan-text pass: after waiting for the replica to reach the master's
// current replication position, reads text rows LEFT JOINed to blob_tracking
// in old_id order, $batchSize rows per query, and inserts a blob_tracking row
// (text id, cluster, blob id, hash -- no page or revision) for each result.
// When $doBlobOrphans is set, each recorded blob id is also marked in the
// per-cluster bitfield $trackedBlobs.
// NOTE(review): the loop header, the initialisation of $startId,
// $rowsInserted, $batchesDone and $insertBatch, some of the select
// conditions (presumably including the test that keeps only rows without a
// blob_tracking match) and all closing braces are absent from this listing.
228 function trackOrphanText() {
229 # Wait until the blob_tracking table is available in the replica DB
230 $dbw = wfGetDB( DB_MASTER
);
231 $dbr = wfGetDB( DB_REPLICA
);
232 $pos = $dbw->getMasterPos();
233 $dbr->masterPosWait( $pos, 100000 );
// NOTE(review): getTextClause() is declared without parameters, so the
// argument passed here is ignored; trackRevisions() calls it with none.
235 $textClause = $this->getTextClause( $this->clusters
);
// Highest old_id; only used for the progress output below.
237 $endId = $dbr->selectField( 'text', 'MAX(old_id)', '', __METHOD__
);
241 echo "Finding orphan text...\n";
243 # Scan the text table for orphan text
245 $res = $dbr->select( [ 'text', 'blob_tracking' ],
246 [ 'old_id', 'old_flags', 'old_text' ],
248 'old_id>' . $dbr->addQuotes( $startId ),
250 'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
255 'ORDER BY' => 'old_id',
256 'LIMIT' => $this->batchSize
258 [ 'blob_tracking' => [ 'LEFT JOIN', 'bt_text_id=old_id' ] ]
// NOTE(review): $ids is filled here but not read in any visible line of this
// function -- possibly dead code; confirm against the full file.
261 foreach ( $res as $row ) {
262 $ids[] = $row->old_id
;
265 if ( !$res->numRows() ) {
270 foreach ( $res as $row ) {
271 $startId = $row->old_id
;
272 $info = $this->interpretPointer( $row->old_text
);
274 echo "Invalid DB:// URL in old_id {$row->old_id}\n";
// Sanity check: the pointer must name one of the clusters being tracked.
277 if ( !in_array( $info['cluster'], $this->clusters
) ) {
278 echo "Invalid cluster returned in SQL query\n";
285 'bt_text_id' => $row->old_id
,
286 'bt_cluster' => $info['cluster'],
287 'bt_blob_id' => $info['id'],
288 'bt_cgz_hash' => $info['hash']
// Mark this blob id as referenced so findOrphanBlobs() can subtract it later.
290 if ( $this->doBlobOrphans
) {
291 gmp_setbit( $this->trackedBlobs
[$info['cluster']], $info['id'] );
294 $dbw->insert( 'blob_tracking', $insertBatch, __METHOD__
);
296 $rowsInserted +
= count( $insertBatch );
// Progress report once enough batches have been processed.
298 if ( $batchesDone >= $this->reportingInterval
) {
300 echo "$startId / $endId\n";
304 echo "Found $rowsInserted orphan text rows\n";
308 * Scan the blobs table for rows not registered in blob_tracking (and thus not
309 * registered in the text table).
311 * Orphan blobs are indicative of DB corruption. They are inaccessible and
312 * should probably be deleted.
// Orphan-blob pass, run once per tracked cluster: connects to the cluster's
// external-storage replica, builds a GMP bitfield with one bit per blob_id
// present in its blobs table (read in blob_id order, $batchSize rows per
// query), removes the bits recorded in $trackedBlobs for that cluster, and
// inserts the remaining ids into blob_orphans on the master.
// NOTE(review): the `try` keyword, the loop headers, the initialisation of
// $startId, $batchesDone, $id, $insertBatch and $numOrphans, the handling of
// a null table name, and all closing braces are absent from this listing.
314 function findOrphanBlobs() {
315 if ( !extension_loaded( 'gmp' ) ) {
316 echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
321 $dbw = wfGetDB( DB_MASTER
);
323 foreach ( $this->clusters
as $cluster ) {
324 echo "Searching for orphan blobs in $cluster...\n";
325 $lbFactory = MediaWikiServices
::getInstance()->getDBLoadBalancerFactory();
326 $lb = $lbFactory->getExternalLB( $cluster );
328 $extDB = $lb->getConnection( DB_REPLICA
);
329 } catch ( DBConnectionError
$e ) {
// A connection error mentioning "Unknown database" is reported as the cluster
// having no database; any other connection error is reported with its message.
330 if ( strpos( $e->error
, 'Unknown database' ) !== false ) {
331 echo "No database on $cluster\n";
333 echo "Error on $cluster: " . $e->getMessage() . "\n";
// The blobs table name comes from the connection's load-balancer info.
337 $table = $extDB->getLBInfo( 'blobs table' );
338 if ( is_null( $table ) ) {
341 if ( !$extDB->tableExists( $table ) ) {
342 echo "No blobs table on cluster $cluster\n";
347 $actualBlobs = gmp_init( 0 );
// Highest blob_id; only used for the progress output below.
348 $endId = $extDB->selectField( $table, 'MAX(blob_id)', '', __METHOD__
);
350 // Build a bitmap of actual blob rows
352 $res = $extDB->select( $table,
354 [ 'blob_id > ' . $extDB->addQuotes( $startId ) ],
356 [ 'LIMIT' => $this->batchSize
, 'ORDER BY' => 'blob_id' ]
359 if ( !$res->numRows() ) {
363 foreach ( $res as $row ) {
364 gmp_setbit( $actualBlobs, $row->blob_id
);
366 $startId = $row->blob_id
;
// Progress report once enough batches have been processed.
369 if ( $batchesDone >= $this->reportingInterval
) {
371 echo "$startId / $endId\n";
375 // Find actual blobs that weren't tracked by the previous passes
376 // This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
377 $orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs
[$cluster] ) );
379 // Traverse the orphan list
// gmp_scan1() returns the index of the next set bit at or after $id.
384 $id = gmp_scan1( $orphans, $id );
389 'bo_cluster' => $cluster,
// Flush the pending rows once more than $batchSize of them have accumulated.
392 if ( count( $insertBatch ) > $this->batchSize
) {
393 $dbw->insert( 'blob_orphans', $insertBatch, __METHOD__
);
// Insert whatever is left over after the traversal.
400 if ( $insertBatch ) {
401 $dbw->insert( 'blob_orphans', $insertBatch, __METHOD__
);
403 echo "Found $numOrphans orphan(s) in $cluster\n";