36eb731bbb22f5b4f328b449895178ab6b576ef5
3 require( dirname( __FILE__
) .'/../commandLine.inc' );
6 if ( count( $args ) < 1 ) {
7 echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n";
8 echo "Adds blobs from a given ES cluster to the blob_tracking table\n";
14 function trackBlobs( $clusters ) {
16 trackRevisions( $clusters );
17 trackOrphans( $clusters );
20 function initTrackingTable() {
21 $dbw = wfGetDB( DB_MASTER
);
22 if ( !$dbw->tableExists( 'blob_tracking' ) ) {
23 $dbw->sourceFile( dirname( __FILE__
) . '/blob_tracking.sql' );
27 function getTextClause( $clusters ) {
28 $dbr = wfGetDB( DB_SLAVE
);
30 foreach ( $clusters as $cluster ) {
31 if ( $textClause != '' ) {
32 $textClause .= ' OR ';
34 $textClause .= 'old_text LIKE ' . $dbr->addQuotes( $dbr->escapeLike( "DB://$cluster/" ) . '%' );
/**
 * Parse an external storage pointer of the form DB://<cluster>/<id>[/<hash>].
 *
 * @param string $text Pointer text (callers pass the old_text column value)
 * @return array|false Array with keys 'cluster' (string), 'id' (int) and
 *   'hash' (hex string, or null when the URL has no hash component);
 *   false if $text is not a well-formed DB:// URL.
 */
function interpretPointer( $text ) {
	if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
		return false;
	}
	return array(
		'cluster' => $m[1],
		'id' => intval( $m[2] ),
		// Capture group 3 is the optional hex hash; group 2 is the blob ID
		// and must not be reused here. preg_match() omits trailing groups
		// that did not participate, so isset() distinguishes "no hash".
		'hash' => isset( $m[3] ) ? $m[3] : null
	);
}
51 * Scan the revision table for rows stored in the specified clusters
53 function trackRevisions( $clusters ) {
54 $dbw = wfGetDB( DB_MASTER
);
55 $dbr = wfGetDB( DB_SLAVE
);
57 $reportingInterval = 10;
59 $textClause = getTextClause( $clusters );
61 $endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__
);
65 echo "Finding revisions...\n";
68 $res = $dbr->select( array( 'revision', 'text' ),
69 array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
71 'rev_id > ' . $dbr->addQuotes( $startId ),
74 "old_flags LIKE '%external%'",
78 'ORDER BY' => 'rev_id',
82 if ( !$res->numRows() ) {
86 $insertBatch = array();
87 foreach ( $res as $row ) {
88 $startId = $row->rev_id
;
89 $info = interpretPointer( $row->old_text
);
91 echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
94 if ( !in_array( $info['cluster'], $clusters ) ) {
95 echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
98 $insertBatch[] = array(
99 'bt_page' => $row->rev_page
,
100 'bt_rev_id' => $row->rev_id
,
101 'bt_text_id' => $row->old_id
,
102 'bt_cluster' => $info['cluster'],
103 'bt_blob_id' => $info['id'],
104 'bt_cgz_hash' => $info['hash']
107 $dbw->insert( 'blob_tracking', $insertBatch, __METHOD__
);
108 $rowsInserted +
= count( $insertBatch );
111 if ( $batchesDone >= $reportingInterval ) {
113 echo "$startId / $endId\n";
114 wfWaitForSlaves( 5 );
117 echo "Found $rowsInserted revisions\n";
121 * Scan the text table for orphan text
123 function trackOrphans( $clusters ) {
124 # Wait until the blob_tracking table is available in the slave
125 $dbw = wfGetDB( DB_MASTER
);
126 $dbr = wfGetDB( DB_SLAVE
);
127 $pos = $dbw->getMasterPos();
128 $dbr->masterPosWait( $pos, 100000 );
131 $reportingInterval = 10;
133 $textClause = getTextClause( $clusters );
135 $endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__
);
139 echo "Finding orphan text...\n";
141 # Scan the text table for orphan text
143 $res = $dbr->select( array( 'text', 'blob_tracking' ),
144 array( 'old_id', 'old_flags', 'old_text' ),
146 'old_id>' . $dbr->addQuotes( $startId ),
148 "old_flags LIKE '%external%'",
153 'ORDER BY' => 'old_id',
154 'LIMIT' => $batchSize
156 array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
159 foreach ( $res as $row ) {
160 $ids[] = $row->old_id
;
163 if ( !$res->numRows() ) {
167 $insertBatch = array();
168 foreach ( $res as $row ) {
169 $startId = $row->old_id
;
170 $info = interpretPointer( $row->old_text
);
172 echo "Invalid DB:// URL in old_id {$row->old_id}\n";
175 if ( !in_array( $info['cluster'], $clusters ) ) {
176 echo "Invalid cluster returned in SQL query\n";
180 $insertBatch[] = array(
183 'bt_text_id' => $row->old_id
,
184 'bt_cluster' => $info['cluster'],
185 'bt_blob_id' => $info['id'],
186 'bt_cgz_hash' => $info['hash']
189 $dbw->insert( 'blob_tracking', $insertBatch, __METHOD__
);
191 $rowsInserted +
= count( $insertBatch );
193 if ( $batchesDone >= $reportingInterval ) {
195 echo "$startId / $endId\n";
196 wfWaitForSlaves( 5 );
199 echo "Found $rowsInserted orphan text rows\n";