36eb731bbb22f5b4f328b449895178ab6b576ef5
[lhc/web/wiklou.git] / maintenance / storage / trackBlobs.php
1 <?php
2
require( dirname( __FILE__ ) .'/../commandLine.inc' );

// $args is not defined in this file; presumably it is populated by
// commandLine.inc with the positional command-line arguments (TODO confirm).
// At least one cluster name is required, otherwise print usage and fail.
if ( count( $args ) === 0 ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n" .
		"Adds blobs from a given ES cluster to the blob_tracking table\n";
	exit( 1 );
}

// Every argument is treated as a cluster name.
trackBlobs( $args );
13
/**
 * Entry point: populate blob_tracking for the given clusters.
 *
 * Runs three steps in order: make sure the tracking table exists, record
 * blobs referenced from revisions, then record blobs referenced only from
 * text rows that have no tracking entry yet.
 *
 * @param $clusters Array of cluster names
 */
function trackBlobs( $clusters ) {
	// Step 1: the table must exist before anything is inserted into it
	initTrackingTable();

	// Step 2: text rows reachable through the revision table
	trackRevisions( $clusters );

	// Step 3: remaining text rows that step 2 did not record
	trackOrphans( $clusters );
}
19
/**
 * Create the blob_tracking table if it is not there yet, by sourcing
 * blob_tracking.sql from this script's directory on the master connection.
 */
function initTrackingTable() {
	$master = wfGetDB( DB_MASTER );
	if ( $master->tableExists( 'blob_tracking' ) ) {
		// Already present, nothing to do
		return;
	}
	$schemaFile = dirname( __FILE__ ) . '/blob_tracking.sql';
	$master->sourceFile( $schemaFile );
}
26
/**
 * Build an SQL condition matching old_text values that start with
 * "DB://<cluster>/" for any of the given clusters.
 *
 * @param $clusters Array of cluster names
 * @return String: the per-cluster LIKE conditions joined with " OR ",
 *   or an empty string when $clusters is empty
 */
function getTextClause( $clusters ) {
	$dbr = wfGetDB( DB_SLAVE );
	$alternatives = array();
	foreach ( $clusters as $cluster ) {
		// Escape the literal prefix for LIKE, then append the wildcard
		$pattern = $dbr->escapeLike( "DB://$cluster/" ) . '%';
		$alternatives[] = 'old_text LIKE ' . $dbr->addQuotes( $pattern );
	}
	return implode( ' OR ', $alternatives );
}
38
/**
 * Parse an external storage pointer of the form
 * DB://<cluster>/<id> or DB://<cluster>/<id>/<hash>.
 *
 * @param $text String: the pointer text to parse
 * @return Mixed: false if $text does not match the expected form, otherwise
 *   an array with keys:
 *     'cluster' - the cluster name (string)
 *     'id'      - the numeric blob ID (integer)
 *     'hash'    - the hexadecimal hash component (string), or null when
 *                 the pointer has no hash component
 */
function interpretPointer( $text ) {
	if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
		return false;
	}
	return array(
		'cluster' => $m[1],
		'id' => intval( $m[2] ),
		// Group 3 is only present in $m when the optional "/<hash>" part
		// matched. Use group 3 itself here, not the blob ID from group 2.
		'hash' => isset( $m[3] ) ? $m[3] : null
	);
}
49
/**
 * Walk the revision table in rev_id order and, for every revision whose
 * text row is an external pointer into one of the given clusters, add a
 * row to blob_tracking.
 *
 * Rows are read from the slave connection in small batches and written to
 * the master connection. Progress is echoed periodically.
 *
 * @param $clusters Array of cluster names
 */
function trackRevisions( $clusters ) {
	$dbw = wfGetDB( DB_MASTER );
	$dbr = wfGetDB( DB_SLAVE );
	$limit = 10;
	$batchesPerReport = 10;

	$clusterCond = getTextClause( $clusters );
	$startId = 0;
	// Only used for the progress display
	$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
	$batchesSinceReport = 0;
	$rowsInserted = 0;

	echo "Finding revisions...\n";

	// Query pieces that do not change between batches
	$tables = array( 'revision', 'text' );
	$fields = array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' );
	$options = array(
		'ORDER BY' => 'rev_id',
		'LIMIT' => $limit
	);

	for ( ;; ) {
		// Resume just after the last rev_id seen in the previous batch
		$conds = array(
			'rev_id > ' . $dbr->addQuotes( $startId ),
			'rev_text_id=old_id',
			$clusterCond,
			"old_flags LIKE '%external%'",
		);
		$result = $dbr->select( $tables, $fields, $conds, __METHOD__, $options );
		if ( !$result->numRows() ) {
			// No more matching revisions
			break;
		}

		$pending = array();
		foreach ( $result as $row ) {
			// Advance the cursor even for rows that end up being skipped
			$startId = $row->rev_id;
			$info = interpretPointer( $row->old_text );
			if ( !$info ) {
				echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
			} elseif ( !in_array( $info['cluster'], $clusters ) ) {
				echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
			} else {
				$pending[] = array(
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
			}
		}
		$dbw->insert( 'blob_tracking', $pending, __METHOD__ );
		$rowsInserted += count( $pending );

		$batchesSinceReport++;
		if ( $batchesSinceReport < $batchesPerReport ) {
			continue;
		}

		// Every $batchesPerReport batches: report progress and pause
		// (presumably to let replication catch up -- confirm against
		// wfWaitForSlaves)
		$batchesSinceReport = 0;
		echo "$startId / $endId\n";
		wfWaitForSlaves( 5 );
	}
	echo "Found $rowsInserted revisions\n";
}
119
/**
 * Scan the text table for orphan text: external pointers into the given
 * clusters that have no row in blob_tracking yet. Each one is recorded in
 * blob_tracking with bt_page and bt_rev_id set to 0.
 *
 * Rows are read from the slave connection in small batches, ordered by
 * old_id, and written to the master connection. Progress is echoed
 * periodically.
 *
 * @param $clusters Array of cluster names
 */
function trackOrphans( $clusters ) {
	# Wait until the blob_tracking table is available in the slave
	$dbw = wfGetDB( DB_MASTER );
	$dbr = wfGetDB( DB_SLAVE );
	$pos = $dbw->getMasterPos();
	# NOTE(review): the second argument is presumably a timeout -- confirm
	$dbr->masterPosWait( $pos, 100000 );

	$batchSize = 10;
	$reportingInterval = 10;

	$textClause = getTextClause( $clusters );
	$startId = 0;
	# Only used for the progress display
	$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
	$rowsInserted = 0;
	$batchesDone = 0;

	echo "Finding orphan text...\n";

	# Scan the text table for orphan text
	while ( true ) {
		# The LEFT JOIN plus "bt_text_id IS NULL" keeps only text rows that
		# have no matching blob_tracking row
		$res = $dbr->select( array( 'text', 'blob_tracking' ),
			array( 'old_id', 'old_flags', 'old_text' ),
			array(
				'old_id>' . $dbr->addQuotes( $startId ),
				$textClause,
				"old_flags LIKE '%external%'",
				'bt_text_id IS NULL'
			),
			__METHOD__,
			array(
				'ORDER BY' => 'old_id',
				'LIMIT' => $batchSize
			),
			array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
		);

		if ( !$res->numRows() ) {
			break;
		}

		$insertBatch = array();
		foreach ( $res as $row ) {
			# Advance the cursor even for rows that end up being skipped
			$startId = $row->old_id;
			$info = interpretPointer( $row->old_text );
			if ( !$info ) {
				echo "Invalid DB:// URL in old_id {$row->old_id}\n";
				continue;
			}
			if ( !in_array( $info['cluster'], $clusters ) ) {
				echo "Invalid cluster returned in SQL query\n";
				continue;
			}

			# Orphans have no page or revision, so both are recorded as 0
			$insertBatch[] = array(
				'bt_page' => 0,
				'bt_rev_id' => 0,
				'bt_text_id' => $row->old_id,
				'bt_cluster' => $info['cluster'],
				'bt_blob_id' => $info['id'],
				'bt_cgz_hash' => $info['hash']
			);
		}
		$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );

		$rowsInserted += count( $insertBatch );
		++$batchesDone;
		if ( $batchesDone >= $reportingInterval ) {
			$batchesDone = 0;
			echo "$startId / $endId\n";
			wfWaitForSlaves( 5 );
		}
	}
	echo "Found $rowsInserted orphan text rows\n";
}