Stylize maintenance folder.
[lhc/web/wiklou.git] / maintenance / storage / fixBug20757.php
1 <?php
2
3 require_once( dirname( __FILE__ ) . '/../Maintenance.php' );
4
/**
 * Maintenance script that repairs text rows affected by bug 20757.
 *
 * It scans the text table for rows that hold an inline serialized
 * HistoryBlobStub (flagged "object" but not "external"), looks up the row the
 * stub points at in blob_tracking, verifies that the referenced item can still
 * be fetched from external storage and that its MD5 matches the stub's hash,
 * and then rewrites the text row as a plain external-storage URL. A
 * blob_tracking row is inserted for every text row that gets rewritten.
 *
 * Options:
 *  --dry-run  Report what would be done without writing to the database.
 *  --start    old_id to start scanning after.
 */
class FixBug20757 extends Maintenance {
	/** Maximum number of text rows fetched by each batch query (used as LIMIT) */
	public $batchSize = 10000;

	/** Cached rev_text_id => rev_id maps, keyed by page ID */
	public $mapCache = array();

	/** Total number of entries across all maps currently held in $mapCache */
	public $mapCacheSize = 0;

	/** Entry count above which cached maps are evicted before a new one is loaded */
	public $maxMapCacheSize = 1000000;

	/**
	 * Register the script description and its command-line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = 'Script to fix bug 20757 assuming that blob_tracking is intact';
		$this->addOption( 'dry-run', 'Report only' );
		$this->addOption( 'start', 'old_id to start at', false, true );
	}

	/**
	 * Scan the text table in batches and repair every recoverable stub row.
	 *
	 * Each candidate row is classified as fixed, unrecoverable or good, and a
	 * summary of the three counters is printed once no more rows are found.
	 * In dry-run mode the same checks and reports happen but nothing is
	 * written.
	 */
	public function execute() {
		$dbr = wfGetDB( DB_SLAVE );
		$dbw = wfGetDB( DB_MASTER );

		$dryRun = $this->getOption( 'dry-run' );
		if ( $dryRun ) {
			print "Dry run only.\n";
		}

		$startId = $this->getOption( 'start', 0 );
		$numGood = 0;
		$numFixed = 0;
		$numBad = 0;

		// Only used for the progress display
		$totalRevs = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );

		if ( $dbr->getType() == 'mysql'
			&& version_compare( $dbr->getServerVersion(), '4.1.0', '>=' ) )
		{
			// In MySQL 4.1+, the binary field old_text has a non-working LOWER() function
			$lowerLeft = 'LOWER(CONVERT(LEFT(old_text,22) USING latin1))';
		} else {
			// No CONVERT() in MySQL 4.0
			$lowerLeft = 'LOWER(LEFT(old_text,22))';
		}

		while ( true ) {
			print "ID: $startId / $totalRevs\r";

			// Fetch the next batch of rows that look like inline serialized stubs
			$res = $dbr->select(
				'text',
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id > ' . intval( $startId ),
					'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\'',
					"$lowerLeft = 'o:15:\"historyblobstub\"'",
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize,
				)
			);

			if ( !$res->numRows() ) {
				break;
			}

			$secondaryIds = array();
			$stubs = array();

			foreach ( $res as $row ) {
				// Advance the cursor even for rows that turn out to be bad, so
				// that the next batch starts after them
				$startId = $row->old_id;

				// Basic sanity checks
				$obj = unserialize( $row->old_text );
				if ( $obj === false ) {
					print "{$row->old_id}: unrecoverable: cannot unserialize\n";
					++$numBad;
					continue;
				}

				if ( !is_object( $obj ) ) {
					print "{$row->old_id}: unrecoverable: unserialized to type " .
						gettype( $obj ) . ", possible double-serialization\n";
					++$numBad;
					continue;
				}

				if ( strtolower( get_class( $obj ) ) !== 'historyblobstub' ) {
					print "{$row->old_id}: unrecoverable: unexpected object class " .
						get_class( $obj ) . "\n";
					++$numBad;
					continue;
				}

				// Process flags
				$flags = explode( ',', $row->old_flags );
				if ( in_array( 'utf-8', $flags ) || in_array( 'utf8', $flags ) ) {
					$legacyEncoding = false;
				} else {
					$legacyEncoding = true;
				}

				// Queue the stub for future batch processing
				$id = intval( $obj->mOldId );
				$secondaryIds[] = $id;
				$stubs[$row->old_id] = array(
					'legacyEncoding' => $legacyEncoding,
					'secondaryId' => $id,
					'hash' => $obj->mHash,
				);
			}

			$secondaryIds = array_unique( $secondaryIds );

			if ( !count( $secondaryIds ) ) {
				// Every row in this batch was rejected; move on to the next batch
				continue;
			}

			// Run the batch query on blob_tracking
			$res = $dbr->select(
				'blob_tracking',
				'*',
				array(
					'bt_text_id' => $secondaryIds,
				),
				__METHOD__
			);
			$trackedBlobs = array();
			foreach ( $res as $row ) {
				$trackedBlobs[$row->bt_text_id] = $row;
			}

			// Process the stubs
			foreach ( $stubs as $primaryId => $stub ) {
				$secondaryId = $stub['secondaryId'];
				if ( !isset( $trackedBlobs[$secondaryId] ) ) {
					// No tracked blob. Work out what went wrong
					$secondaryRow = $dbr->selectRow(
						'text',
						array( 'old_flags', 'old_text' ),
						array( 'old_id' => $secondaryId ),
						__METHOD__
					);
					if ( !$secondaryRow ) {
						print "$primaryId: unrecoverable: secondary row is missing\n";
						++$numBad;
					} elseif ( $this->isUnbrokenStub( $stub, $secondaryRow ) ) {
						// Not broken yet, and not in the tracked clusters so it won't get
						// broken by the current RCT run.
						++$numGood;
					} elseif ( strpos( $secondaryRow->old_flags, 'external' ) !== false ) {
						print "$primaryId: unrecoverable: secondary gone to {$secondaryRow->old_text}\n";
						++$numBad;
					} else {
						print "$primaryId: unrecoverable: miscellaneous corruption of secondary row\n";
						++$numBad;
					}
					unset( $stubs[$primaryId] );
					continue;
				}
				$trackRow = $trackedBlobs[$secondaryId];

				// Check that the specified text really is available in the tracked source row
				$url = "DB://{$trackRow->bt_cluster}/{$trackRow->bt_blob_id}/{$stub['hash']}";
				$text = ExternalStore::fetchFromURL( $url );
				if ( $text === false ) {
					print "$primaryId: unrecoverable: source text missing\n";
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}
				if ( md5( $text ) !== $stub['hash'] ) {
					print "$primaryId: unrecoverable: content hashes do not match\n";
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}

				// Find the page_id and rev_id
				// The page is probably the same as the page of the secondary row
				$pageId = intval( $trackRow->bt_page );
				if ( !$pageId ) {
					$revId = $pageId = 0;
				} else {
					$revId = $this->findTextIdInPage( $pageId, $primaryId );
					if ( !$revId ) {
						// Actually an orphan
						$pageId = $revId = 0;
					}
				}

				$newFlags = $stub['legacyEncoding'] ? 'external' : 'external,utf-8';

				if ( !$dryRun ) {
					// Reset the text row to point to the original copy
					$dbw->begin();
					$dbw->update(
						'text',
						// SET
						array(
							'old_flags' => $newFlags,
							'old_text' => $url
						),
						// WHERE
						array( 'old_id' => $primaryId ),
						__METHOD__
					);

					// Add a blob_tracking row so that the new reference can be recompressed
					// without needing to run trackBlobs.php again
					$dbw->insert( 'blob_tracking',
						array(
							'bt_page' => $pageId,
							'bt_rev_id' => $revId,
							'bt_text_id' => $primaryId,
							'bt_cluster' => $trackRow->bt_cluster,
							'bt_blob_id' => $trackRow->bt_blob_id,
							'bt_cgz_hash' => $stub['hash'],
							'bt_new_url' => null,
							'bt_moved' => 0,
						),
						__METHOD__
					);
					$dbw->commit();
					$this->waitForSlaves();
				}

				print "$primaryId: resolved to $url\n";
				++$numFixed;
			}
		}

		print "\n";
		print "Fixed: $numFixed\n";
		print "Unrecoverable: $numBad\n";
		print "Good stubs: $numGood\n";
	}

	/**
	 * Throttle writes: call wfWaitForSlaves( 5 ) on every 50th invocation.
	 *
	 * The counter is a function-static, so it persists across calls and is
	 * reset each time the wait is performed. It is incremented exactly once
	 * per call; comparing with a plain ">=" avoids the precedence trap of
	 * mixing ">" and "==" in one expression.
	 */
	public function waitForSlaves() {
		static $iteration = 0;
		if ( ++$iteration >= 50 ) {
			wfWaitForSlaves( 5 );
			$iteration = 0;
		}
	}

	/**
	 * Look up the revision ID that uses a given text ID within a page.
	 *
	 * @param $pageId Page whose revisions should be searched
	 * @param $textId Text row ID (rev_text_id) to look for
	 * @return The matching rev_id, or null if no revision of the page uses the text ID
	 */
	public function findTextIdInPage( $pageId, $textId ) {
		$ids = $this->getRevTextMap( $pageId );
		return isset( $ids[$textId] ) ? $ids[$textId] : null;
	}

	/**
	 * Get the rev_text_id => rev_id map for a page, loading and caching it on
	 * first use.
	 *
	 * @param $pageId Page ID (rev_page) to load the map for
	 * @return Array mapping rev_text_id to rev_id
	 */
	public function getRevTextMap( $pageId ) {
		if ( !isset( $this->mapCache[$pageId] ) ) {
			// Limit cache size by dropping the oldest cached maps first
			while ( $this->mapCacheSize > $this->maxMapCacheSize ) {
				// Rewind explicitly so that eviction does not depend on
				// wherever the array's internal pointer happens to be
				reset( $this->mapCache );
				$key = key( $this->mapCache );
				$this->mapCacheSize -= count( $this->mapCache[$key] );
				unset( $this->mapCache[$key] );
			}

			$dbr = wfGetDB( DB_SLAVE );
			$map = array();
			$res = $dbr->select( 'revision',
				array( 'rev_id', 'rev_text_id' ),
				array( 'rev_page' => $pageId ),
				__METHOD__
			);
			foreach ( $res as $row ) {
				$map[$row->rev_text_id] = $row->rev_id;
			}
			$this->mapCache[$pageId] = $map;
			$this->mapCacheSize += count( $map );
		}
		return $this->mapCache[$pageId];
	}

	/**
	 * This is based on part of HistoryBlobStub::getText().
	 * Determine if the text can be retrieved from the row in the normal way.
	 *
	 * @param $stub Stub info array as built by execute(); only 'hash' is read here
	 * @param $secondaryRow Text row (old_flags, old_text) that the stub points at
	 * @return True if the row yields an object from which the stub's item can
	 *   be read, false otherwise
	 */
	public function isUnbrokenStub( $stub, $secondaryRow ) {
		$flags = explode( ',', $secondaryRow->old_flags );
		$text = $secondaryRow->old_text;
		if ( in_array( 'external', $flags ) ) {
			$url = $text;
			// A usable URL has a non-empty path after the "://" separator
			$parts = explode( '://', $url, 2 );
			if ( count( $parts ) < 2 || $parts[1] == "" ) {
				return false;
			}
			$text = ExternalStore::fetchFromURL( $url );
			if ( $text === false ) {
				// Nothing could be fetched, so there is nothing to unserialize
				return false;
			}
		}
		if ( !in_array( 'object', $flags ) ) {
			return false;
		}

		if ( in_array( 'gzip', $flags ) ) {
			$text = gzinflate( $text );
			if ( $text === false ) {
				// Compressed data is damaged
				return false;
			}
		}
		$obj = unserialize( $text );

		if ( is_string( $obj ) ) {
			// Correct for old double-serialization bug.
			$obj = unserialize( $obj );
		}

		if ( !is_object( $obj ) ) {
			return false;
		}

		$obj->uncompress();
		$text = $obj->getItem( $stub['hash'] );
		return $text !== false;
	}
}
321
// Name the class defined in this file, then include the file that the
// DO_MAINTENANCE constant points to (defined outside this file; presumably the
// maintenance bootstrap that instantiates and runs $maintClass -- confirm).
$maintClass = 'FixBug20757';
require_once DO_MAINTENANCE;
324