'Chunked' 1.5 schema upgrader; incomplete, only does the main tables so far.
[lhc/web/wiklou.git] / maintenance / upgrade1_5.php
1 <?php
2
3 // Alternate 1.4 -> 1.5 schema upgrade
4 // This does only the main tables + UTF-8
5 // and is designed to allow upgrades to interleave
6 // with other updates on the replication stream so
7 // that large wikis can be upgraded without disrupting
8 // other services.
9
10 require_once( 'commandLine.inc' );
11
12 $upgrade = new FiveUpgrade();
13 $upgrade->upgrade();
14
15 class FiveUpgrade {
16 function FiveUpgrade() {
17 global $wgDatabase;
18 $this->conversionTables = $this->prepareWindows1252();
19 $this->dbw =& $this->newConnection();
20 $this->dbr =& $this->newConnection();
21 $this->dbr->bufferResults( false );
22 }
23
24 function upgrade() {
25 $this->upgradePage();
26 }
27
28
29 /**
30 * Open a second connection to the master server, with buffering off.
31 * This will let us stream large datasets in and write in chunks on the
32 * other end.
33 * @return Database
34 * @access private
35 */
36 function &newConnection() {
37 global $wgDBadminuser, $wgDBadminpassword;
38 global $wgDBserver, $wgDBname;
39 $db =& new Database( $wgDBserver, $wgDBadminuser, $wgDBadminpassword, $wgDBname );
40 return $db;
41 }
42
43 /**
44 * Prepare a conversion array for converting Windows Code Page 1252 to
45 * UTF-8. This should provide proper conversion of text that was miscoded
46 * as Windows-1252 by naughty user-agents, and doesn't rely on an outside
47 * iconv library.
48 *
49 * @return array
50 * @access private
51 */
52 function prepareWindows1252() {
53 # Mappings from:
54 # http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
55 static $cp1252 = array(
56 0x80 => 0x20AC, #EURO SIGN
57 0x81 => UNICODE_REPLACEMENT,
58 0x82 => 0x201A, #SINGLE LOW-9 QUOTATION MARK
59 0x83 => 0x0192, #LATIN SMALL LETTER F WITH HOOK
60 0x84 => 0x201E, #DOUBLE LOW-9 QUOTATION MARK
61 0x85 => 0x2026, #HORIZONTAL ELLIPSIS
62 0x86 => 0x2020, #DAGGER
63 0x87 => 0x2021, #DOUBLE DAGGER
64 0x88 => 0x02C6, #MODIFIER LETTER CIRCUMFLEX ACCENT
65 0x89 => 0x2030, #PER MILLE SIGN
66 0x8A => 0x0160, #LATIN CAPITAL LETTER S WITH CARON
67 0x8B => 0x2039, #SINGLE LEFT-POINTING ANGLE QUOTATION MARK
68 0x8C => 0x0152, #LATIN CAPITAL LIGATURE OE
69 0x8D => UNICODE_REPLACEMENT,
70 0x8E => 0x017D, #LATIN CAPITAL LETTER Z WITH CARON
71 0x8F => UNICODE_REPLACEMENT,
72 0x90 => UNICODE_REPLACEMENT,
73 0x91 => 0x2018, #LEFT SINGLE QUOTATION MARK
74 0x92 => 0x2019, #RIGHT SINGLE QUOTATION MARK
75 0x93 => 0x201C, #LEFT DOUBLE QUOTATION MARK
76 0x94 => 0x201D, #RIGHT DOUBLE QUOTATION MARK
77 0x95 => 0x2022, #BULLET
78 0x96 => 0x2013, #EN DASH
79 0x97 => 0x2014, #EM DASH
80 0x98 => 0x02DC, #SMALL TILDE
81 0x99 => 0x2122, #TRADE MARK SIGN
82 0x9A => 0x0161, #LATIN SMALL LETTER S WITH CARON
83 0x9B => 0x203A, #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
84 0x9C => 0x0153, #LATIN SMALL LIGATURE OE
85 0x9D => UNICODE_REPLACEMENT,
86 0x9E => 0x017E, #LATIN SMALL LETTER Z WITH CARON
87 0x9F => 0x0178, #LATIN CAPITAL LETTER Y WITH DIAERESIS
88 );
89 $pairs = array();
90 for( $i = 0; $i < 0x100; $i++ ) {
91 $unicode = isset( $cp1252[$i] ) ? $cp1252[$i] : $i;
92 $pairs[chr( $i )] = codepointToUtf8( $unicode );
93 }
94 return $pairs;
95 }
96
97 /**
98 * Convert from 8-bit Windows-1252 to UTF-8 if necessary.
99 * @param string $text
100 * @return string
101 * @access private
102 */
103 function conv( $text ) {
104 global $wgUseLatin1;
105 if( $wgUseLatin1 ) {
106 return strtr( $text, $this->conversionTables );
107 } else {
108 return $text;
109 }
110 }
111
112 /**
113 * Dump timestamp and message to output
114 * @param string $message
115 * @access private
116 */
117 function log( $message ) {
118 echo wfTimestamp( TS_DB ) . ': ' . $message . "\n";
119 flush();
120 }
121
122 /**
123 * Initialize the chunked-insert system.
124 * Rows will be inserted in chunks of the given number, rather
125 * than in a giant INSERT...SELECT query, to keep the serialized
126 * MySQL database replication from getting hung up. This way other
127 * things can be going on during conversion without waiting for
128 * slaves to catch up as badly.
129 *
130 * @param int $chunksize Number of rows to insert at once
131 * @param int $final Total expected number of rows / id of last row,
132 * used for progress reports.
133 * @access private
134 */
135 function setChunkScale( $chunksize, $final ) {
136 $this->chunkSize = $chunksize;
137 $this->chunkFinal = $final;
138 $this->chunkCount = 0;
139 $this->chunkStartTime = wfTime();
140 }
141
142 /**
143 * Chunked inserts: perform an insert if we've reached the chunk limit.
144 * Prints a progress report with estimated completion time.
145 * @param string $table
146 * @param array &$chunk -- This will be emptied if an insert is done.
147 * @param string $fname function name to report in SQL
148 * @param int $key A key identifier to use in progress estimation in
149 * place of the number of rows inserted. Use this if
150 * you provided a max key number instead of a count
151 * as the final chunk number in setChunkScale()
152 * @access private
153 */
154 function addChunk( $table, &$chunk, $fname, $key = null ) {
155 if( count( $chunk ) >= $this->chunkSize ) {
156 $this->insertChunk( $table, $chunk, $fname );
157
158 $this->chunkCount += count( $chunk );
159 $now = wfTime();
160 $delta = $now - $this->chunkStartTime;
161 $rate = $this->chunkCount / $delta;
162
163 if( is_null( $key ) ) {
164 $completed = $this->chunkCount;
165 } else {
166 $completed = $key;
167 }
168 $portion = $completed / $this->chunkFinal;
169
170 $estimatedTotalTime = $delta / $portion;
171 $eta = $now + $estimatedTotalTime;
172
173 printf( "%s: %3.2f%% done on %s; ETA %s [%d/%d] %.2f/sec\n",
174 wfTimestamp( TS_DB, intval( $now ) ),
175 $portion * 100.0,
176 $table,
177 wfTimestamp( TS_DB, intval( $eta ) ),
178 $completed,
179 $this->chunkFinal,
180 $rate );
181 flush();
182
183 $chunk = array();
184 }
185 }
186
187 /**
188 * Chunked inserts: perform an insert unconditionally, at the end, and log.
189 * @param string $table
190 * @param array &$chunk -- This will be emptied if an insert is done.
191 * @param string $fname function name to report in SQL
192 * @access private
193 */
194 function lastChunk( $table, &$chunk, $fname ) {
195 $n = count( $chunk );
196 if( $n > 0 ) {
197 $this->insertChunk( $table, $chunk, $fname );
198 }
199 $this->log( "100% done on $table (last chunk $n rows)." );
200 }
201
202 /**
203 * Chunked inserts: perform an insert.
204 * @param string $table
205 * @param array &$chunk -- This will be emptied if an insert is done.
206 * @param string $fname function name to report in SQL
207 * @access private
208 */
209 function insertChunk( $table, &$chunk, $fname ) {
210 $this->dbw->insert( $table, $chunk, $fname );
211 }
212
213 function upgradePage() {
214 $fname = "FiveUpgrade::upgradePage";
215 $chunksize = 500;
216
217 $this->log( "...converting from cur/old to page/revision/text DB structure." );
218
219 extract( $this->dbw->tableNames( 'cur', 'old', 'page', 'revision', 'text' ) );
220
221 $this->log( "Creating page and revision tables..." );
222 $this->dbw->query("CREATE TABLE $page (
223 page_id int(8) unsigned NOT NULL auto_increment,
224 page_namespace int NOT NULL,
225 page_title varchar(255) binary NOT NULL,
226 page_restrictions tinyblob NOT NULL default '',
227 page_counter bigint(20) unsigned NOT NULL default '0',
228 page_is_redirect tinyint(1) unsigned NOT NULL default '0',
229 page_is_new tinyint(1) unsigned NOT NULL default '0',
230 page_random real unsigned NOT NULL,
231 page_touched char(14) binary NOT NULL default '',
232 page_latest int(8) unsigned NOT NULL,
233 page_len int(8) unsigned NOT NULL,
234
235 PRIMARY KEY page_id (page_id),
236 UNIQUE INDEX name_title (page_namespace,page_title),
237 INDEX (page_random),
238 INDEX (page_len)
239 ) TYPE=InnoDB", $fname );
240 $this->dbw->query("CREATE TABLE $revision (
241 rev_id int(8) unsigned NOT NULL auto_increment,
242 rev_page int(8) unsigned NOT NULL,
243 rev_comment tinyblob NOT NULL default '',
244 rev_user int(5) unsigned NOT NULL default '0',
245 rev_user_text varchar(255) binary NOT NULL default '',
246 rev_timestamp char(14) binary NOT NULL default '',
247 rev_minor_edit tinyint(1) unsigned NOT NULL default '0',
248 rev_deleted tinyint(1) unsigned NOT NULL default '0',
249
250 PRIMARY KEY rev_page_id (rev_page, rev_id),
251 UNIQUE INDEX rev_id (rev_id),
252 INDEX rev_timestamp (rev_timestamp),
253 INDEX page_timestamp (rev_page,rev_timestamp),
254 INDEX user_timestamp (rev_user,rev_timestamp),
255 INDEX usertext_timestamp (rev_user_text,rev_timestamp)
256 ) TYPE=InnoDB", $fname );
257
258 $maxold = $this->dbw->selectField( 'old', 'max(old_id)', '', $fname );
259 $this->log( "Last old record is {$maxold}" );
260
261 global $wgLegacySchemaConversion;
262 if( $wgLegacySchemaConversion ) {
263 // Create HistoryBlobCurStub entries.
264 // Text will be pulled from the leftover 'cur' table at runtime.
265 echo "......Moving metadata from cur; using blob references to text in cur table.\n";
266 $cur_text = "concat('O:18:\"historyblobcurstub\":1:{s:6:\"mCurId\";i:',cur_id,';}')";
267 $cur_flags = "'object'";
268 } else {
269 // Copy all cur text in immediately: this may take longer but avoids
270 // having to keep an extra table around.
271 echo "......Moving text from cur.\n";
272 $cur_text = 'cur_text';
273 $cur_flags = "''";
274 }
275
276 $maxcur = $this->dbw->selectField( 'cur', 'max(cur_id)', '', $fname );
277 $this->log( "Last cur entry is $maxcur" );
278
279 /**
280 * Copy placeholder records for each page's current version into old
281 * Don't do any conversion here; text records are converted at runtime
282 * based on the flags (and may be originally binary!) while the meta
283 * fields will be converted in the old -> rev and cur -> page steps.
284 */
285 $this->setChunkScale( $chunksize, $maxcur );
286 $result = $this->dbr->query(
287 "SELECT cur_id, cur_namespace, cur_title, $cur_text AS text, cur_comment,
288 cur_user, cur_user_text, cur_timestamp, cur_minor_edit, $cur_flags AS flags
289 FROM $cur
290 ORDER BY cur_id", $fname );
291 $add = array();
292 while( $row = $this->dbr->fetchObject( $result ) ) {
293 $add[] = array(
294 'old_namespace' => $row->cur_namespace,
295 'old_title' => $row->cur_title,
296 'old_text' => $row->text,
297 'old_comment' => $row->cur_comment,
298 'old_user' => $row->cur_user,
299 'old_user_text' => $row->cur_user_text,
300 'old_timestamp' => $row->cur_timestamp,
301 'old_minor_edit' => $row->cur_minor_edit,
302 'old_flags' => $row->flags );
303 $this->addChunk( 'old', $add, $fname, $row->cur_id );
304 }
305 $this->lastChunk( 'old', $add, $fname );
306 $this->dbr->freeResult( $result );
307
308 /**
309 * Copy revision metadata from old into revision.
310 * We'll also do UTF-8 conversion of usernames and comments.
311 */
312 $newmaxold = $this->dbw->selectField( 'old', 'max(old_id)', '', $fname );
313 $this->setChunkScale( $chunksize, $newmaxold );
314
315 $this->log( "......Setting up revision table." );
316 $result = $this->dbr->query(
317 "SELECT old_id, cur_id, old_comment, old_user, old_user_text,
318 old_timestamp, old_minor_edit
319 FROM $old,$cur WHERE old_namespace=cur_namespace AND old_title=cur_title
320 ORDER BY old_id", $fname );
321
322 $add = array();
323 while( $row = $this->dbr->fetchObject( $result ) ) {
324 $add[] = array(
325 'rev_id' => $row->old_id,
326 'rev_page' => $row->cur_id,
327 'rev_comment' => $this->conv( $row->old_comment ),
328 'rev_user' => $row->old_user,
329 'rev_user_text' => $this->conv( $row->old_user_text ),
330 'rev_timestamp' => $row->old_timestamp,
331 'rev_minor_edit' => $row->old_minor_edit );
332 $this->addChunk( 'revision', $add, $fname, $row->old_id );
333 }
334 $this->lastChunk( 'revision', $add, $fname );
335 $this->dbr->freeResult( $result );
336
337
338 /**
339 * Copy revision metadata from cur into page.
340 * We'll also do UTF-8 conversion of titles.
341 */
342 $this->log( "......Setting up page table." );
343 $this->setChunkScale( $chunksize, $maxcur );
344 $result = $this->dbr->query( "
345 SELECT cur_id, cur_namespace, cur_title, cur_restrictions, cur_counter, cur_is_redirect, cur_is_new,
346 cur_random, cur_touched, rev_id, LENGTH(cur_text) AS len
347 FROM $cur,$revision
348 WHERE cur_id=rev_page AND rev_timestamp=cur_timestamp AND rev_id > {$maxold}
349 ORDER BY cur_id", $fname );
350 $add = array();
351 while( $row = $this->dbr->fetchObject( $result ) ) {
352 $add[] = array(
353 'page_id' => $row->cur_id,
354 'page_namespace' => $row->cur_namespace,
355 'page_title' => $this->conv( $row->cur_title ),
356 'page_restrictions' => $row->cur_restrictions,
357 'page_counter' => $row->cur_counter,
358 'page_is_redirect' => $row->cur_is_redirect,
359 'page_is_new' => $row->cur_is_new,
360 'page_random' => $row->cur_random,
361 'page_touched' => $this->dbw->timestamp(),
362 'page_latest' => $row->rev_id,
363 'page_len' => $row->len );
364 $this->addChunk( 'page', $add, $fname, $row->cur_id );
365 }
366 $this->lastChunk( 'page', $add, $fname );
367 $this->dbr->freeResult( $result );
368
369 $this->log( "......Renaming old." );
370 $this->dbw->query( "ALTER TABLE $old RENAME TO $text", $fname );
371
372 $this->log( "...done." );
373 }
374 }
375
376 ?>