PHPDoc comments and placeholder. Part of the subpackage "maintenance", archives...
[lhc/web/wiklou.git] / maintenance / archives / importUseModWiki.php
1 <?php
2 /**
3 * @deprecated
4 * @package MediaWiki
5 * @subpackage MaintenanceArchive
6 */
7
8 /** */
# This importer is kept for reference only, so refuse to run.
# Each notice ends with a newline so that the two messages do not run
# together on the terminal.
print "This script is obsolete!\n";
print "It is retained in the source here in case some of its
code might be useful for ad-hoc conversion tasks, but it is
not maintained and probably won't even work as is.\n";
exit();
14
15 /*
16 Import data from a UseModWiki into a PediaWiki wiki
17 2003-02-09 Brion VIBBER <brion@pobox.com>
18 Based loosely on Magnus's code from 2001-2002
19
20 Pass one: collect data on links & title case, users
21 Pass two: spit out SQL for the users and pages
22 Separately, be sure to run the link & index rebuilding scripts!
23
24 */
25
/* globals
   Site-specific settings and shared caches used by the functions below.
 */
# Root of the UseModWiki database directory to read from (machine-specific)
$wgRootDirectory = "/home/brion/vikio/wiki-ca/lib-http/db/wiki";
$wgFieldSeparator = "\xb3"; # Some wikis may use different char
# The numbered separators are the base separator followed by a digit
$FS = $wgFieldSeparator ;
$FS1 = $FS."1" ;
$FS2 = $FS."2" ;
$FS3 = $FS."3" ;

# Images to import
# Capture group 1 is the whole URL, group 2 is the file name
$imageimport = '(http:\/\/(?:www\.|meta\.|)wikipedia\.(?:com|org)\/upload\/(?:[a-z]\/[a-z][0-9]\/)?(.*\.(?:gif|jpg|jpeg|png)))';

# Number of *seconds to add* to timestamp to get UTC/GMT
#$wgTimezoneCorrection = 0; # GMT
$wgTimezoneCorrection = 8*3600; # PST - California

# Other options...
$historyonly = false; # Don't add converted revisions to cur table; just get old histories
$lasthistoryonly = false; # Only add the _original_ form of the _current_ revision

/* Vary by language */
$namespaces = array( 0 => "", 1 => "Talk:", 2 => "User:", 3 => "User_talk:", 4
=> "Wikipedia:", 5 => "Wikipedia_talk:", 6 => "Image:", 7 => "Image_talk:" );
$talkending = "Talk";
$mediatext = "Media";
$conversionscript = "Conversion script";
$conversioncomment = "Automatic conversion";
# "$1" is replaced with the redirect target in doRenamePage()
$redirectcomment = "Automatic conversion, moved to \$1";
$conversiontime = gmdate( "YmdHis" ); # Conversions will be marked with this timestamp

# Stats and caches
$oldtitles = array();
$usercache = array();
$titlecache = array();
$linkcache = array();
61
/**
 * Some oversimplified test types
 *
 * Bare-bones title holder: a namespace number plus the title text.
 *
 * @deprecated
 * @package MediaWiki
 * @subpackage MaintenanceArchive
 */
class Title {
	/** Title text */
	public $title;
	/** Namespace number */
	public $namespace;

	/**
	 * Build a Title from its two parts.
	 *
	 * Declared static because it is invoked as Title::fromData(); calling
	 * a non-static method that way is a fatal error on current PHP.
	 *
	 * @param int $namespace namespace number
	 * @param string $title title text
	 * @return Title
	 */
	public static function fromData( $namespace, $title ) {
		$x = new Title;
		$x->namespace = $namespace;
		$x->title = $title;
		return $x;
	}
}
78
# See tests in importTests.php
# Presumably a test harness sets $testingonly before including this file;
# empty() treats "never set" the same as false without raising a notice.
if( empty( $testingonly ) ) {
	firstPass();
	secondPass();
}
84
85 # ------------------------------------------------------------------------------
86
/**
 * First pass: gather information.
 *
 * Scans "$wgRootDirectory/page/<X>" for X = A..Z and finally "other",
 * handing each directory to firstPassDirectory().
 */
function firstPass()
{
	global $wgRootDirectory, $oldtitles;

	$buckets = range( 'A', 'Z' );
	$buckets[] = 'other';
	foreach( $buckets as $bucket ) {
		$pageDir = "$wgRootDirectory/page/$bucket";
		firstPassDirectory( $pageDir );
	}
}
102
/**
 * Recursively scan one page directory during the first pass.
 *
 * Every "<name>.db" file is registered in $titlecache under <name> and its
 * outgoing links are counted. Subdirectories are scanned recursively. Any
 * other file is reported as an SQL comment on the output and skipped.
 *
 * @param string $dir directory to scan
 */
function firstPassDirectory( $dir )
{
	global $titlecache;

	$mydir = opendir( $dir );
	if( $mydir === false ) {
		echo "-- Couldn't open directory '$dir'. Skipping.\n";
		return;
	}
	# Compare against false explicitly: an entry named "0" is falsy and
	# would otherwise end the loop early.
	while( ( $entry = readdir( $mydir ) ) !== false ) {
		if( $entry == '.' || $entry == '..' ) {
			continue;
		}
		if( is_dir( "$dir/$entry" ) ) {
			firstPassDirectory( "$dir/$entry" );
		} elseif( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			# The title is the file name without its ".db" suffix
			$title = $m[1];
			$titlecache[$title] = transformTitle( $title );
			countLinksFrom( $title );
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}
	closedir( $mydir );
}
121
/**
 * Second pass: make the SQL.
 *
 * Echoes the import SQL for every cached user, then for every cached
 * page title, followed by a closing "Done" comment.
 */
function secondPass()
{
	global $titlecache, $usercache, $redirects;

	# Only the keys (the old names) are needed from either cache
	$oldnames = array_keys( $usercache );
	foreach( $oldnames as $oldname ) {
		echo importUser( $oldname );
	}

	$oldtitles = array_keys( $titlecache );
	foreach( $oldtitles as $oldtitle ) {
		echo importPage( $oldtitle );
	}

	echo "\n-- Done!\n";
}
138
139
140 # ------------------------------------------------------------------------------
141
/* fetch_ functions
   Grab a given item from the database
 */

/**
 * Load the stored record for one UseModWiki user.
 *
 * The previous code built the path from an undefined $title and called
 * splitHash() without a separator, so it could never return data.
 *
 * NOTE(review): the path (user/<uid mod 10>/<uid>.db) and the use of the
 * "1" separator follow the usual UseModWiki layout as far as I recall;
 * confirm against the wiki being converted before enabling importUser().
 *
 * @param int|string $uid UseModWiki user id
 * @return array|false key => value map, or false if there is no such file
 */
function fetchUser( $uid )
{
	global $FS1, $wgRootDirectory;

	$fname = $wgRootDirectory . "/user/" . ( (int)$uid % 10 ) . "/" . $uid . ".db";
	if( !file_exists( $fname ) ) return false;

	$data = splitHash( $FS1, implode( "", file( $fname ) ) );
	# enough?

	return $data;
}
157
/**
 * Load the current revision of a page.
 *
 * The page file is unpacked in three levels: the file is split on $FS1,
 * its "text_default" entry on $FS2, and that section's "data" on $FS3.
 *
 * The file is looked up where firstPass() scans for pages:
 * "$wgRootDirectory/page/<bucket>/<title>.db".
 * NOTE(review): assumes the bucket is the upper-cased first letter of the
 * title, or "other" for anything else; pages found in nested
 * subdirectories are not handled. Confirm against the source wiki.
 *
 * @param string $title page name as collected by firstPassDirectory()
 * @return array|false array with keys text, summary, minor, ts, username
 *         and host (missing fields become ""), or false if the file does
 *         not exist
 */
function fetchPage( $title )
{
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	$first = substr( $title, 0, 1 );
	$bucket = preg_match( '/^[A-Za-z]$/', $first ) ? strtoupper( $first ) : "other";
	$fname = $wgRootDirectory . "/page/" . $bucket . "/" . $title . ".db";
	if( !file_exists( $fname ) ) return false;

	$page = splitHash( $FS1, implode( "", file( $fname ) ) );
	$section = splitHash( $FS2, isset( $page["text_default"] ) ? $page["text_default"] : "" );
	$text = splitHash( $FS3, isset( $section["data"] ) ? $section["data"] : "" );

	# Default absent fields to "" so malformed files don't raise warnings
	$result = array();
	foreach( array( "text", "summary", "minor" ) as $key ) {
		$result[$key] = isset( $text[$key] ) ? $text[$key] : "";
	}
	foreach( array( "ts", "username", "host" ) as $key ) {
		$result[$key] = isset( $section[$key] ) ? $section[$key] : "";
	}
	return $result;
}
173
/**
 * Load the kept (old) revisions of a page.
 *
 * The ".kp" file is split on $FS1 (which was previously used here without
 * being declared global). The first chunk is discarded, and each remaining
 * chunk is unpacked with $FS2 and $FS3 like a page section.
 * Revisions with no text, an empty "minor" field or a non-positive
 * timestamp are reported on the output and skipped.
 *
 * NOTE(review): the file is read from "keep/<title>.kp" directly under the
 * root; verify whether the source wiki buckets kept files by letter.
 *
 * @param string $title page name
 * @return array list of arrays with keys text, summary, minor, ts,
 *         username and host; empty if there is no kept file
 */
function fetchKeptPages( $title )
{
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/keep/" . $title . ".kp";
	if( !file_exists( $fname ) ) return array();

	$keptlist = explode( $FS1, implode( "", file( $fname ) ) );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = splitHash( $FS3, isset( $section["data"] ) ? $section["data"] : "" );

		$usable = !empty( $text["text"] )
			&& isset( $text["minor"] ) && $text["minor"] != ""
			&& isset( $section["ts"] ) && ( (int)$section["ts"] > 0 );
		if ( $usable ) {
			array_push( $revisions, array(
				"text" => $text["text"],
				"summary" => isset( $text["summary"] ) ? $text["summary"] : "",
				"minor" => $text["minor"],
				"ts" => $section["ts"],
				"username" => isset( $section["username"] ) ? $section["username"] : "",
				"host" => isset( $section["host"] ) ? $section["host"] : "" ) );
		} else {
			echo "-- skipped a bad old revision\n";
		}
	}
	return $revisions;
}
198
/**
 * Turn a flat "key SEP value SEP key SEP value ..." string into a map.
 *
 * A trailing key with no value is dropped; a repeated key keeps its last
 * value.
 *
 * @param string $sep separator between the fields
 * @param string $str packed string
 * @return array key => value
 */
function splitHash ( $sep , $str ) {
	$pairs = array_chunk( explode( $sep, $str ), 2 );
	$hash = array();
	foreach ( $pairs as $pair ) {
		if ( count( $pair ) < 2 ) {
			continue; # dangling key without a value
		}
		list( $key, $value ) = $pair;
		$hash[$key] = $value;
	}
	return $hash;
}
207
208
209 /* import_ functions
210 Take a fetched item and produce SQL
211 */
212
/**
 * importUser
 * $uid is the UseMod user id number.
 * The new ones will be assigned arbitrarily and are for internal use only.
 *
 * THIS IS DELAYED SINCE PUBLIC DUMPS DONT INCLUDE USER DIR
 *
 * The function therefore returns "" immediately. The statements after the
 * return are an unreachable draft, kept for whoever finishes the job. The
 * draft now reads the timezone offset from $wgTimezoneCorrection (the
 * global this file actually defines) and treats the fetched record as the
 * array that splitHash() produces.
 *
 * @param int|string $uid UseModWiki user id
 * @return string SQL for the user (currently always "")
 */
function importUser( $uid )
{
	global $last_uid, $user_list, $wgTimezoneCorrection;

	return "";

	# ---- unreachable draft below ----
	$stuff = fetchUser( $uid );
	$last_uid++;

	$name = wfStrencode( $stuff['username'] );
	# NOTE(review): md5hash() is not defined in this file; supply it (or
	# pick the hashing the target wiki expects) before enabling this.
	$hash = md5hash( $stuff['password'] ); # Doable?
	# NOTE(review): the examples in the trailing comment disagree with each
	# other about the sign; check the arithmetic before relying on it.
	$tzoffset = $stuff['tzoffset'] - ($wgTimezoneCorrection / 3600); # -8 to 0; +9 to +1
	$hideminor = ($stuff['rcall'] ? 0 : 1);
	$options = "cols={$stuff['editcols']}
rows={$stuff['editrows']}
rcdays={$stuff['rcdays']}
timecorrection={$tzoffset}
hideminor={$hideminor}
";

	$sql = "INSERT
	INTO user (user_id,user_name,user_password,user_options)
	VALUES ({$last_uid},'{$name}','{$hash}','{$options}');\n";
	return $sql;
}
244
/**
 * Work out the user id and SQL-escaped user text to credit an edit to.
 *
 * $usercache is read as name => id, so the lookup has to be by key. The
 * earlier in_array() test searched the values instead, and so never found
 * a name.
 *
 * @param string $name user name recorded on the revision, may be empty
 * @param string $host host recorded on the revision, used when there is
 *        no user name
 * @return array ( user id, escaped user text ); the id is 0 for unknown
 *         or anonymous users
 */
function checkUserCache( $name, $host )
{
	global $usercache;

	if( $name ) {
		# Falls back to 0 if we haven't imported user accounts
		$userid = isset( $usercache[$name] ) ? $usercache[$name] : 0;
		$username = wfStrencode( $name );
	} else {
		$userid = 0;
		$username = wfStrencode( $host );
	}
	return array( $userid, $username );
}
263
/**
 * Produce the SQL that imports one page.
 *
 * Unless $historyonly is set, a row for the "cur" table is written holding
 * the rewritten text, credited to the conversion script. The original
 * current revision always goes into the "old" table. Unless
 * $lasthistoryonly is set, the kept revisions go there too, listed before
 * the current one.
 *
 * fetchPage() and fetchKeptPages() return arrays, so revisions are read
 * with array syntax and the timestamp comes from the "ts" key.
 *
 * @param string $title old page name
 * @return string SQL text, or a single SQL comment line if the page
 *         could not be read
 */
function importPage( $title )
{
	global $conversionscript, $conversioncomment, $conversiontime;
	global $historyonly, $lasthistoryonly;

	$page = fetchPage( $title );
	if( !is_array( $page ) ) {
		return "-- Couldn't read page '$title'. Skipping.\n";
	}

	$newtext = wfStrencode( rewritePage( $title, $page["text"] ) );
	$t = renamePage( $title );
	if( !is_object( $t ) ) {
		# renamePage() gave us nothing usable; keep the plain title
		$t = transformTitle( $title );
	}
	$newtitle = wfStrencode( $t->title );
	$namespace = $t->namespace;

	# Current revision:
	$minor = ( $page["minor"] ? 1 : 0 );
	$redirect = ( preg_match( '/^#REDIRECT/', $page["text"] ) ? 1 : 0 );
	$sql = "\n";
	if( !$historyonly ) {
		$sql .= "INSERT
	INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)
	VALUES ($namespace,'$newtitle','$newtext','$conversioncomment',0,'$conversionscript','$conversiontime',$redirect,$minor);\n";
	}

	# History rows first, then the original form of the current revision
	$rows = array();
	if( !$lasthistoryonly ) {
		foreach( fetchKeptPages( $title ) as $rev ) {
			$rows[] = importPageRevisionRow( $namespace, $newtitle, $rev );
		}
	}
	$rows[] = importPageRevisionRow( $namespace, $newtitle, $page );

	# One multi-row INSERT: rows separated by commas, statement closed by ";"
	$sql .= "INSERT
	INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,old_minor_edit)
	VALUES\n" . implode( ",\n", $rows ) . ";\n";
	return $sql;
}

/**
 * Helper for importPage(): format one revision as a VALUES tuple for the
 * "old" table (eight values, matching the eight columns listed there).
 *
 * @param int $namespace namespace number
 * @param string $newtitle already-escaped title
 * @param array $rev revision array as returned by fetchPage() or
 *        fetchKeptPages()
 * @return string "(...)" tuple without trailing separator
 */
function importPageRevisionRow( $namespace, $newtitle, $rev )
{
	global $wgTimezoneCorrection;

	$text = wfStrencode( $rev["text"] );
	$comment = wfStrencode( $rev["summary"] );
	$minor = ( $rev["minor"] ? 1 : 0 );
	list( $userid, $username ) = checkUserCache( $rev["username"], $rev["host"] );
	$timestamp = wfUnix2Timestamp( (int)$rev["ts"] + $wgTimezoneCorrection );
	return "\t\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$minor)";
}
307
308
/**
 * Count up basic links
 *
 * Reads the page, drops anything inside <nowiki>...</nowiki>, and calls
 * countLinkTo() once for each [[link]] or [[link|label]] found, passing
 * the link target with its first letter upper-cased.
 *
 * Uses preg_match_all() rather than the "e" pattern modifier, which no
 * longer exists in PHP 7 and later.
 *
 * @param string $title page name to scan
 */
function countLinksFrom( $title )
{
	$page = fetchPage( $title );
	if( !is_array( $page ) ) {
		return; # page file missing; nothing to count
	}

	$text = preg_replace(
		'/<nowiki>.*<\/nowiki>/sDU',
		'',
		$page["text"] );

	$found = preg_match_all(
		'/\[\[\s*([0-9a-zA-Z_ \x80-\xff]+)\s*(?:\|\s*([^]]+))?\s*\]\]/',
		$text,
		$matches );
	if( !$found ) {
		return;
	}
	foreach( $matches[1] as $target ) {
		# The greedy capture keeps trailing blanks before "|" or "]]"
		countLinkTo( ucfirst( trim( $target ) ) );
	}
}
322
/**
 * Record one link to $title in $linkcache.
 *
 * $linkcache[$title] maps each normalised spelling of the title (as
 * produced by FreeToNormal()) to the number of times it was linked.
 *
 * @param string $title link target as written, first letter upper-cased
 */
function countLinkTo( $title )
{
	global $linkcache;
	$t = transformTitle( $title );
	$linkform = FreeToNormal( $t->title );

	# Start the counter explicitly; reading a missing entry and calling
	# count() on it fails on current PHP.
	if( !isset( $linkcache[$title] ) ) {
		$linkcache[$title] = array();
	}
	if( !isset( $linkcache[$title][$linkform] ) ) {
		$linkcache[$title][$linkform] = 0;
	}
	$linkcache[$title][$linkform]++;
}
338
/**
 * Preferentially change case
 *
 * Picks the most frequently linked-to spelling of a page (from
 * $linkcache) as its new title. When that differs from the plain
 * transformed title, the redirect SQL from doRenamePage() is echoed
 * straight to the output, where the rest of the generated SQL goes.
 *
 * @param string $title old page name
 * @return Title title object carrying the chosen spelling and namespace
 */
function renamePage( $title )
{
	global $linkcache;
	$t = transformTitle( $title );

	# We want to use the most frequently linked-to form as the title
	$maxcount = 0;
	$maxform = $t->title;
	$forms = isset( $linkcache[$title] ) ? $linkcache[$title] : array();
	foreach( $forms as $linkform => $count ) {
		if( $count > $maxcount ) {
			$maxcount = $count;
			$maxform = (string)$linkform;
		}
	}
	if( $maxform != $t->title ) {
		# doRenamePage() looks the forms up by the old page name
		echo doRenamePage( $title, $maxform );
		$t->title = $maxform;
	}
	return $t;
}

/**
 * Build redirect rows pointing every other linked spelling at $maxform.
 *
 * Each row supplies all nine columns named in the INSERT, including the
 * redirect text "#REDIRECT [[target]]" (the form importPage() recognises
 * as a redirect).
 *
 * @param string|Title $title old page name; a Title object is accepted
 *        as well, in which case its title text is used for the lookup
 * @param string $maxform spelling chosen as the real title
 * @return string one INSERT statement, or "" if no redirect is needed
 */
function doRenamePage( $title, $maxform )
{
	global $linkcache, $redirectcomment, $conversionscript, $conversiontime;

	if( is_object( $title ) ) {
		$namespace = $title->namespace;
		$key = $title->title;
	} else {
		$t = transformTitle( $title );
		$namespace = $t->namespace;
		$key = $title;
	}
	if( empty( $linkcache[$key] ) ) {
		return "";
	}

	$comment = wfStrencode( str_replace( '$1', $maxform, $redirectcomment ) );
	$redirtext = wfStrencode( "#REDIRECT [[" . $maxform . "]]" );
	$redirsql = array();
	foreach( $linkcache[$key] as $linkform => $count ) {
		if( $linkform != $maxform ) {
			$redirtitle = wfStrencode( $linkform );
			array_push( $redirsql, "($namespace,'$redirtitle','$redirtext','$comment',0,'$conversionscript','$conversiontime',1,1)" );
		}
	}
	if( !$redirsql ) {
		# An INSERT with an empty VALUES list would be invalid SQL
		return "";
	}

	$sql = "INSERT INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)
	VALUES ";
	$sql .= implode( ",\n\t", $redirsql ) . ";\n";
	return $sql;
}
373
/**
 * Account for syntax changes
 *
 * Removes the talk link, then runs rewritePageBits() over every stretch
 * of text that is not inside <nowiki>...</nowiki>. The nowiki sections
 * are passed through untouched.
 *
 * The earlier version relied on the "e" pattern modifier (gone since
 * PHP 7) and handed only the matched delimiter to rewritePageBits(),
 * which threw the page text away.
 *
 * @param string $title page name, used to resolve subpage links
 * @param string $text page source
 * @return string converted page source
 */
function rewritePage( $title, $text )
{
	# ...
	$text = removeTalkLink( $text );

	# With PREG_SPLIT_DELIM_CAPTURE the pieces alternate: even indexes are
	# ordinary text, odd indexes are the captured nowiki sections.
	$parts = preg_split( '/(<nowiki>.*?<\/nowiki>)/s', $text, -1, PREG_SPLIT_DELIM_CAPTURE );
	foreach( $parts as $i => $part ) {
		if( $i % 2 == 0 ) {
			$parts[$i] = rewritePageBits( $title, $part );
		}
	}
	return implode( '', $parts );
}
384
/**
 * Apply the individual link conversions to one piece of page text.
 *
 * @param string $title page name, used to resolve subpage links
 * @param string $text text to convert
 * @return string converted text
 */
function rewritePageBits( $title, $text ) {
	# fixSubpages() is declared as ( $text, &$title ): the text goes first
	$text = fixSubpages( $text, $title );
	$text = fixMedialinks( $text );
	$text = fixImagelinks( $text );
	return $text;
}
391
/**
 * Strip "/Talk"-style links from the text.
 *
 * Matches "/<talkending>" case-insensitively, optionally wrapped in
 * [[ ]], together with any newlines before it and whitespace after it.
 *
 * @param string $text page source (taken by reference, not modified)
 * @return string text with the talk links removed
 */
function removeTalkLink( &$text ) {
	global $talkending;
	$pattern = "[\\n*(?:\[\[)?/{$talkending}(?:\]\])?\\s*]sDi";
	$stripped = preg_replace( $pattern, '', $text );
	return $stripped;
}
396
/**
 * Expand relative subpage links into full links under $title.
 *
 * Three forms are handled, in this order:
 *  - a bare "/Name" at the start of the text or after whitespace becomes
 *    "[[Title/Name|/Name]]"
 *  - "[[/name]]" becomes "[[Title/Name|/name]]"
 *  - "[[/name|label]]" becomes "[[Title/Name|label]]"
 * In the bracketed forms the first letter of the subpage name is
 * upper-cased.
 *
 * Callbacks replace the "e" pattern modifier, which PHP 7 removed. They
 * also keep "$" or "\" in the title from being read as backreferences.
 *
 * @param string $text text to convert
 * @param string $title parent page name (taken by reference, not modified)
 * @return string converted text
 */
function fixSubpages( $text, &$title ) {
	$parent = $title;

	$text = preg_replace_callback( "<(^|\s)/([A-Z\xc0-\xdf].*?)\b>",
		function( $m ) use ( $parent ) {
			return $m[1] . "[[" . $parent . "/" . $m[2] . "|/" . $m[2] . "]]";
		}, $text );
	$text = preg_replace_callback( "<\[\[/([^|]*?)\]\]>",
		function( $m ) use ( $parent ) {
			return "[[" . $parent . "/" . ucfirst( $m[1] . "|/" . $m[1] . "]]" );
		}, $text );
	$text = preg_replace_callback( "<\[\[/(.*?)\]\]>",
		function( $m ) use ( $parent ) {
			return "[[" . $parent . "/" . ucfirst( $m[1] . "]]" );
		}, $text );
	return $text;
}
407
/**
 * Turn bare upload URLs into image links.
 *
 * Every URL matching $imageimport is replaced with
 * "[[<namespace 6 prefix><file>]]", where <file> is whatever
 * fetchMediaFile() returns for the URL and its file name.
 *
 * A callback replaces the "e" pattern modifier, which PHP 7 removed.
 *
 * @param string $text text to convert (taken by reference, not modified)
 * @return string converted text
 */
function fixImagelinks( &$text ) {
	global $imageimport, $namespaces;
	$prefix = $namespaces[6];
	return preg_replace_callback( "/$imageimport/",
		function( $m ) use ( $prefix ) {
			# $m[1] is the whole URL, $m[2] the file name
			return "[[" . $prefix . fetchMediaFile( $m[1], $m[2] ) . "]]";
		},
		$text );
}
414
/**
 * Turn bracketed upload URLs into media links.
 *
 * "[URL]" becomes "[[<mediatext>:<file>]]" and "[URL label]" becomes
 * "[[<mediatext>:<file>|label]]", where <file> is whatever
 * fetchMediaFile() returns for the URL and its file name.
 *
 * Callbacks replace the "e" pattern modifier, which PHP 7 removed.
 *
 * @param string $text text to convert (taken by reference, not modified)
 * @return string converted text
 */
function fixMedialinks( &$text ) {
	global $imageimport, $mediatext;
	$prefix = $mediatext;

	# Plain bracketed URL, no label
	$converted = preg_replace_callback( "/\[$imageimport\]/",
		function( $m ) use ( $prefix ) {
			return "[[" . $prefix . ":" . fetchMediaFile( $m[1], $m[2] ) . "]]";
		},
		$text );

	# Bracketed URL followed by a label
	return preg_replace_callback( "/\[$imageimport (.+?)\]/",
		function( $m ) use ( $prefix ) {
			return "[[" . $prefix . ":" . fetchMediaFile( $m[1], $m[2] ) . "|" . $m[3] . "]]";
		},
		$converted );
}
424
/**
 * Give the local name to use for an uploaded media file.
 *
 * @param string $url original location of the file (currently unused)
 * @param string $filename file name taken from the URL
 * @return string file name with its first letter upper-cased
 */
function fetchMediaFile( $url, $filename )
{
	# Copy an image file into local upload space
	# FIXME
	$localname = ucfirst( $filename );
	return $localname;
}
431
/**
 * Simple move of talk pages, etc
 *
 * A title of the form "<page>/<talkending>" (case-insensitive) is mapped
 * to namespace 1 with the part before the slash as its title. Anything
 * else stays in namespace 0 unchanged.
 *
 * @param string $title old page name
 * @param bool $dorename not used
 * @return Title
 */
function transformTitle( $title, $dorename = false )
{
	global $talkending;
	$talkPattern = "/^(.+)[ _]?\\/[ _]?($talkending)/i";
	$isTalk = preg_match( $talkPattern, $title, $m );
	if( !$isTalk ) {
		return Title::fromData( 0, $title );
	}
	return Title::fromData( 1, $m[1] );
}
445
/**
 * Translated out of old usemod wiki...
 *
 * Normalises a free-form link: spaces become underscores, the first
 * character is upper-cased, runs of underscores collapse to one, leading
 * and trailing underscores and underscores next to "/" are dropped.
 * With $FreeUpper set, a lower-case a-z letter that follows one of
 * - _ . , ( ) / is upper-cased as well.
 *
 * A callback replaces the "e" pattern modifier, which PHP 7 removed.
 *
 * @param string $id link text
 * @param bool $FreeUpper capitalise letters after separators
 * @return string normalised form
 */
function FreeToNormal ( $id , $FreeUpper = true ) {
	$id = str_replace ( " ", "_", $id ) ;
	$id = ucfirst($id);
	if ( strpos( $id, '_' ) !== false ) { # Quick check for any space/underscores
		$id = preg_replace ( '/__+/' , "_" , $id ) ;
		$id = preg_replace ( '/^_/' , "", $id ) ;
		$id = preg_replace ( '/_$/' , "", $id ) ;
		#if ($UseSubpage) {
		$id = preg_replace ( '|_/|', "/" , $id ) ;
		$id = preg_replace ( '|/_|', "/" , $id ) ;
		#}
	}
	if ($FreeUpper) {
		# Note that letters after ' are *not* capitalized
		$id = preg_replace_callback ( '|([-_.,\(\)/])([a-z])|' ,
			function( $m ) {
				return $m[1] . strtoupper( $m[2] );
			} , $id ) ;
	}
	return $id;
}
467
/**
 * Whee!
 *
 * Placeholder for character-set recoding: the text is handed back as is.
 *
 * @param string $text input text
 * @return string the same text
 */
function recodeInput( $text )
{
	$recoded = $text;
	return $recoded;
}
473
/**
 * Format a Unix time as a 14-digit "YmdHis" timestamp in GMT.
 *
 * The earlier body formatted an undefined $timestamp variable instead of
 * the $unixtime parameter, which on current PHP yields the current time.
 *
 * @param int $unixtime seconds since the Unix epoch
 * @return string timestamp such as "19700101000000"
 */
function wfUnix2Timestamp( $unixtime ) {
	return gmdate( "YmdHis", (int)$unixtime );
}
477
/**
 * Convert a 14-digit "YmdHis" timestamp (read as GMT) to a Unix time.
 *
 * @param string $ts timestamp such as "20030209123456"
 * @return int seconds since the Unix epoch
 */
function wfTimestamp2Unix( $ts )
{
	$year   = (int)substr( $ts, 0, 4 );
	$month  = (int)substr( $ts, 4, 2 );
	$day    = (int)substr( $ts, 6, 2 );
	$hour   = (int)substr( $ts, 8, 2 );
	$minute = (int)substr( $ts, 10, 2 );
	$second = (int)substr( $ts, 12, 2 );
	return gmmktime( $hour, $minute, $second, $month, $day, $year );
}
485
486 ?>