Script for creating the validate-table
[lhc/web/wiklou.git] / maintenance / archives / importUseModWiki.php
<?php

# This importer is kept for reference only: announce that and stop before
# any of the code below can run.  Each message ends with a newline so the
# two notices do not run together on the terminal.
print "This script is obsolete!\n";
print "It is retained in the source here in case some of its
code might be useful for ad-hoc conversion tasks, but it is
not maintained and probably won't even work as is.\n";
exit();
8
/*
Import data from a UseModWiki into a PediaWiki wiki
2003-02-09 Brion VIBBER <brion@pobox.com>
Based loosely on Magnus's code from 2001-2002

Pass one: collect data on links & title case, users
Pass two: spit out SQL for the users and pages collected in pass one
Separately, be sure to run the link & index rebuilding scripts!

*/
19
/* globals
   Configuration and shared state used by both passes.
*/
$wgRootDirectory = "/home/brion/vikio/wiki-ca/lib-http/db/wiki";
$wgFieldSeparator = "\xb3"; # Some wikis may use different char
# Nested separators: level 1 splits a file, level 2 a section, level 3 the text record
$FS = $wgFieldSeparator ;
$FS1 = $FS."1" ;
$FS2 = $FS."2" ;
$FS3 = $FS."3" ;

# Images to import
# Capture group 1 is the whole URL, group 2 the file name (see fixImagelinks/fixMedialinks).
$imageimport = '(http:\/\/(?:www\.|meta\.|)wikipedia\.(?:com|org)\/upload\/(?:[a-z]\/[a-z][0-9]\/)?(.*\.(?:gif|jpg|jpeg|png)))';

# Number of *seconds to add* to timestamp to get UTC/GMT
#$wgTimezoneCorrection = 0; # GMT
$wgTimezoneCorrection = 8*3600; # PST - California

# Other options...
$historyonly = false; # Don't add converted revisions to cur table; just get old histories
$lasthistoryonly = false; # Only add the _original_ form of the _current_ revision

/* Vary by language */
$namespaces = array( 0 => "", 1 => "Talk:", 2 => "User:", 3 => "User_talk:", 4
=> "Wikipedia:", 5 => "Wikipedia_talk:", 6 => "Image:", 7 => "Image_talk:" );
$talkending = "Talk";
$mediatext = "Media";
$conversionscript = "Conversion script";
$conversioncomment = "Automatic conversion";
# "$1" is replaced with the new title by doRenamePage()
$redirectcomment = "Automatic conversion, moved to \$1";
$conversiontime = gmdate( "YmdHis" ); # Conversions will be marked with this timestamp

# Stats and caches
$oldtitles = array();
$usercache = array();  # filled elsewhere; checkUserCache() looks names up as keys
$titlecache = array(); # old title => Title, filled by firstPassDirectory()
$linkcache = array();  # link target => ( normalised form => count ), filled by countLinkTo()
55
# Some oversimplified test types
# Minimal stand-in for a page title: just a namespace number and a title string.
class Title {
	public $title;
	public $namespace;

	/* Build a Title from its two parts.
	   Declared static because it is invoked as Title::fromData(); calling a
	   non-static method statically is a fatal error on current PHP.
	   $namespace - namespace number
	   $title     - title text
	   Returns the new Title object.
	*/
	public static function fromData( $namespace, $title ) {
		$x = new Title;
		$x->namespace = $namespace;
		$x->title = $title;
		return $x;
	}
}
66
# See tests in importTests.php
# Run both passes unless a test harness has set $testingonly beforehand.
# empty() is used because the variable is normally not defined at all.
if( empty( $testingonly ) ) {
	firstPass();
	secondPass();
}
72
73 # ------------------------------------------------------------------------------
74
/* First pass:
   Walk every per-letter page directory ("A".."Z" plus "other") under
   $wgRootDirectory/page and hand each one to firstPassDirectory().
*/
function firstPass()
{
	global $wgRootDirectory;

	$buckets = range( 'A', 'Z' );
	$buckets[] = 'other';

	$count = count( $buckets );
	for( $i = 0; $i < $count; $i++ ) {
		$bucket = $buckets[$i];
		firstPassDirectory( "$wgRootDirectory/page/$bucket" );
	}
}
90
/* Recursively scan $dir for "*.db" page files.
   For every page found, the transformed title is stored in $titlecache
   (keyed by the old title, i.e. the file name without ".db") and the
   page's outgoing links are counted via countLinksFrom().
   Anything that is neither a directory nor a .db file is reported in an
   SQL comment and skipped.
*/
function firstPassDirectory( $dir )
{
	global $titlecache;

	$mydir = opendir( $dir );
	if( $mydir === false ) {
		echo "-- Could not open directory '$dir'. Skipping.\n";
		return;
	}

	# Compare against false explicitly: an entry named "0" is falsy
	while( ( $entry = readdir( $mydir ) ) !== false ) {
		if( $entry == '.' || $entry == '..' ) {
			continue;
		}
		if( is_dir( "$dir/$entry" ) ) {
			firstPassDirectory( "$dir/$entry" );
		} elseif( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			$title = $m[1];
			$titlecache[$title] = transformTitle( $title );
			countLinksFrom( $title );
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}
	closedir( $mydir );
}
109
/* Second pass:
   Emit the SQL.  Every key of $usercache goes through importUser() and
   every key of $titlecache through importPage(); whatever they return is
   written to standard output, followed by a closing "Done" comment.
*/
function secondPass()
{
	global $titlecache, $usercache, $redirects;

	foreach( $usercache as $oldname => $unusedUser ) {
		$userSql = importUser( $oldname );
		echo $userSql;
	}

	foreach( $titlecache as $oldtitle => $unusedTitle ) {
		$pageSql = importPage( $oldtitle );
		echo $pageSql;
	}

	echo "\n-- Done!\n";
}
126
127
128 # ------------------------------------------------------------------------------
129
/* fetch_ functions
   Grab a given item from the database
*/

/* Load the raw record for user $uid.
   Returns the record as a key => value array, or false when the file
   does not exist.
   NOTE(review): the "/pages/" directory is kept from the original code and
   looks copied from fetchPage(); confirm where the source wiki actually
   stores its user files.  importUser() is disabled, so this is not
   exercised at present.
*/
function fetchUser( $uid )
{
	global $FS1, $wgRootDirectory;

	# The file name is built from $uid (the original used an undefined $title)
	$fname = $wgRootDirectory . "/pages/" . $uid;
	if( !file_exists( $fname ) ) return false;

	# splitHash() needs the separator; top-level records are split on $FS1
	# here, as fetchKeptPages() does -- presumably right for user files too.
	$data = splitHash( $FS1, implode( "", file( $fname ) ) );
	# enough?

	return $data;
}
145
/* Load the current revision of page $title.
   The file is split on $FS1 into page fields, the "text_default" field on
   $FS2 into a section, and the section's "data" field on $FS3 into the
   text record.
   Returns an array with the keys text, summary, minor, ts, username and
   host (missing fields come back as ""), or false when the file does not
   exist.
   NOTE(review): firstPass() scans "$wgRootDirectory/page/<letter>/*.db"
   while this reads "$wgRootDirectory/pages/<title>"; the two layouts do
   not agree -- confirm the real path before relying on this.
*/
function fetchPage( $title )
{
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/pages/" . $title;
	if( !file_exists( $fname ) ) return false;

	# splitHash() takes the separator first; the original call omitted it
	$page = splitHash( $FS1, implode( "", file( $fname ) ) );
	$section = splitHash( $FS2, isset( $page["text_default"] ) ? $page["text_default"] : "" );
	$text = splitHash( $FS3, isset( $section["data"] ) ? $section["data"] : "" );

	$result = array();
	foreach( array( "text", "summary", "minor" ) as $key ) {
		$result[$key] = isset( $text[$key] ) ? $text[$key] : "";
	}
	foreach( array( "ts", "username", "host" ) as $key ) {
		$result[$key] = isset( $section[$key] ) ? $section[$key] : "";
	}
	return $result;
}
161
/* Load the kept (historical) revisions of page $title from
   "$wgRootDirectory/keep/<title>.kp".
   The file is split on $FS1; the first chunk is discarded and each
   remaining chunk is parsed like a section in fetchPage().
   Returns a list of arrays with the keys text, summary, minor, ts,
   username and host.  Revisions without text, without a minor flag or
   without a positive timestamp are reported and skipped.  Returns an
   empty array when the file does not exist.
*/
function fetchKeptPages( $title )
{
	# $FS1 must be imported too; without it explode() gets an empty separator
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/keep/" . $title . ".kp";
	if( !file_exists( $fname ) ) return array();

	$keptlist = explode( $FS1, implode( "", file( $fname ) ) );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = splitHash( $FS3, isset( $section["data"] ) ? $section["data"] : "" );

		$hasText = !empty( $text["text"] );
		$hasMinor = isset( $text["minor"] ) && $text["minor"] != "";
		$hasTime = isset( $section["ts"] ) && ( (int)$section["ts"] > 0 );

		if ( $hasText && $hasMinor && $hasTime ) {
			array_push( $revisions, array (
				"text" => $text["text"],
				"summary" => isset( $text["summary"] ) ? $text["summary"] : "",
				"minor" => $text["minor"],
				"ts" => $section["ts"],
				"username" => isset( $section["username"] ) ? $section["username"] : "",
				"host" => isset( $section["host"] ) ? $section["host"] : "" ) );
		} else {
			echo "-- skipped a bad old revision\n";
		}
	}
	return $revisions;
}
186
/* Split $str on $sep and pair the pieces up as key, value, key, value...
   Returns the resulting key => value array.  A trailing piece without a
   partner is dropped; when a key repeats, the last value wins.
*/
function splitHash ( $sep , $str ) {
	$temp = explode ( $sep , $str ) ;
	$ret = array () ;
	$n = count ( $temp ) ;
	# Index key and value explicitly instead of "$ret[$temp[$i]] = $temp[++$i]",
	# whose result depends on the order in which PHP evaluates the two sides.
	for ( $i = 0; $i + 1 < $n ; $i += 2 ) {
		$ret[$temp[$i]] = $temp[$i + 1] ;
	}
	return $ret ;
}
195
196
197 /* import_ functions
198 Take a fetched item and produce SQL
199 */
200
/* importUser
   $uid is the UseMod user id number.
   The new ones will be assigned arbitrarily and are for internal use only.

   THIS IS DELAYED SINCE PUBLIC DUMPS DONT INCLUDE USER DIR

   Returns the SQL for the user; at present always the empty string,
   because the function returns before doing any work.  The code after the
   early return is kept for when user import is enabled.
*/
function importUser( $uid )
{
	global $last_uid, $user_list, $wgTimezoneCorrection;

	return "";

	# --- disabled from here on ---
	$stuff = fetchUser( $uid );
	if( !is_array( $stuff ) ) return "";
	$last_uid++;

	# fetchUser() returns an array, so fields are read by key
	$name = wfStrencode( $stuff['username'] );
	# NOTE(review): md5hash() is not defined in this file -- confirm it exists
	$hash = md5hash( $stuff['password'] ); # Doable?
	# The timezone setting is $wgTimezoneCorrection (seconds); the original
	# referred to an undefined $wgTimestampCorrection here.
	$tzoffset = $stuff['tzoffset'] - ($wgTimezoneCorrection / 3600); # -8 to 0; +9 to +1
	$hideminor = ($stuff['rcall'] ? 0 : 1);
	$options = "cols={$stuff['editcols']}
rows={$stuff['editrows']}
rcdays={$stuff['rcdays']}
timecorrection={$tzoffset}
hideminor={$hideminor}
";

	$sql = "INSERT
INTO user (user_id,user_name,user_password,user_options)
VALUES ({$last_uid},'{$name}','{$hash}','{$options}');\n";
	return $sql;
}
232
/* Work out the user id and display name to record for an edit.
   $name - user name from the revision, may be empty
   $host - host of the editor, used as the name for anonymous edits
   Returns array( user id, SQL-escaped user name ).  The id is taken from
   $usercache when $name is one of its keys and is 0 otherwise.
*/
function checkUserCache( $name, $host )
{
	global $usercache;

	if( $name ) {
		# The cache is indexed by name, so test the key; in_array() would
		# search the stored values instead.
		if( is_array( $usercache ) && array_key_exists( $name, $usercache ) ) {
			$userid = $usercache[$name];
		} else {
			# If we haven't imported user accounts
			$userid = 0;
		}
		$username = wfStrencode( $name );
	} else {
		$userid = 0;
		$username = wfStrencode( $host );
	}
	return array( $userid, $username );
}
251
/* Produce the SQL for one page.
   $title is the old (UseMod) title.
   Unless $historyonly is set, a row holding the rewritten text is inserted
   into cur, attributed to the conversion script.  The original current
   revision, and unless $lasthistoryonly is set every kept revision, go
   into old in a single multi-row INSERT.
   Returns the SQL text; when the page cannot be read only an SQL comment
   is returned.
*/
function importPage( $title )
{
	global $wgTimezoneCorrection;
	global $conversionscript, $conversioncomment, $conversiontime;
	global $historyonly, $lasthistoryonly;

	# fetchPage() returns an array, or false for a missing file
	$page = fetchPage( $title );
	if( !is_array( $page ) ) {
		return "-- Could not read page '$title'. Skipping.\n";
	}

	$newtext = wfStrencode( rewritePage( $title, $page['text'] ) );
	$t = renamePage( $title );
	if( !is_object( $t ) ) {
		# No usable result from renamePage(): keep the plain transformed title
		$t = transformTitle( $title );
	}
	$newtitle = wfStrencode( $t->title );
	$namespace = (int)$t->namespace;

	# Current revision:
	$text = wfStrencode( $page['text'] );
	$comment = wfStrencode( $page['summary'] );
	$minor = ($page['minor'] ? 1 : 0);
	list( $userid, $username ) = checkUserCache( $page['username'], $page['host'] );
	$timestamp = wfUnix2Timestamp( (int)$page['ts'] + $wgTimezoneCorrection );
	$redirect = ( preg_match( '/^#REDIRECT/', $page['text'] ) ? 1 : 0 );
	$sql = "\n";
	if( !$historyonly ) {
		$sql .= "INSERT
INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)
VALUES ($namespace,'$newtitle','$newtext','$conversioncomment',0,'$conversionscript','$conversiontime',$redirect,$minor);\n";
	}
	$sql .= "INSERT
INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,old_minor_edit)
VALUES";
	# Last row of the statement; it carries the terminating semicolon
	$sqlfinal = "\t\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$minor);\n";

	# History
	if( !$lasthistoryonly ) {
		$revisions = fetchKeptPages( $title );
		foreach( $revisions as $rev ) {
			$text = wfStrencode( $rev['text'] );
			$comment = wfStrencode( $rev['summary'] );
			$minor = ($rev['minor'] ? 1 : 0);
			list( $userid, $username ) = checkUserCache( $rev['username'], $rev['host'] );
			$timestamp = wfUnix2Timestamp( (int)$rev['ts'] + $wgTimezoneCorrection );
			# Eight values for the eight old_* columns (old has no redirect column)
			$sql .= "\t\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$minor),\n";
		}
	}
	return $sql . $sqlfinal;
}
295
296
# Count up basic links
/* Read page $title, drop everything between <nowiki> tags and call
   countLinkTo() once for every [[link]] found, passing the link target
   with its first letter upper-cased.  Does nothing when the page cannot
   be read.
*/
function countLinksFrom( $title )
{
	$page = fetchPage( $title );
	if( !is_array( $page ) ) {
		return;
	}

	$text = preg_replace(
		'/<nowiki>.*<\/nowiki>/sDU',
		'',
		$page['text'] );

	# The /e modifier no longer exists in PHP 7+; collect the matches and
	# call countLinkTo() directly, in document order.
	$found = preg_match_all(
		'/\[\[\s*([0-9a-zA-Z_ \x80-\xff]+)\s*(?:\|\s*([^]]+))?\s*\]\]/',
		(string)$text,
		$matches );
	if( $found ) {
		foreach( $matches[1] as $target ) {
			countLinkTo( ucfirst( $target ) );
		}
	}
}
310
/* Record one link to $title in $linkcache.
   $linkcache[$title] maps each normalised spelling (FreeToNormal() of the
   transformed title) to the number of times it has been seen.
*/
function countLinkTo( $title )
{
	global $linkcache;
	$t = transformTitle( $title );
	$linkform = FreeToNormal( $t->title );

	# Create the per-title table on first use instead of reading a missing index
	if( !isset( $linkcache[$title] ) || !is_array( $linkcache[$title] ) ) {
		$linkcache[$title] = array();
	}
	if( isset( $linkcache[$title][$linkform] ) ) {
		$linkcache[$title][$linkform]++;
	} else {
		$linkcache[$title][$linkform] = 1;
	}
}
326
# Preferentially change case
/* Choose the title to import page $title under.
   Starts from transformTitle( $title ) and switches to the spelling with
   the highest count in $linkcache[$title], if there is one.  When the
   spelling changes, the redirect SQL built by doRenamePage() is written to
   the output (the original built it and threw it away).
   Returns the Title to use; importPage() reads ->title and ->namespace
   from it.
*/
function renamePage( $title )
{
	global $linkcache;
	$t = transformTitle( $title );

	# Pages nobody links to have no entry in the cache
	$forms = array();
	if( isset( $linkcache[$title] ) && is_array( $linkcache[$title] ) ) {
		$forms = $linkcache[$title];
	}

	# We want to use the most frequently linked-to form as the title
	$maxcount = 0 ; $maxform = $t->title ;
	foreach ( $forms as $linkform => $count ) {
		if ( $count > $maxcount ) {
			$maxcount = $count ;
			$maxform = $linkform ;
		}
	}
	if( $maxform != $t->title) {
		echo doRenamePage( $t, $maxform );
		$t->title = $maxform;
	}
	return $t;
}
345
/* Build redirect rows for the spellings of a page that were not chosen.
   $title   - Title object as passed by renamePage(); a plain title string
              is accepted too and is treated as namespace 0
   $maxform - the spelling the page is being imported under
   Every other spelling recorded in $linkcache for the title becomes a cur
   row redirecting to $maxform.
   Returns the INSERT statement, or "" when there is nothing to redirect.
   NOTE(review): the cache is looked up by the Title's title text, which
   equals the old title only for pages transformTitle() leaves unchanged;
   the redirect target is written without a namespace prefix -- confirm
   both for talk pages.
*/
function doRenamePage( $title, $maxform )
{
	global $linkcache, $redirectcomment, $conversionscript, $conversiontime;

	# An object cannot be used as an array key, so pull the parts out first
	if( is_object( $title ) ) {
		$namespace = (int)$title->namespace;
		$key = $title->title;
	} else {
		$namespace = 0;
		$key = $title;
	}
	if( !isset( $linkcache[$key] ) || !is_array( $linkcache[$key] ) ) {
		return "";
	}

	# Same comment and text for every row, so build them once
	$comment = wfStrencode( str_replace( '$1', $maxform, $redirectcomment ) );
	$redirtext = wfStrencode( "#REDIRECT [[" . $maxform . "]]" );

	$redirsql = array();
	foreach( $linkcache[$key] as $linkform => $count ) {
		if( $linkform != $maxform ) {
			$redirtitle = wfStrencode( $linkform );
			# Nine values for the nine cur_* columns, including cur_text
			array_push( $redirsql, "($namespace,'$redirtitle','$redirtext','$comment',0,'$conversionscript','$conversiontime',1,1)" );
		}
	}
	if( count( $redirsql ) == 0 ) {
		# An INSERT with an empty VALUES list would be invalid SQL
		return "";
	}

	$sql = "INSERT INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)
VALUES ";
	$sql .= implode( ",\n\t", $redirsql ) . ";\n";
	return $sql;
}
361
# Account for syntax changes
/* Convert the text of page $title to the new syntax.
   The talk link is removed first; then every stretch of text outside
   <nowiki>...</nowiki> is passed through rewritePageBits(), while the
   nowiki sections themselves are copied through unchanged.
   Returns the converted text.
*/
function rewritePage( $title, $text )
{
	# ...
	$text = removeTalkLink( $text );

	# The /e modifier is gone in PHP 7+, and the old replacement handed the
	# matched delimiter rather than the text to rewritePageBits().  Splitting
	# with the delimiter captured puts nowiki blocks at the odd indexes.
	$parts = preg_split( '/(<nowiki>.*?<\/nowiki>)/s', (string)$text, -1, PREG_SPLIT_DELIM_CAPTURE );
	if( $parts === false ) {
		return $text;
	}
	foreach( $parts as $i => $part ) {
		if( $i % 2 == 0 && $part !== '' ) {
			$parts[$i] = rewritePageBits( $title, $part );
		}
	}
	return implode( '', $parts );
}
372
/* Apply the individual syntax fixes to one piece of page text:
   subpage links, then media links, then image links.
   $title - title of the page the text belongs to
   $text  - the text to convert
   Returns the converted text.
*/
function rewritePageBits( $title, $text ) {
	# fixSubpages() takes the text first and the title second
	$text = fixSubpages( $text, $title );
	$text = fixMedialinks( $text );
	$text = fixImagelinks( $text );
	return $text;
}
379
/* Strip links to the talk subpage from $text.
   Removes every "/<talkending>" (case-insensitive), optionally wrapped in
   [[ ]], together with the newlines before it and the whitespace after it.
   $text is taken by reference but is not modified; the result is returned.
*/
function removeTalkLink( &$text ) {
	global $talkending;
	# Quote the ending so that a localised value containing regex
	# metacharacters is still matched literally.
	$ending = preg_quote( $talkending, '~' );
	return preg_replace( "~\\n*(?:\\[\\[)?/{$ending}(?:\\]\\])?\\s*~si", '', $text );
}
384
/* Turn relative subpage links into full links below $title.
   - a bare "/Name" at the start of the text or after whitespace becomes
     "[[title/Name|/Name]]"
   - "[[/name]]" becomes "[[title/Name|/name]]"
   - "[[/name|label]]" becomes "[[title/Name|label]]"
   $title is taken by reference but is not modified.
   Returns the converted text.
*/
function fixSubpages( $text, &$title ) {
	# Callbacks replace the /e modifier, which PHP 7+ no longer supports;
	# they also keep "$" or "\" in the title from being read as backreferences.
	$page = $title;

	$text = preg_replace_callback( "<(^|\s)/([A-Z\xc0-\xdf].*?)\b>",
		function ( $m ) use ( $page ) {
			return $m[1] . "[[" . $page . "/" . $m[2] . "|/" . $m[2] . "]]";
		}, $text );
	$text = preg_replace_callback( "<\[\[/([^|]*?)\]\]>",
		function ( $m ) use ( $page ) {
			return "[[" . $page . "/" . ucfirst( $m[1] . "|/" . $m[1] . "]]" );
		}, $text );
	$text = preg_replace_callback( "<\[\[/(.*?)\]\]>",
		function ( $m ) use ( $page ) {
			return "[[" . $page . "/" . ucfirst( $m[1] . "]]" );
		}, $text );
	return $text;
}
395
/* Replace bare image URLs matching $imageimport with links in the image
   namespace ($namespaces[6]).  The link target is whatever
   fetchMediaFile() returns for the URL and file name.
   $text is taken by reference but is not modified; the result is returned.
*/
function fixImagelinks( &$text ) {
	global $imageimport, $namespaces;
	$prefix = $namespaces[6];
	# Callback instead of the /e modifier, which PHP 7+ no longer supports.
	# Group 1 is the whole URL, group 2 the file name.
	return preg_replace_callback( "/$imageimport/",
		function ( $m ) use ( $prefix ) {
			return "[[" . $prefix . fetchMediaFile( $m[1], $m[2] ) . "]]";
		},
		$text );
}
402
/* Replace bracketed image URLs with media links.
   "[url]" becomes "[[<mediatext>:File]]" and "[url label]" becomes
   "[[<mediatext>:File|label]]", where File is what fetchMediaFile()
   returns for the URL and file name.
   $text is taken by reference and is overwritten with the result of the
   first replacement, as in the original.  Returns the converted text.
*/
function fixMedialinks( &$text ) {
	global $imageimport, $mediatext;
	$prefix = $mediatext;
	# Callbacks instead of the /e modifier, which PHP 7+ no longer supports.
	# Group 1 is the whole URL, group 2 the file name, group 3 the label.
	$text = preg_replace_callback( "/\[$imageimport\]/",
		function ( $m ) use ( $prefix ) {
			return "[[" . $prefix . ":" . fetchMediaFile( $m[1], $m[2] ) . "]]";
		},
		$text );
	return preg_replace_callback( "/\[$imageimport (.+?)\]/",
		function ( $m ) use ( $prefix ) {
			return "[[" . $prefix . ":" . fetchMediaFile( $m[1], $m[2] ) . "|" . $m[3] . "]]";
		},
		$text );
}
412
/* Placeholder for copying an image into local upload space.
   No file is copied yet; $url is ignored and the file name is returned
   with its first character upper-cased.
*/
function fetchMediaFile( $url, $filename )
{
	# Copy an image file into local upload space
	# FIXME
	$localname = ucfirst( $filename );
	return $localname;
}
419
# Simple move of talk pages, etc
/* Split an old title into namespace and title.
   A title matching "<name>/<talkending>" (case-insensitive, with an
   optional space or underscore on either side of the slash) yields
   namespace 1 with <name> as title; anything else yields namespace 0 with
   the title unchanged.  $dorename is accepted but not used.
   Returns a Title object.
*/
function transformTitle( $title, $dorename = false )
{
	global $talkending;

	$isTalk = preg_match( "/^(.+)[ _]?\\/[ _]?($talkending)/i", $title, $m );
	if( !$isTalk ) {
		return Title::fromData( 0, $title );
	}
	return Title::fromData( 1, $m[1] );
}
433
# Translated out of old usemod wiki...
/* Normalise a free-form link into canonical title form.
   Spaces become underscores and the first character is upper-cased; runs
   of underscores are collapsed, leading and trailing underscores and
   underscores next to a slash are removed.  With $FreeUpper set, a
   lower-case a-z letter following one of - _ . , ( ) / is upper-cased.
   Returns the normalised string.
*/
function FreeToNormal ( $id , $FreeUpper = true ) {
	$id = str_replace ( " ", "_", $id ) ;
	$id = ucfirst($id);
	if (strpos($id, '_') !== false) { # Quick check for any space/underscores
		$id = preg_replace ( '/__+/' , "_" , $id ) ;
		$id = preg_replace ( '/^_/' , "", $id ) ;
		$id = preg_replace ( '/_$/' , "", $id ) ;
		#if ($UseSubpage) {
		$id = preg_replace ( '|_/|', "/" , $id ) ;
		$id = preg_replace ( '|/_|', "/" , $id ) ;
		#}
	}
	if ($FreeUpper) {
		# Note that letters after ' are *not* capitalized
		if (preg_match ( '|[-_.,\(\)/][a-z]|' , $id ) ) { # Quick check for non-canon
			# Callback instead of the /e modifier, which PHP 7+ no longer supports
			$id = preg_replace_callback ( '|([-_.,\(\)/])([a-z])|' ,
				function ( $m ) {
					return $m[1] . strtoupper ( $m[2] ) ;
				} , $id ) ;
		}
	}
	return $id;
}
455
# Whee!
/* Hook for converting the character encoding of imported text.
   No conversion is performed: the text is handed back as received.
*/
function recodeInput( $text )
{
	$recoded = $text;
	return $recoded;
}
461
/* Format a Unix timestamp as a 14-digit "YYYYMMDDHHMMSS" string in GMT.
   $unixtime - seconds since the Unix epoch
*/
function wfUnix2Timestamp( $unixtime ) {
	# Format the argument; the original passed an undefined $timestamp, so
	# gmdate() fell back to the current time.
	return gmdate( "YmdHis", (int)$unixtime );
}
465
/* Convert a 14-digit "YYYYMMDDHHMMSS" GMT timestamp into a Unix timestamp.
   The fields are cut out by position and converted to integers.
*/
function wfTimestamp2Unix( $ts )
{
	$year   = (int)substr( $ts, 0, 4 );
	$month  = (int)substr( $ts, 4, 2 );
	$day    = (int)substr( $ts, 6, 2 );
	$hour   = (int)substr( $ts, 8, 2 );
	$minute = (int)substr( $ts, 10, 2 );
	$second = (int)substr( $ts, 12, 2 );

	return gmmktime( $hour, $minute, $second, $month, $day, $year );
}
473
474 ?>