Article.php:
[lhc/web/wiklou.git] / maintenance / archives / importUseModWiki.php
1 <?php
2
3 /*
4 Import data from a UseModWiki into a PediaWiki wiki
5 2003-02-09 Brion VIBBER <brion@pobox.com>
6 Based loosely on Magnus's code from 2001-2002
7
8 Pass one: collect data on links & title case, users
9 Pass two: spit out SQL for the imported users and pages
10 Separately, be sure to run the link & index rebuilding scripts!
11
12 */
13
/* globals
   Settings for one conversion run; adjust them for the wiki being imported.
*/
$wgRootDirectory = "/home/brion/vikio/wiki-ca/lib-http/db/wiki";
$wgFieldSeparator = "\xb3"; # Some wikis may use different char

# Derived separators: the base separator followed by a digit 1-3
$FS = $wgFieldSeparator;
$FS1 = $FS . "1";
$FS2 = $FS . "2";
$FS3 = $FS . "3";

# Images to import (capture 1 is the whole URL, capture 2 the file name)
$imageimport = '(http:\/\/(?:www\.|meta\.|)wikipedia\.(?:com|org)\/upload\/(?:[a-z]\/[a-z][0-9]\/)?(.*\.(?:gif|jpg|jpeg|png)))';

# Number of *seconds to add* to timestamp to get UTC/GMT
#$wgTimezoneCorrection = 0; # GMT
$wgTimezoneCorrection = 8*3600; # PST - California

# Other options...
$historyonly = false; # Don't add converted revisions to cur table; just get old histories
$lasthistoryonly = false; # Only add the _original_ form of the _current_ revision

/* Vary by language */
$namespaces = array(
    0 => "",
    1 => "Talk:",
    2 => "User:",
    3 => "User_talk:",
    4 => "Wikipedia:",
    5 => "Wikipedia_talk:",
    6 => "Image:",
    7 => "Image_talk:" );
$talkending = "Talk";
$mediatext = "Media";
$conversionscript = "Conversion script";
$conversioncomment = "Automatic conversion";
# "$1" is replaced with the target title when redirect rows are generated
$redirectcomment = "Automatic conversion, moved to \$1";
$conversiontime = gmdate( "YmdHis" ); # Conversions will be marked with this timestamp

# Stats and caches
$oldtitles = array();
$usercache = array();
$titlecache = array();
$linkcache = array();
49
50 # Some oversimplified test types
/* Title
   Bare-bones stand-in for a page title: a namespace number plus the title text.
*/
class Title {
    public $title;
    public $namespace;

    /* Build a Title from its two parts.
       Declared static because it is invoked as Title::fromData() without an
       instance; calling a non-static method that way is a fatal error on
       current PHP.
    */
    public static function fromData( $namespace, $title ) {
        $x = new Title;
        $x->namespace = $namespace;
        $x->title = $title;
        return $x;
    }
}
60
61 # See tests in importTests.php
# Run both passes unless the including script set $testingonly.
# empty() gives the same answer as !$testingonly but does not warn when the
# variable was never defined, which is the normal case for a real run.
if( empty( $testingonly ) ) {
    firstPass();
    secondPass();
}
66
67 # ------------------------------------------------------------------------------
68
69 /* First pass:
70 Information please!
71 */
/* Walk every per-letter page directory under the wiki root and hand each one
   to firstPassDirectory().
*/
function firstPass()
{
    global $wgRootDirectory, $oldtitles;

    # One directory per capital letter, then the catch-all
    $buckets = range( 'A', 'Z' );
    $buckets[] = 'other';

    for( $i = 0; $i < count( $buckets ); $i++ ) {
        $bucket = $buckets[$i];
        firstPassDirectory( "$wgRootDirectory/page/$bucket" );
    }
}
84
/* Scan one page directory (recursing into subdirectories). Every "<name>.db"
   file is recorded in $titlecache and has its outgoing links counted; anything
   else is reported in an SQL comment and skipped.

   $dir  directory to scan
*/
function firstPassDirectory( $dir )
{
    global $titlecache;

    $mydir = is_dir( $dir ) ? opendir( $dir ) : false;
    if( $mydir === false ) {
        echo "-- Couldn't read directory '$dir'. Skipping.\n";
        return;
    }

    # Compare against false explicitly: an entry named "0" must not end the loop
    while( ( $entry = readdir( $mydir ) ) !== false ) {
        if( $entry == '.' || $entry == '..' ) {
            continue;
        }
        if( is_dir( "$dir/$entry" ) ) {
            firstPassDirectory( "$dir/$entry" );
        } elseif( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
            # The file name minus ".db" is the old page title
            $title = $m[1];
            $titlecache[$title] = transformTitle( $title );
            countLinksFrom( $title );
        } else {
            echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
        }
    }
    closedir( $mydir );
}
103
104 /* Second pass:
105 make the dang SQL
106 */
/* Print the SQL for every cached user, then for every cached page, and
   finish with a "Done" marker comment.
*/
function secondPass()
{
    global $titlecache, $usercache, $redirects;

    foreach( $usercache as $legacyName => $unusedUser ) {
        $statement = importUser( $legacyName );
        echo $statement;
    }

    foreach( $titlecache as $legacyTitle => $unusedTitle ) {
        $statement = importPage( $legacyTitle );
        echo $statement;
    }

    echo "\n-- Done!\n";
}
120
121
122 # ------------------------------------------------------------------------------
123
124 /* fetch_ functions
125 Grab a given item from the database
126 */
/* Read one user record and return it as a key => value array, or false when
   the file does not exist.

   $uid  UseMod user id number
*/
function fetchUser( $uid )
{
    global $FS1, $wgRootDirectory;

    # NOTE(review): path follows UseModWiki's user/<id mod 10>/<id>.db layout;
    # confirm against the wiki being imported (public dumps omit this directory).
    $uid = (int)$uid;
    $fname = $wgRootDirectory . "/user/" . ( $uid % 10 ) . "/" . $uid . ".db";
    if( !file_exists( $fname ) ) return false;

    # The record is a flat key/value list separated by $FS1
    $data = splitHash( $FS1, file_get_contents( $fname ) );
    # enough?

    return $data;
}
139
/* Read the current revision of a page.

   $title  old page title (file name without ".db")

   Returns false when the page file is missing, otherwise an array with the
   keys text, summary, minor, ts, username and host (null for absent fields).
*/
function fetchPage( $title )
{
    global $FS1, $FS2, $FS3, $wgRootDirectory;

    # Look where firstPass() scans: page/<A-Z or "other">/<title>.db
    $initial = strtoupper( substr( $title, 0, 1 ) );
    $bucket = preg_match( '/^[A-Z]$/', $initial ) ? $initial : 'other';
    $fname = $wgRootDirectory . "/page/" . $bucket . "/" . $title . ".db";
    if( !file_exists( $fname ) ) return false;

    # Three nested key/value lists: page ($FS1) > section ($FS2) > text ($FS3)
    $page = splitHash( $FS1, file_get_contents( $fname ) );
    $section = splitHash( $FS2, isset( $page["text_default"] ) ? $page["text_default"] : "" );
    $text = splitHash( $FS3, isset( $section["data"] ) ? $section["data"] : "" );

    $result = array();
    foreach( array( "text", "summary", "minor" ) as $key ) {
        $result[$key] = isset( $text[$key] ) ? $text[$key] : null;
    }
    foreach( array( "ts", "username", "host" ) as $key ) {
        $result[$key] = isset( $section[$key] ) ? $section[$key] : null;
    }
    return $result;
}
155
/* Read the kept (old) revisions of a page.

   $title  old page title

   Returns a list of arrays with the keys text, summary, minor, ts, username
   and host; an empty list when there is no keep file. Revisions without text,
   without a minor flag or without a positive timestamp are reported in an SQL
   comment and left out.
*/
function fetchKeptPages( $title )
{
    global $FS1, $FS2, $FS3, $wgRootDirectory;

    # NOTE(review): firstPass() scans page/<letter>/...; the keep files may be
    # bucketed by letter the same way -- confirm against the source wiki.
    $fname = $wgRootDirectory . "/keep/" . $title . ".kp";
    if( !file_exists( $fname ) ) return array();

    # $FS1 has to be imported above, otherwise explode() gets an empty separator
    $keptlist = explode( $FS1, file_get_contents( $fname ) );
    array_shift( $keptlist ); # Drop the junk at beginning of file

    $revisions = array();
    foreach( $keptlist as $rev ) {
        $section = splitHash( $FS2, $rev );
        $text = splitHash( $FS3, isset( $section["data"] ) ? $section["data"] : "" );

        $usable = !empty( $text["text"] )
            && isset( $text["minor"] ) && $text["minor"] != ""
            && isset( $section["ts"] ) && is_numeric( $section["ts"] ) && $section["ts"] > 0;

        if( $usable ) {
            $revisions[] = array(
                "text" => $text["text"],
                "summary" => isset( $text["summary"] ) ? $text["summary"] : null,
                "minor" => $text["minor"],
                "ts" => $section["ts"],
                "username" => isset( $section["username"] ) ? $section["username"] : null,
                "host" => isset( $section["host"] ) ? $section["host"] : null );
        } else {
            echo "-- skipped a bad old revision\n";
        }
    }
    return $revisions;
}
180
/* Turn "key<sep>value<sep>key<sep>value..." into an associative array.
   A trailing key with no value is dropped; a repeated key keeps its last value.
*/
function splitHash ( $sep , $str ) {
    $ret = array();
    foreach ( array_chunk( explode( $sep, $str ), 2 ) as $pair ) {
        # Only complete key/value pairs count
        if ( count( $pair ) == 2 ) {
            $ret[$pair[0]] = $pair[1];
        }
    }
    return $ret;
}
189
190
191 /* import_ functions
192 Take a fetched item and produce SQL
193 */
194
195 /* importUser
196 $uid is the UseMod user id number.
197 The new ones will be assigned arbitrarily and are for internal use only.
198
199 THIS IS DELAYED SINCE PUBLIC DUMPS DONT INCLUDE USER DIR
200 */
/* Produce the INSERT statement for one user.

   $uid  UseMod user id number

   Currently always returns "": user import is switched off, and the draft
   below the early return is kept for when it is enabled.
*/
function importUser( $uid )
{
    global $last_uid, $user_list, $wgTimezoneCorrection;

    # Disabled on purpose; nothing below this line runs yet
    return "";

    $stuff = fetchUser( $uid );
    $last_uid++;

    # fetchUser() hands back an array, so fields are read by key
    $name = wfStrencode( $stuff['username'] );
    # NOTE(review): md5hash() is not defined in this file -- confirm its origin
    $hash = md5hash( $stuff['password'] ); # Doable?
    # Uses the same correction as the page timestamps; +9 becomes +1 with 8*3600
    $tzoffset = $stuff['tzoffset'] - ($wgTimezoneCorrection / 3600);
    $hideminor = ($stuff['rcall'] ? 0 : 1);

    # One "key=value" per line; numeric fields are cast before they reach the SQL
    $options = "cols=" . (int)$stuff['editcols'] . "\n"
        . "rows=" . (int)$stuff['editrows'] . "\n"
        . "rcdays=" . (int)$stuff['rcdays'] . "\n"
        . "timecorrection=" . (float)$tzoffset . "\n"
        . "hideminor=" . $hideminor . "\n";

    $sql = "INSERT\n"
        . "\tINTO user (user_id,user_name,user_password,user_options)\n"
        . "\tVALUES ({$last_uid},'{$name}','{$hash}','{$options}');\n";
    return $sql;
}
226
/* Work out who to credit for a revision.

   $name  user name recorded on the revision; may be empty
   $host  host recorded on the revision; used when there is no name

   Returns array( user id, escaped user text ). The id is 0 unless the name is
   a key of $usercache.
*/
function checkUserCache( $name, $host )
{
    global $usercache;

    # Compare as a string so that a user literally called "0" is still a user
    if( (string)$name !== "" ) {
        # $usercache is keyed by name, so test the key; in_array() would search
        # the stored ids instead
        if( is_array( $usercache ) && isset( $usercache[$name] ) ) {
            $userid = $usercache[$name];
        } else {
            # If we haven't imported user accounts
            $userid = 0;
        }
        $username = wfStrencode( $name );
    } else {
        $userid = 0;
        $username = wfStrencode( $host );
    }
    return array( $userid, $username );
}
245
/* Produce the SQL for one page: optionally a row in cur holding the converted
   text, plus rows in old for the kept history and the original form of the
   current revision.

   $title  old page title

   Returns the SQL as a string; an SQL comment when the page cannot be read.
*/
function importPage( $title )
{
    global $wgTimezoneCorrection;
    global $conversionscript, $conversioncomment, $conversiontime;
    global $historyonly, $lasthistoryonly;

    $page = fetchPage( $title );
    if( !is_array( $page ) ) {
        # Keep the title on one line so it cannot escape the SQL comment
        $shown = str_replace( array( "\r", "\n" ), " ", $title );
        return "\n-- Couldn't read page '$shown'. Skipping.\n";
    }
    $rawtext = (string)$page["text"];

    $newtext = wfStrencode( rewritePage( $title, $rawtext ) );
    $t = renamePage( $title );
    if( !is_object( $t ) ) {
        # renamePage() may hand back nothing; fall back to the plain transform
        $t = transformTitle( $title );
    }
    $newtitle = wfStrencode( $t->title );
    $namespace = (int)$t->namespace;

    # Current revision:
    $text = wfStrencode( $rawtext );
    $comment = wfStrencode( (string)$page["summary"] );
    $minor = ($page["minor"] ? 1 : 0);
    list( $userid, $username ) = checkUserCache( $page["username"], $page["host"] );
    $timestamp = wfUnix2Timestamp( (int)$page["ts"] + $wgTimezoneCorrection );
    $redirect = ( preg_match( '/^#REDIRECT/', $rawtext ) ? 1 : 0 );

    $sql = "\n";
    if( !$historyonly ) {
        $sql .= "INSERT\n"
            . "\tINTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)\n"
            . "\tVALUES ($namespace,'$newtitle','$newtext','$conversioncomment',0,'$conversionscript','$conversiontime',$redirect,$minor);\n";
    }

    # The original form of the current revision goes last in the old insert
    $lastrow = "\t\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$minor)";

    # History: one row per kept revision, each with its own summary and author.
    # Eight values per row, matching the eight columns named below.
    $rows = array();
    if( !$lasthistoryonly ) {
        foreach( fetchKeptPages( $title ) as $rev ) {
            $text = wfStrencode( (string)$rev["text"] );
            $comment = wfStrencode( (string)$rev["summary"] );
            $minor = ($rev["minor"] ? 1 : 0);
            list( $userid, $username ) = checkUserCache( $rev["username"], $rev["host"] );
            $timestamp = wfUnix2Timestamp( (int)$rev["ts"] + $wgTimezoneCorrection );
            $rows[] = "\t\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$minor)";
        }
    }
    $rows[] = $lastrow;

    $sql .= "INSERT\n"
        . "\tINTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,old_minor_edit)\n"
        . "\tVALUES\n"
        . implode( ",\n", $rows ) . ";\n";
    return $sql;
}
289
290
291 # Count up basic links
/* Count every [[link]] in a page's current text, skipping <nowiki> sections.
   Each link target is passed, first letter capitalised, to countLinkTo().

   $title  old title of the page to scan
*/
function countLinksFrom( $title )
{
    $page = fetchPage( $title );
    if( !is_array( $page ) ) {
        return; # Nothing to scan
    }

    $text = preg_replace(
        '/<nowiki>.*<\/nowiki>/sDU',
        '',
        (string)$page["text"] );

    # Collect the matches and loop over them; the /e (evaluate) modifier that
    # used to drive this was removed from PHP
    $found = preg_match_all(
        '/\[\[\s*([0-9a-zA-Z_ \x80-\xff]+)\s*(?:\|\s*([^]]+))?\s*\]\]/',
        (string)$text,
        $matches );
    if( $found ) {
        foreach( $matches[1] as $target ) {
            countLinkTo( ucfirst( $target ) );
        }
    }
}
304
/* Record one link to $title in $linkcache, counted under the normalised form
   of its transformed title.

   $title  link target as written (first letter already capitalised by caller)
*/
function countLinkTo( $title )
{
    global $linkcache;
    $t = transformTitle( $title );
    $linkform = FreeToNormal( $t->title );

    # Create the per-title bucket on first sight instead of calling count() on
    # a missing entry
    if( !isset( $linkcache[$title] ) || !is_array( $linkcache[$title] ) ) {
        $linkcache[$title] = array();
    }
    if( isset( $linkcache[$title][$linkform] ) ) {
        $linkcache[$title][$linkform]++;
    } else {
        $linkcache[$title][$linkform] = 1;
    }
}
320
321 # Preferentially change case
/* Pick the title a page will be imported under.

   $title  old page title

   Returns a Title. When another spelling is linked to more often than the
   transformed title, that spelling becomes the title and the redirect SQL
   from doRenamePage() is printed (all SQL from this script goes to stdout).
*/
function renamePage( $title )
{
    global $linkcache;
    $t = transformTitle( $title );

    $forms = array();
    if( isset( $linkcache[$title] ) && is_array( $linkcache[$title] ) ) {
        $forms = $linkcache[$title];
    }

    # We want to use the most frequently linked-to form as the title
    $maxcount = 0;
    $maxform = $t->title;
    foreach( $forms as $linkform => $count ) {
        if( $count > $maxcount ) {
            $maxcount = $count;
            $maxform = (string)$linkform; # numeric-looking keys come back as ints
        }
    }
    if( $maxform != $t->title ) {
        echo doRenamePage( $t, $maxform, $forms );
        $t->title = $maxform;
    }
    return $t;
}

/* Build redirect rows pointing every other link form at the chosen one.

   $title      Title of the page (a plain title string is also accepted)
   $maxform    the form that won
   $linkforms  optional map of link form => count; looked up in $linkcache by
               title text when omitted

   Returns one INSERT statement, or "" when there is nothing to redirect.
*/
function doRenamePage( $title, $maxform, $linkforms = null )
{
    global $linkcache, $namespaces, $redirectcomment, $conversionscript, $conversiontime;

    $titletext = is_object( $title ) ? $title->title : $title;
    $namespace = is_object( $title ) ? (int)$title->namespace : 0;
    if( $linkforms === null ) {
        # An object cannot be an array key, so the cache is indexed by title text
        $linkforms = ( isset( $linkcache[$titletext] ) && is_array( $linkcache[$titletext] ) )
            ? $linkcache[$titletext] : array();
    }

    $prefix = isset( $namespaces[$namespace] ) ? $namespaces[$namespace] : "";
    $comment = wfStrencode( str_replace( '$1', $maxform, $redirectcomment ) );
    $redirtext = wfStrencode( "#REDIRECT [[" . $prefix . $maxform . "]]" );

    $redirsql = array();
    foreach( $linkforms as $linkform => $count ) {
        if( $linkform != $maxform ) {
            $redirtitle = wfStrencode( (string)$linkform );
            # Nine values, one per column named in the INSERT below
            $redirsql[] = "($namespace,'$redirtitle','$redirtext','$comment',0,'$conversionscript','$conversiontime',1,1)";
        }
    }
    if( count( $redirsql ) == 0 ) {
        return ""; # An INSERT with no rows would be invalid SQL
    }

    $sql = "INSERT INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)\n"
        . "\tVALUES ";
    $sql .= implode( ",\n\t", $redirsql ) . ";\n";
    return $sql;
}
355
356 # Account for syntax changes
/* Convert a page's wikitext to the new syntax.

   $title  old page title, needed for subpage links
   $text   page text

   The talk link is stripped first; then everything outside
   <nowiki>...</nowiki> goes through rewritePageBits(), and the nowiki
   sections are kept untouched.
*/
function rewritePage( $title, $text )
{
    # ...
    $text = removeTalkLink( $text );

    # With DELIM_CAPTURE the pieces alternate: even = normal text, odd = nowiki
    $pieces = preg_split( '/(<nowiki>.*?<\/nowiki>)/s', (string)$text, -1,
        PREG_SPLIT_DELIM_CAPTURE );
    if( $pieces === false ) {
        return $text; # Regex failure; leave the text as it is
    }
    foreach( $pieces as $i => $piece ) {
        if( $i % 2 == 0 ) {
            $pieces[$i] = rewritePageBits( $title, $piece );
        }
    }
    return implode( '', $pieces );
}
366
/* Apply the link conversions to one stretch of ordinary wikitext.

   $title  old page title
   $text   the text to convert
*/
function rewritePageBits( $title, $text ) {
    # fixSubpages() takes the text first and the title second
    $text = fixSubpages( $text, $title );
    $text = fixMedialinks( $text );
    $text = fixImagelinks( $text );
    return $text;
}
373
/* Return $text with every "/<talkending>" link removed, bracketed or bare,
   together with the newlines before it and the whitespace after it.
*/
function removeTalkLink( &$text ) {
    global $talkending;

    $pattern = "[\\n*(?:\[\[)?/{$talkending}(?:\]\])?\\s*]sDi";
    $stripped = preg_replace( $pattern, '', $text );
    return $stripped;
}
378
/* Rewrite relative subpage links so they name their parent page.

   $text   wikitext to convert
   $title  title of the page the text belongs to

   Bare "/Sub" and "[[/sub]]" become "[[Title/Sub|/sub]]"; "[[/sub|label]]"
   becomes "[[Title/Sub|label]]".
*/
function fixSubpages( $text, &$title ) {
    $parent = $title;

    # Callbacks replace the /e (evaluate) modifier, which was removed from PHP.
    # They also stop the title from being interpreted as code or as a
    # replacement reference.
    $text = preg_replace_callback( "<(^|\s)/([A-Z\xc0-\xdf].*?)\b>",
        function( $m ) use ( $parent ) {
            return $m[1] . "[[" . $parent . "/" . $m[2] . "|/" . $m[2] . "]]";
        }, $text );
    $text = preg_replace_callback( "<\[\[/([^|]*?)\]\]>",
        function( $m ) use ( $parent ) {
            return "[[" . $parent . "/" . ucfirst( $m[1] . "|/" . $m[1] . "]]" );
        }, $text );
    $text = preg_replace_callback( "<\[\[/(.*?)\]\]>",
        function( $m ) use ( $parent ) {
            return "[[" . $parent . "/" . ucfirst( $m[1] . "]]" );
        }, $text );
    return $text;
}
389
/* Turn image URLs that match $imageimport into links in the image namespace
   ($namespaces[6]); the link target comes from fetchMediaFile().
*/
function fixImagelinks( &$text ) {
    global $imageimport, $namespaces;

    $prefix = $namespaces[6];
    # Callback in place of the /e (evaluate) modifier, which was removed from PHP
    return preg_replace_callback( "/$imageimport/",
        function( $m ) use ( $prefix ) {
            # $m[1] is the whole URL, $m[2] the file name
            return "[[" . $prefix . fetchMediaFile( $m[1], $m[2] ) . "]]";
        },
        $text );
}
396
/* Turn bracketed image URLs into media links: "[url]" becomes
   "[[<mediatext>:File]]" and "[url label]" becomes "[[<mediatext>:File|label]]".
*/
function fixMedialinks( &$text ) {
    global $imageimport, $mediatext;

    $prefix = $mediatext;
    # Callbacks in place of the /e (evaluate) modifier, which was removed from PHP
    $text = preg_replace_callback( '/\[' . $imageimport . '\]/',
        function( $m ) use ( $prefix ) {
            return "[[" . $prefix . ":" . fetchMediaFile( $m[1], $m[2] ) . "]]";
        },
        $text );
    return preg_replace_callback( '/\[' . $imageimport . ' (.+?)\]/',
        function( $m ) use ( $prefix ) {
            # $m[3] is the link label that followed the URL
            return "[[" . $prefix . ":" . fetchMediaFile( $m[1], $m[2] ) . "|" . $m[3] . "]]";
        },
        $text );
}
406
/* Return the name an imported media file gets: the file name with its first
   letter capitalised. The URL is not used yet.
*/
function fetchMediaFile( $url, $filename )
{
    # Copy an image file into local upload space
    # FIXME
    $localname = ucfirst( $filename );
    return $localname;
}
413
414 # Simple move of talk pages, etc
/* Split an old title into namespace and title text.

   $title     old page title
   $dorename  accepted but not used

   "Something/<talkending>" (case-insensitive, optional space or underscore
   around the slash) gives namespace 1 with the part before the slash;
   anything else gives namespace 0 with the title unchanged.
*/
function transformTitle( $title, $dorename = false )
{
    global $talkending;
    if( preg_match( "/^(.+)[ _]?\\/[ _]?($talkending)/i", $title, $m ) ) {
        $thetitle = $m[1];
        $namespace = 1;
    } else {
        $thetitle = $title;
        $namespace = 0;
    }

    # Build the object directly rather than through a static call to a method
    # that may not be declared static
    $t = new Title;
    $t->namespace = $namespace;
    $t->title = $thetitle;
    return $t;
}
427
428 # Translated out of old usemod wiki...
/* Normalise a free-form link into the stored title form.

   $id         link text
   $FreeUpper  when true, also capitalise a lowercase letter that follows one
               of - _ . , ( ) /

   Spaces become underscores, the first letter is capitalised, runs of
   underscores collapse, and underscores at either end or next to a slash
   are removed.
*/
function FreeToNormal ( $id , $FreeUpper = true ) {
    $id = str_replace ( " ", "_", $id ) ;
    $id = ucfirst($id);
    if (strpos($id, '_') !== false) { # Quick check for any space/underscores
        $id = preg_replace ( '/__+/' , "_" , $id ) ;
        $id = preg_replace ( '/^_/' , "", $id ) ;
        $id = preg_replace ( '/_$/' , "", $id ) ;
        #if ($UseSubpage) {
        $id = preg_replace ( '|_/|', "/" , $id ) ;
        $id = preg_replace ( '|/_|', "/" , $id ) ;
        #}
    }
    if ($FreeUpper) {
        # Note that letters after ' are *not* capitalized
        if (preg_match ( '|[-_.,\(\)/][a-z]|' , $id ) ) { # Quick check for non-canon
            # Callback in place of the /e (evaluate) modifier, which was removed from PHP
            $id = preg_replace_callback ( '|([-_.,\(\)/])([a-z])|' ,
                function ( $m ) {
                    return $m[1] . strtoupper( $m[2] );
                } , $id ) ;
        }
    }
    return $id;
}
449
450 # Whee!
/* Placeholder for character-set conversion; hands the text back unchanged. */
function recodeInput( $text )
{
    $recoded = $text;
    return $recoded;
}
455
/* Format a Unix time as a 14-digit "YmdHis" timestamp in GMT.

   $unixtime  seconds since the epoch
*/
function wfUnix2Timestamp( $unixtime ) {
    # Format the argument itself; an undefined variable here would make
    # gmdate() fall back to the current time
    return gmdate( "YmdHis", (int)$unixtime );
}
459
/* Convert a 14-digit "YmdHis" timestamp (read as GMT) to a Unix time. */
function wfTimestamp2Unix( $ts )
{
    # Cut the fixed-width fields out in reading order
    $year   = (int)substr( $ts, 0, 4 );
    $month  = (int)substr( $ts, 4, 2 );
    $day    = (int)substr( $ts, 6, 2 );
    $hour   = (int)substr( $ts, 8, 2 );
    $minute = (int)substr( $ts, 10, 2 );
    $second = (int)substr( $ts, 12, 2 );

    return gmmktime( $hour, $minute, $second, $month, $day, $year );
}
467
468 ?>