 * Import data from a UseModWiki into a MediaWiki wiki
5 * 2003-02-09 Brion VIBBER <brion@pobox.com>
6 * Based loosely on Magnus's code from 2001-2002
8 * Updated limited version to get something working temporarily
10 * Be sure to run the link & index rebuilding scripts!
12 * Some more munging for charsets etc
15 * Partial fix for pages starting with lowercase letters (??)
16 * and CamelCase and /Subpage link conversion
21 * @subpackage Maintenance
# Refuse to run under a web server: this script must only be run from
# the command line, after the settings below have been customized.
if( php_sapi_name() != 'cli' ) {
	die( "Please customize the settings and run me from the command line." );
}

/** Set these correctly! */
$wgImportEncoding = "CP1252"; /* We convert all to UTF-8 */
$wgRootDirectory = "/kalman/Projects/wiki2002/wiki/lib-http/db/wiki";

/* On a large wiki, you might run out of memory */
@ini_set( 'memory_limit', '40M' );

# UseModWiki separates record fields with this byte.
$wgFieldSeparator = "\xb3"; # Some wikis may use different char
$FS = $wgFieldSeparator ;

# Every row written by this conversion run is stamped with one shared time.
$conversiontime = wfTimestampNow(); # Conversions will be marked with this timestamp
48 # ------------------------------------------------------------------------------
/**
 * Walk the whole UseMod page database and import every page.
 * UseMod shards page files into per-first-letter directories
 * (A-Z, plus "other" for titles that don't start with a letter).
 */
function importPages()
{
	global $wgRootDirectory;

	$letters = array(
		'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
		'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
		'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'other' );
	foreach( $letters as $letter ) {
		$dir = "$wgRootDirectory/page/$letter";
		# Small wikis may not have a directory for every letter.
		if( is_dir( $dir ) ) {
			importPageDirectory( $dir );
		}
	}
}
/**
 * Import every page file (*.db) found in one directory, recursing
 * into subdirectories, which hold UseMod subpages.
 *
 * @param string $dir    Absolute directory path to scan.
 * @param string $prefix Title prefix prepended to entries found here,
 *                       e.g. "ParentPage/" while inside a subpage dir.
 */
function importPageDirectory( $dir, $prefix = "" )
{
	echo "\n-- Checking page directory $dir\n";
	$mydir = opendir( $dir );
	# Compare against false explicitly so an entry named "0" can't end the loop.
	while( ( $entry = readdir( $mydir ) ) !== false ) {
		if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			# importPage() returns the generated SQL; emit it to stdout.
			echo importPage( $prefix . $m[1] );
		} elseif( is_dir( "$dir/$entry" ) ) {
			if( $entry != '.' && $entry != '..' ) {
				# Subpage directory: entries become "Dir/Page" titles.
				importPageDirectory( "$dir/$entry", "$entry/" );
			}
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}
	closedir( $mydir ); # don't leak directory handles while recursing
}
85 # ------------------------------------------------------------------------------
88 Grab a given item from the database
/**
 * Fetch a UseMod user record by user id.
 * NOT IMPLEMENTED: public UseMod dumps don't include the user
 * directory, so this dies immediately. Everything after the die()
 * is unreachable and kept only as a sketch for a future implementation.
 */
function fetchUser( $uid )
{
	die ("fetchUser not implemented" );

	# --- unreachable from here down ---
	global $FS,$FS2,$FS3, $wgRootDirectory;

	# FIXME(review): $title is undefined here — the parameter is $uid.
	$fname = $wgRootDirectory . "/page/" . $title;
	if( !file_exists( $fname ) ) return false;

	# FIXME(review): splitHash() takes ( $sep, $str ); this call passes one arg.
	$data = splitHash( implode( "", file( $fname ) ) );
}
/**
 * Map a page title to its UseMod database file path fragment.
 * UseMod shards pages by first letter into "A/".."Z/" directories;
 * titles that don't start with a letter go under "other/".
 */
function useModFilename( $title ) {
	$first = substr( $title, 0, 1 );
	if( !preg_match( '/[A-Z]/i', $first ) ) {
		return "other/$title";
	}
	return strtoupper( $first ) . "/$title";
}
/**
 * Load the current revision of a UseMod page from its .db file.
 * Dies if the page file doesn't exist.
 *
 * The file is a nested field-separated hash: page record ($FS1) ->
 * "text_default" section ($FS2) -> "data" text record ($FS3).
 *
 * @param string $title UseMod page title
 * @return object with text/summary/minor/ts/username/host fields
 */
function fetchPage( $title )
{
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/page/" . useModFilename( $title ) . ".db";
	if( !file_exists( $fname ) ) {
		die( "Couldn't open file '$fname' for page '$title'.\n" );
	}

	$page = splitHash( $FS1, file_get_contents( $fname ) );
	$section = splitHash( $FS2, $page["text_default"] );
	$text = splitHash( $FS3, $section["data"] );

	return array2object( array(
		"text" => $text["text"] ,
		"summary" => $text["summary"] ,
		"minor" => $text["minor"] ,
		"ts" => $section["ts"] ,
		"username" => $section["username"] ,
		"host" => $section["host"] ) );
}
/**
 * Load the archived ("kept") old revisions of a page from its .kp file.
 * Returns an empty array when the page has no keep file.
 *
 * Revisions with no text, no minor flag, or a non-positive timestamp
 * are treated as corrupt and skipped with a notice.
 *
 * @param string $title UseMod page title
 * @return array of revision objects (text/summary/minor/ts/username/host)
 */
function fetchKeptPages( $title )
{
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory, $wgTimezoneCorrection;

	$fname = $wgRootDirectory . "/keep/" . useModFilename( $title ) . ".kp";
	if( !file_exists( $fname ) ) return array();

	$keptlist = explode( $FS1, file_get_contents( $fname ) );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = splitHash( $FS3, $section["data"] );
		# "*1" coerces the timestamp to a number for the sanity check.
		if ( $text["text"] && $text["minor"] != "" && ( $section["ts"]*1 > 0 ) ) {
			array_push( $revisions, array2object( array(
				"text" => $text["text"] ,
				"summary" => $text["summary"] ,
				"minor" => $text["minor"] ,
				"ts" => $section["ts"] ,
				"username" => $section["username"] ,
				"host" => $section["host"] ) ) );
		} else {
			echo "-- skipped a bad old revision\n";
		}
	}
	return $revisions;
}
/**
 * Split a UseMod field-separated record into an associative array.
 * After explode(), the pieces alternate key, value, key, value...;
 * a trailing key with no value is dropped.
 *
 * @param string $sep Separator (a single byte in UseMod databases).
 * @param string $str Raw record text.
 * @return array key => value map (empty for input with no pairs).
 */
function splitHash ( $sep , $str ) {
	$parts = explode( $sep, $str );
	$ret = array(); # was uninitialized: pair-less input returned undefined
	# Even indexes are keys, odd indexes their values. Stepping by 2 also
	# avoids the original's order-sensitive "$ret[$temp[$i]] = $temp[++$i]".
	for ( $i = 0; $i + 1 < count( $parts ); $i += 2 ) {
		$ret[$parts[$i]] = $parts[$i + 1];
	}
	return $ret;
}
167 Take a fetched item and produce SQL
171 $uid is the UseMod user id number.
172 The new ones will be assigned arbitrarily and are for internal use only.
User import is deferred, since public UseMod dumps don't include the user directory.
/**
 * Produce SQL to create a MediaWiki user row from a UseMod user id.
 * NOT IMPLEMENTED: dies immediately (public dumps lack the user
 * directory); everything after the die() is dead code kept as a sketch.
 */
function importUser( $uid )
{
	global $last_uid, $user_list, $wgTimestampCorrection;

	die("importUser NYI");

	# --- unreachable from here down ---
	$stuff = fetchUser( $uid );

	$name = wfStrencode( $stuff->username );
	# FIXME(review): md5hash() is not a standard PHP function; md5() was
	# probably intended — and $stuff is used as an object here but as an
	# array below; one of the two access styles is wrong.
	$hash = md5hash( $stuff->password ); # Doable?
	$tzoffset = $stuff['tzoffset'] - ($wgTimestampCorrection / 3600); # -8 to 0; +9 to +1
	$hideminor = ($stuff['rcall'] ? 0 : 1);
	# User preference blob: one "key=value" per line, stored verbatim.
	$options = "cols={$stuff['editcols']}
rows={$stuff['editrows']}
rcdays={$stuff['rcdays']}
timecorrection={$tzoffset}
hideminor={$hideminor}
";

	$sql = "INSERT
INTO user (user_id,user_name,user_password,user_options)
VALUES ({$last_uid},'{$name}','{$hash}','{$options}');\n";
	return $sql;
}
/**
 * Map a UseMod (username, host) pair to a (user_id, user_text) pair
 * for edit attribution. Since user account import is disabled, edits
 * are attributed to user id 0, by name when one was recorded and by
 * host otherwise.
 *
 * NOTE(review): the if/else framework and the $usercache declaration
 * were lost in extraction; restored conservatively — verify against a
 * pristine copy of this script.
 */
function checkUserCache( $name, $host )
{
	global $usercache;

	$userid = 0;
	if( $name ) {
		# FIXME(review): in_array() tests VALUES, but the lookup below is
		# by key — array_key_exists( $name, $usercache ) was likely intended.
		if( in_array( $name, $usercache ) ) {
			$userid = $usercache[$name];
		}
		# If we haven't imported user accounts
		$username = wfStrencode( $name );
	} else {
		$username = wfStrencode( $host );
	}
	return array( $userid, $username );
}
/**
 * Produce the SQL INSERTs (one 'cur' row plus 'old' history rows) for
 * one UseMod page. If link-format munging changes the text, the munged
 * version becomes a new top revision credited to "Conversion script"
 * and the original last version is pushed into the history.
 *
 * NOTE(review): several structural lines were lost in extraction
 * ($namespace initialization, the SQL statement openers, the $any
 * flag, branch braces and the final return); restored below — verify
 * against a pristine copy of this script.
 *
 * @param string $title UseMod page title
 * @return string SQL statements ready to feed to MySQL
 */
function importPage( $title )
{
	global $conversiontime;

	echo "\n-- Importing page $title\n";
	$page = fetchPage( $title );

	$namespace = 0; # everything imports into the main namespace
	$newtitle = wfStrencode( recodeText( $title ) );

	$munged = mungeFormat( $page->text );
	if( $munged != $page->text ) {
		/**
		 * Save a *new* revision with the conversion, and put the
		 * previous last version into the history.
		 */
		$text = wfStrencode( recodeText( $munged ) );
		$comment = "link fix";
		$minor = 0;
		$userid = 0;
		$username = "Conversion script";
		$timestamp = wfUnix2Timestamp( time() );
		$redirect = ( preg_match( '/^#REDIRECT/', $page->text ) ? 1 : 0 );
		$random = mt_rand() / mt_getrandmax();
		# Keep the pre-munge revision so it lands in the history below.
		$revisions = array( $page );
	} else {
		# Text unchanged: import the last revision as-is, with its
		# original author, summary and timestamp.
		$text = wfStrencode( recodeText( $page->text ) );
		$comment = wfStrencode( recodeText( $page->summary ) );
		$minor = ($page->minor ? 1 : 0);
		list( $userid, $username ) = checkUserCache( $page->username, $page->host );
		$username = wfStrencode( recodeText( $username ) );
		$timestamp = wfUnix2Timestamp( $page->ts );
		$redirect = ( preg_match( '/^#REDIRECT/', $page->text ) ? 1 : 0 );
		$random = mt_rand() / mt_getrandmax();
		$revisions = array();
	}
	$sql = "INSERT
INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_touched,cur_minor_edit,cur_is_redirect,cur_random) VALUES
($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$conversiontime',$minor,$redirect,$random);\n";

	# Archived old revisions go into the 'old' (history) table.
	$revisions = array_merge( $revisions, fetchKeptPages( $title ) );
	if(count( $revisions ) == 0 ) {
		return $sql;
	}

	$sql .= "INSERT
INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,old_minor_edit) VALUES\n";
	$any = false; # whether a row tuple has been emitted yet (comma control)
	foreach( $revisions as $rev ) {
		$text = wfStrencode( recodeText( $rev->text ) );
		$minor = ($rev->minor ? 1 : 0);
		list( $userid, $username ) = checkUserCache( $rev->username, $rev->host );
		$username = wfStrencode( recodeText( $username ) );
		$timestamp = wfUnix2Timestamp( $rev->ts );
		$comment = wfStrencode( recodeText( $rev->summary ) );

		if($any) $sql .= ",";
		$sql .= "\n\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$minor)";
		$any = true;
	}
	$sql .= ";\n";
	return $sql;
}
/**
 * Convert imported text to UTF-8: normalize line endings, transcode
 * from the wiki's legacy encoding ($wgImportEncoding, e.g. CP1252 for
 * latin-1 wikis), then decode literal numeric character references.
 *
 * @param string $string Raw text from the UseMod database.
 * @return string UTF-8 text.
 */
function recodeText( $string ) {
	global $wgImportEncoding;
	# For currently latin-1 wikis
	$string = str_replace( "\r\n", "\n", $string );
	# "@" suppresses iconv notices about untranslatable byte sequences.
	$string = @iconv( $wgImportEncoding, "UTF-8", $string );
	$string = wfMungeToUtf8( $string ); # Any old &#1234; entity stuff
	return $string;
}
/**
 * Encode a single Unicode codepoint as a UTF-8 byte string.
 * Codepoints beyond U+10FFFF (not representable in UTF-8) are returned
 * as a numeric character reference instead.
 *
 * @param int $codepoint Unicode codepoint value.
 * @return string UTF-8 bytes (1-4 of them), or "&#N;" when out of range.
 */
function wfUtf8Sequence($codepoint) {
	if($codepoint < 0x80) return chr($codepoint);
	if($codepoint < 0x800) return chr($codepoint >> 6 & 0x3f | 0xc0) .
		chr($codepoint & 0x3f | 0x80);
	if($codepoint < 0x10000) return chr($codepoint >> 12 & 0x0f | 0xe0) .
		chr($codepoint >> 6 & 0x3f | 0x80) .
		chr($codepoint & 0x3f | 0x80);
	# Fixed: was "< 0x100000", which wrongly excluded U+100000..U+10FFFF;
	# the 4-byte form covers the full Unicode range up to U+10FFFF.
	if($codepoint < 0x110000) return chr($codepoint >> 18 & 0x07 | 0xf0) .
		chr($codepoint >> 12 & 0x3f | 0x80) .
		chr($codepoint >> 6 & 0x3f | 0x80) .
		chr($codepoint & 0x3f | 0x80);
	# Beyond the Unicode range: leave as a numeric entity.
	return "&#$codepoint;";
}
/**
 * Replace decimal (&#1234;) and hexadecimal (&#xABC;) numeric character
 * references in a string with their UTF-8 byte sequences.
 * The original used the /e (eval) regex modifier, which was removed in
 * PHP 7; preg_replace_callback() is the supported equivalent.
 */
function wfMungeToUtf8($string) {
	$string = preg_replace_callback( '/&#([0-9]+);/',
		function ( $m ) { return wfUtf8Sequence( intval( $m[1] ) ); },
		$string );
	$string = preg_replace_callback( '/&#x([0-9a-f]+);/i',
		function ( $m ) { return wfUtf8Sequence( hexdec( $m[1] ) ); },
		$string );
	# Should also do named entities here
	return $string;
}
/**
 * Escape a string for inclusion in a single-quoted MySQL literal.
 * mysql_escape_string() was removed in PHP 7; when it's unavailable,
 * fall back to a manual replacement escaping the same characters
 * (backslash first, then NUL, newline, CR, quotes and ^Z).
 */
function wfStrencode( $string ) {
	if ( function_exists( 'mysql_escape_string' ) ) {
		return mysql_escape_string( $string );
	}
	return str_replace(
		array( "\\", "\0", "\n", "\r", "'", "\"", "\x1a" ),
		array( "\\\\", "\\0", "\\n", "\\r", "\\'", "\\\"", "\\Z" ),
		$string );
}
/**
 * Convert a Unix epoch time to the 14-digit YYYYMMDDHHMMSS
 * timestamp format MediaWiki stores in the database (UTC).
 */
function wfUnix2Timestamp( $unixtime ) {
	$dbFormatted = gmdate( "YmdHis", $unixtime );
	return $dbFormatted;
}
/**
 * Convert a 14-digit YYYYMMDDHHMMSS database timestamp back to a
 * Unix epoch time, treating the fields as UTC.
 */
function wfTimestamp2Unix( $ts )
{
	# Unpack the fixed-width fields in one pass.
	list( $year, $month, $day, $hour, $min, $sec ) =
		sscanf( $ts, "%4d%2d%2d%2d%2d%2d" );
	return gmmktime( $hour, $min, $sec, $month, $day, $year );
}
/**
 * Current UTC time in the 14-digit YYYYMMDDHHMMSS database format.
 */
function wfTimestampNow() {
	return gmdate( "YmdHis", time() );
}
# Sorting hack for MySQL 3, which doesn't use index sorts for DESC
/**
 * Invert each digit of a YYYYMMDDHHMMSS timestamp (0<=>9, 1<=>8, ...)
 * so an ascending index sort yields reverse-chronological order.
 * NOTE(review): the body was lost in extraction; restored from the
 * standard MediaWiki implementation of this helper — verify against
 * a pristine copy of this script.
 */
function wfInvertTimestamp( $ts ) {
	return strtr( $ts, "0123456789", "9876543210" );
}
/**
 * Seed PHP's Mersenne Twister once per run, guarded by the
 * $wgRandomSeeded global so repeated calls are no-ops.
 * (Modern PHP self-seeds; the explicit seed preserves old behavior.)
 */
function wfSeedRandom()
{
	global $wgRandomSeeded;

	if ( !$wgRandomSeeded ) {
		# 31-bit positive seed derived from the current microtime.
		$seed = hexdec(substr(md5(microtime()),-8)) & 0x7fffffff;
		mt_srand( $seed );
		$wgRandomSeeded = true;
	}
}
/**
 * Convert an associative array into a stdClass object,
 * mapping each key to a property of the same name.
 */
function array2object( $arr ) {
	$o = new stdClass();
	foreach( $arr as $x => $y ) {
		$o->$x = $y;
	}
	return $o;
}
372 * Make CamelCase and /Talk links work
/**
 * Convert UseMod's implicit link styles (CamelCase words and /Subpage
 * references) into explicit [[bracketed]] links. <nowiki> sections,
 * raw URLs and existing [[links]] are lifted out first (replaced by a
 * placeholder token) so they aren't munged, then spliced back in.
 *
 * NOTE(review): the original's final pass used the /e eval regex
 * modifier, removed in PHP 7; replaced by an explode/splice loop.
 */
function mungeFormat( $text ) {
	global $nowiki;
	$nowiki = array();
	$staged = preg_replace_callback(
		'/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s',
		'nowikiPlaceholder', $text );

	# This is probably not 100% correct, I'm just
	# glancing at the UseModWiki code.
	$upper = "[A-Z]";
	$lower = "[a-z_0-9]";
	$any = "[A-Za-z_0-9]";
	$camel = "(?:$upper+$lower+$upper+$any*)";
	$subpage = "(?:\\/$any+)";
	$substart = "(?:\\/$upper$any*)";

	$munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/",
		'[[$1]]', $staged );

	# Splice the protected chunks back in place of the placeholders,
	# in the same order they were extracted.
	$pieces = explode( placeholder(), $munged );
	$final = array_shift( $pieces );
	foreach ( $pieces as $piece ) {
		$final .= array_shift( $nowiki ) . $piece;
	}
	return $final;
}
/**
 * Sentinel token marking where a protected chunk was lifted out of
 * the text by mungeFormat(). The $x parameter is ignored; it exists
 * so this function can be used directly as a preg callback.
 */
function placeholder( $x = null ) {
	return '\xff' . 'placeholder' . '\xff';
}
/**
 * preg_replace_callback() helper for mungeFormat(): stashes the
 * matched chunk on the global $nowiki queue and returns the
 * placeholder token marking where to splice it back later.
 */
function nowikiPlaceholder( $matches ) {
	global $nowiki; # restored: the stash must outlive this call
	$nowiki[] = $matches[1];
	return placeholder();
}