Replace spaces with tabs
[lhc/web/wiklou.git] / maintenance / importUseModWiki.php
1 <?php
2 /**
3 * Import data from a UseModWiki into a MediaWiki wiki
4 * 2003-02-09 Brion VIBBER <brion@pobox.com>
5 * Based loosely on Magnus's code from 2001-2002
6 *
7 * Updated limited version to get something working temporarily
8 * 2003-10-09
9 * Be sure to run the link & index rebuilding scripts!
10 *
11 * Some more munging for charsets etc
12 * 2003-11-28
13 *
14 * Partial fix for pages starting with lowercase letters (??)
15 * and CamelCase and /Subpage link conversion
16 * 2004-11-17
17 *
18 * Rewrite output to create Special:Export format for import
19 * instead of raw SQL. Should be 'future-proof' against future
20 * schema changes.
21 * 2005-03-14
22 *
23 * This program is free software; you can redistribute it and/or modify
24 * it under the terms of the GNU General Public License as published by
25 * the Free Software Foundation; either version 2 of the License, or
26 * (at your option) any later version.
27 *
28 * This program is distributed in the hope that it will be useful,
29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 * GNU General Public License for more details.
32 *
33 * You should have received a copy of the GNU General Public License along
34 * with this program; if not, write to the Free Software Foundation, Inc.,
35 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
36 * http://www.gnu.org/copyleft/gpl.html
37 *
38 * @todo document
39 * @file
40 * @ingroup Maintenance
41 */
42
43 require_once( "Maintenance.php" );
44
/**
 * Maintenance script that reads the page database of a UseModWiki
 * installation and prints it to standard output as an XML dump
 * (export-0.1 schema), one <page> element per UseMod page.
 *
 * Current revisions are read from <root>/page/<letter>/<title>.db and old
 * revisions from <root>/keep/<letter>/<title>.kp.
 */
class ImportUseModWiki extends Maintenance {

	/**
	 * Encoding of the UseMod data, as understood by iconv()
	 * @var String
	 */
	private $encoding = '';

	/**
	 * Root directory of the UseMod wiki (the one holding page/ and keep/)
	 * @var String
	 */
	private $rootDirectory = '';

	/**
	 * Field separators
	 * @var String
	 */
	private $FS1 = '';
	private $FS2 = '';
	private $FS3 = '';

	/**
	 * Map of user name => user id. Nothing in this class fills it, so every
	 * lookup currently yields id 0.
	 * @var Array
	 */
	private $usercache = array();

	/**
	 * Queue of text spans that mungeFormat() has temporarily replaced by a
	 * placeholder, in order of appearance.
	 * @var Array
	 */
	private $nowiki = array();

	public function __construct() {
		parent::__construct();
		$this->mDescription = "Import pages from UseMod wikis";
		$this->addOption( 'encoding', 'Encoding of the imported text, default CP1252', false, true );
		/**
		 * If UseModWiki's New File System is used:
		 * $NewFS = 1; # 1 = new multibyte $FS, 0 = old $FS
		 * Use "\xb3"; for the Old File System
		 * Changed with UTF-8 UseModWiki
		 * http://www.usemod.com/cgi-bin/wiki.pl?SupportForUtf8
		 * http://www.usemod.com/cgi-bin/wiki.pl?WikiBugs/NewFieldSeparatorWronglyTreated
		 * http://www.meatballwiki.org/wiki/WikiEngine#Q_amp_A
		 */
		$this->addOption( 'separator', 'Field separator to use, default \x1E\xFF\xFE\x1E', false, true );
		$this->addArg( 'path', 'Path to your UseMod wiki' );
	}

	/**
	 * Entry point: read the options, print the XML header, walk the
	 * page/A .. page/Z and page/other directories, print the XML footer.
	 */
	public function execute() {
		$this->rootDirectory = $this->getArg();
		$this->encoding = $this->getOption( 'encoding', 'CP1252' );
		$sep = $this->getOption( 'separator', "\x1E\xFF\xFE\x1E" );
		$this->FS1 = "{$sep}1";
		$this->FS2 = "{$sep}2";
		$this->FS3 = "{$sep}3";

		echo <<<XML
<?xml version="1.0" encoding="UTF-8" ?>
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.1/"
           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
           xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.1/
                               http://www.mediawiki.org/xml/export-0.1.xsd"
           version="0.1"
           xml:lang="en">
<!-- generated by importUseModWiki.php -->

XML;
		$letters = array_merge( range( 'A', 'Z' ), array( 'other' ) );
		foreach ( $letters as $letter ) {
			$dir = "{$this->rootDirectory}/page/$letter";
			if ( is_dir( $dir ) ) {
				$this->importPageDirectory( $dir );
			}
		}
		echo <<<XML
</mediawiki>

XML;
	}

	/**
	 * Print the XML for every *.db file in a directory, recursing into
	 * subdirectories (which hold subpages).
	 *
	 * @param $dir String Directory to scan
	 * @param $prefix String Title prefix prepended to every page found in $dir
	 */
	private function importPageDirectory( $dir, $prefix = "" ) {
		echo "\n<!-- Checking page directory " . $this->xmlCommentSafe( $dir ) . " -->\n";
		$handle = opendir( $dir );
		if ( $handle === false ) {
			echo "<!-- Could not open directory '" . $this->xmlCommentSafe( $dir ) . "'. Skipping. -->\n";
			return;
		}
		# Compare against false explicitly: an entry named "0" is falsy and
		# would otherwise end the loop early.
		while ( ( $entry = readdir( $handle ) ) !== false ) {
			$m = array();
			if ( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
				echo $this->importPage( $prefix . $m[1] );
			} elseif ( is_dir( "$dir/$entry" ) ) {
				if ( $entry != '.' && $entry != '..' ) {
					$this->importPageDirectory( "$dir/$entry", "$entry/" );
				}
			} else {
				echo "<!-- File '" . $this->xmlCommentSafe( $entry ) . "' doesn't seem to contain an article. Skipping. -->\n";
			}
		}
		closedir( $handle );
	}

	/**
	 * Map a page title to its path below page/ or keep/, without extension:
	 * "<uppercased first letter>/<title>", or "other/<title>" when the title
	 * does not start with an ASCII letter.
	 *
	 * @param $title String
	 * @return String
	 */
	private function useModFilename( $title ) {
		$c = substr( $title, 0, 1 );
		if ( preg_match( '/[A-Z]/i', $c ) ) {
			return strtoupper( $c ) . "/$title";
		}
		return "other/$title";
	}

	/**
	 * Read the current revision of a page from its .db file.
	 * Terminates the script if the file does not exist.
	 *
	 * @param $title String
	 * @return Object Revision with text, summary, minor, ts, username, host
	 */
	private function fetchPage( $title ) {
		$fname = $this->rootDirectory . "/page/" . $this->useModFilename( $title ) . ".db";
		if ( !file_exists( $fname ) ) {
			echo "Couldn't open file '$fname' for page '$title'.\n";
			die( -1 );
		}

		# Three nested key/value records, each with its own separator.
		$page = $this->splitHash( $this->FS1, file_get_contents( $fname ) );
		$section = $this->splitHash( $this->FS2, $this->field( $page, "text_default" ) );
		$text = $this->splitHash( $this->FS3, $this->field( $section, "data" ) );

		return $this->makeRevision( $section, $text );
	}

	/**
	 * Read the old revisions of a page from its .kp file.
	 *
	 * @param $title String
	 * @return Array List of revision objects; empty if there is no .kp file
	 */
	private function fetchKeptPages( $title ) {
		$fname = $this->rootDirectory . "/keep/" . $this->useModFilename( $title ) . ".kp";
		if ( !file_exists( $fname ) ) {
			return array();
		}

		$keptlist = explode( $this->FS1, file_get_contents( $fname ) );
		array_shift( $keptlist ); # Drop the junk at beginning of file

		$revisions = array();
		foreach ( $keptlist as $rev ) {
			$section = $this->splitHash( $this->FS2, $rev );
			$text = $this->splitHash( $this->FS3, $this->field( $section, "data" ) );
			if ( $this->field( $text, "text" )
				&& $this->field( $text, "minor" ) != ""
				&& ( $this->field( $section, "ts" ) * 1 > 0 )
			) {
				$revisions[] = $this->makeRevision( $section, $text );
			} else {
				echo "<!-- skipped a bad old revision -->\n";
			}
		}
		return $revisions;
	}

	/**
	 * Build a revision object from a decoded section record and its decoded
	 * text record. Missing fields become null.
	 *
	 * @param $section Array Provides ts, username, host
	 * @param $text Array Provides text, summary, minor
	 * @return Object
	 */
	private function makeRevision( $section, $text ) {
		return $this->array2object( array(
			"text" => $this->field( $text, "text" ),
			"summary" => $this->field( $text, "summary" ),
			"minor" => $this->field( $text, "minor" ),
			"ts" => $this->field( $section, "ts" ),
			"username" => $this->field( $section, "username" ),
			"host" => $this->field( $section, "host" ),
		) );
	}

	/**
	 * Read an array element without raising a notice when it is missing;
	 * a notice printed to stdout would end up inside the XML stream.
	 *
	 * @param $arr Array
	 * @param $key String
	 * @return Mixed The element, or null if it is not set
	 */
	private function field( $arr, $key ) {
		return isset( $arr[$key] ) ? $arr[$key] : null;
	}

	/**
	 * Decode a flat "key SEP value SEP key SEP value ..." string into an
	 * associative array. A trailing key without a value is ignored.
	 *
	 * @param $sep String Separator
	 * @param $str String Encoded record
	 * @return Array
	 */
	private function splitHash( $sep, $str ) {
		$parts = explode( $sep, (string)$str );
		$total = count( $parts );
		$ret = array();
		# Step in pairs with explicit indexes rather than relying on the
		# evaluation order of "$ret[$parts[$i]] = $parts[++$i]".
		for ( $i = 0; $i + 1 < $total; $i += 2 ) {
			$ret[$parts[$i]] = $parts[$i + 1];
		}
		return $ret;
	}

	/**
	 * Resolve the contributor of a revision.
	 *
	 * @param $name String User name, may be empty
	 * @param $host String Host, used as the contributor name when $name is empty
	 * @return Array ( user id, display name ); the id is 0 for unknown users
	 */
	private function checkUserCache( $name, $host ) {
		if ( $name ) {
			# The cache is keyed by name. If we haven't imported user
			# accounts the lookup simply misses and the id stays 0.
			$userid = isset( $this->usercache[$name] ) ? $this->usercache[$name] : 0;
			$username = str_replace( '_', ' ', $name );
		} else {
			$userid = 0;
			$username = $host;
		}
		return array( $userid, $username );
	}

	/**
	 * Build the <page> element for one page, including all its revisions.
	 * Also prints a progress comment directly to stdout.
	 *
	 * @param $title String UseMod page title
	 * @return String XML fragment
	 */
	private function importPage( $title ) {
		echo "\n<!-- Importing page " . $this->xmlCommentSafe( $title ) . " -->\n";
		$page = $this->fetchPage( $title );

		$newtitle = $this->xmlsafe( str_replace( '_', ' ', $this->recodeText( $title ) ) );

		$munged = $this->mungeFormat( $page->text );
		if ( $munged != $page->text ) {
			/**
			 * Save a *new* revision with the conversion, and put the
			 * previous last version into the history.
			 */
			$next = $this->array2object( array(
				'text' => $munged,
				'minor' => 1,
				'username' => 'Conversion script',
				'host' => '127.0.0.1',
				'ts' => time(),
				'summary' => 'link fix',
			) );
			$revisions = array( $page, $next );
		} else {
			/**
			 * Current revision:
			 */
			$revisions = array( $page );
		}
		$xml = <<<XML
	<page>
		<title>$newtitle</title>

XML;

		# History. The list always holds at least the current revision.
		$revisions = array_merge( $revisions, $this->fetchKeptPages( $title ) );

		foreach ( $revisions as $rev ) {
			$text = $this->xmlsafe( $this->recodeText( $rev->text ) );
			$minor = ( $rev->minor ? '<minor/>' : '' );
			list( /* $userid */ , $username ) = $this->checkUserCache( $rev->username, $rev->host );
			$username = $this->xmlsafe( $this->recodeText( $username ) );
			$timestamp = $this->xmlsafe( $this->timestamp2ISO8601( $rev->ts ) );
			$comment = $this->xmlsafe( $this->recodeText( $rev->summary ) );

			$xml .= <<<XML
		<revision>
			<timestamp>$timestamp</timestamp>
			<contributor><username>$username</username></contributor>
			$minor
			<comment>$comment</comment>
			<text>$text</text>
		</revision>

XML;
		}
		$xml .= "</page>\n\n";
		return $xml;
	}

	/**
	 * Normalise line endings, convert from the source encoding to UTF-8 and
	 * expand numeric character references.
	 *
	 * @param $string String
	 * @return String
	 */
	private function recodeText( $string ) {
		# For currently latin-1 wikis
		$string = str_replace( "\r\n", "\n", (string)$string );
		# iconv() reports undecodable input with a notice and returns false.
		# The notice is suppressed so it cannot leak into the XML; on failure
		# the bytes are kept as they are instead of discarding the whole
		# text. Every caller passes the result through xmlsafe() afterwards.
		$converted = @iconv( $this->encoding, "UTF-8", $string );
		if ( $converted !== false ) {
			$string = $converted;
		}
		return $this->mungeToUtf8( $string ); # Any old &#1234; stuff
	}

	/**
	 * Replace decimal (&#1234;) and hexadecimal (&#x4d2;) character
	 * references by the UTF-8 sequence of the code point.
	 *
	 * @param $string String
	 * @return String
	 */
	private function mungeToUtf8( $string ) {
		$string = preg_replace_callback( '/&#([0-9]+);/',
			array( $this, 'decodeDecimalEntity' ), $string );
		$string = preg_replace_callback( '/&#x([0-9a-f]+);/i',
			array( $this, 'decodeHexEntity' ), $string );
		# Should also do named entities here
		return $string;
	}

	/**
	 * preg_replace_callback() handler for decimal character references.
	 *
	 * @param $matches Array Match data; [0] is the entity, [1] its digits
	 * @return String
	 */
	public function decodeDecimalEntity( $matches ) {
		return $this->entityToUtf8( $matches[0], $matches[1], 10 );
	}

	/**
	 * preg_replace_callback() handler for hexadecimal character references.
	 *
	 * @param $matches Array Match data; [0] is the entity, [1] its digits
	 * @return String
	 */
	public function decodeHexEntity( $matches ) {
		return $this->entityToUtf8( $matches[0], $matches[1], 16 );
	}

	/**
	 * Convert the digits of a character reference to UTF-8.
	 *
	 * @param $entity String The complete reference, returned as is when the
	 *   number is far too large to be a code point
	 * @param $digits String Digits of the reference
	 * @param $base int 10 or 16
	 * @return String
	 */
	private function entityToUtf8( $entity, $digits, $base ) {
		# Leading zeros carry no meaning and must not trigger octal parsing.
		$digits = ltrim( $digits, '0' );
		# 0x10FFFF has 7 decimal and 6 hex digits; anything longer cannot be
		# a code point, and bailing out here also avoids integer overflow.
		if ( strlen( $digits ) > 7 ) {
			return $entity;
		}
		return wfUtf8Sequence( intval( $digits, $base ) );
	}

	/**
	 * Format a Unix timestamp as an ISO 8601 UTC date.
	 *
	 * @param $ts Mixed Unix timestamp; non-numeric or missing values count as 0
	 * @return String
	 */
	private function timestamp2ISO8601( $ts ) {
		# 2003-08-05T18:30:02Z
		return gmdate( 'Y-m-d\TH:i:s\Z', (int)$ts );
	}

	/**
	 * The page may contain old data which has not been properly normalized.
	 * Invalid UTF-8 sequences or forbidden control characters will make our
	 * XML output invalid, so be sure to strip them out.
	 * @param String $string Text to clean up
	 * @return String
	 */
	private function xmlsafe( $string ) {
		$string = UtfNormal::cleanUp( $string );
		$string = htmlspecialchars( $string );
		return $string;
	}

	/**
	 * Recode and escape text for use inside an XML comment, where "--" is
	 * not allowed.
	 *
	 * @param $text String
	 * @return String
	 */
	private function xmlCommentSafe( $text ) {
		return str_replace( '--', '\\-\\-', $this->xmlsafe( $this->recodeText( $text ) ) );
	}

	/**
	 * Turn an associative array into an object with one property per key.
	 *
	 * @param $arr Array
	 * @return Object
	 */
	private function array2object( $arr ) {
		# Start from an empty object; "(object)0" would carry an extra
		# "scalar" property.
		$o = new stdClass;
		foreach ( $arr as $x => $y ) {
			$o->$x = $y;
		}
		return $o;
	}

	/**
	 * Make CamelCase and /Talk links work
	 *
	 * <nowiki> sections, URLs and existing [[links]] are swapped for a
	 * placeholder first so the link conversion cannot touch them, and are
	 * put back afterwards in the same order.
	 *
	 * @param $text String Text in the source encoding
	 * @return String
	 */
	private function mungeFormat( $text ) {
		$this->nowiki = array();
		$staged = preg_replace_callback(
			'/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s',
			array( $this, 'nowikiPlaceholder' ), $text );

		# This is probably not 100% correct, I'm just
		# glancing at the UseModWiki code.
		$upper = "[A-Z]";
		$lower = "[a-z_0-9]";
		$any = "[A-Za-z_0-9]";
		$camel = "(?:$upper+$lower+$upper+$any*)";
		$subpage = "(?:\\/$any+)";
		$substart = "(?:\\/$upper$any*)";

		$munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/",
			'[[$1]]', $staged );

		# The replacement is a callback, so this has to be
		# preg_replace_callback(); preg_replace() rejects it.
		$final = preg_replace_callback( '/' . preg_quote( $this->placeholder(), '/' ) . '/s',
			array( $this, 'nowikiShift' ), $munged );
		return $final;
	}

	/**
	 * Marker that stands in for a protected span during mungeFormat().
	 * Double-quoted so that it is wrapped in real 0xFF bytes.
	 *
	 * @param $x Mixed Unused
	 * @return String
	 */
	private function placeholder( $x = null ) {
		return "\xffplaceholder\xff";
	}

	/**
	 * preg_replace_callback() handler: remember the matched span and return
	 * the placeholder in its place.
	 *
	 * @param $matches Array
	 * @return String
	 */
	public function nowikiPlaceholder( $matches ) {
		$this->nowiki[] = $matches[1];
		return $this->placeholder();
	}

	/**
	 * preg_replace_callback() handler: return the oldest remembered span.
	 *
	 * @param $matches Array Unused
	 * @return String
	 */
	public function nowikiShift( $matches = null ) {
		return array_shift( $this->nowiki );
	}
}
350
/**
 * Encode a Unicode code point as a UTF-8 byte sequence.
 *
 * @param $codepoint int Code point number
 * @return String One to four bytes for code points up to 0x10FFFF; for
 *   larger numbers the decimal character reference "&#N;" is returned
 *   instead, since such values cannot be encoded.
 */
function wfUtf8Sequence( $codepoint ) {
	if ( $codepoint < 0x80 ) {
		return chr( $codepoint );
	}
	if ( $codepoint < 0x800 ) {
		return chr( $codepoint >> 6 & 0x3f | 0xc0 ) .
			chr( $codepoint & 0x3f | 0x80 );
	}
	if ( $codepoint < 0x10000 ) {
		return chr( $codepoint >> 12 & 0x0f | 0xe0 ) .
			chr( $codepoint >> 6 & 0x3f | 0x80 ) .
			chr( $codepoint & 0x3f | 0x80 );
	}
	# Four-byte form covers everything up to and including U+10FFFF.
	if ( $codepoint < 0x110000 ) {
		return chr( $codepoint >> 18 & 0x07 | 0xf0 ) .
			chr( $codepoint >> 12 & 0x3f | 0x80 ) .
			chr( $codepoint >> 6 & 0x3f | 0x80 ) .
			chr( $codepoint & 0x3f | 0x80 );
	}
	# Beyond the Unicode range: leave it as a character reference
	return "&#$codepoint;";
}
373
# Name of the maintenance class defined in this file.
# NOTE(review): presumably picked up by the file included on the next line,
# which would then instantiate and run it — confirm in Maintenance.php.
$maintClass = 'ImportUseModWiki';
require_once( RUN_MAINTENANCE_IF_MAIN );