--redirects option
[lhc/web/wiklou.git] / maintenance / dumpHTML.inc
1 <?php
2 /**
3 * @package MediaWiki
4 * @subpackage Maintenance
5 */
6
7 define( 'REPORTING_INTERVAL', 10 );
8
9 require_once( 'includes/ImagePage.php' );
10 require_once( 'includes/CategoryPage.php' );
11
12 class DumpHTML {
13 # Destination directory
14 var $dest;
15
16 # Show interlanguage links?
17 var $interwiki = true;
18
19 # Depth of HTML directory tree
20 var $depth = 3;
21
22 # Directory that commons images are copied into
23 var $sharedStaticPath;
24
25 # Relative path to image directory
26 var $imageRel = 'upload';
27
28 # Copy commons images instead of symlinking
29 var $forceCopy = false;
30
31 # Make links assuming the script path is in the same directory as
32 # the destination
33 var $alternateScriptPath = false;
34
/**
 * Constructor: copies every entry of $settings onto the property of the
 * same name, overriding the defaults declared on the class.
 *
 * @param array $settings Map of property name => value
 */
function DumpHTML( $settings ) {
	foreach ( array_keys( $settings ) as $name ) {
		$this->$name = $settings[$name];
	}
}
40
/**
 * Write every page whose page_id lies between $start and $end, inclusive.
 * Pages in the category namespace are skipped here.
 *
 * @param int $start First page_id to process
 * @param int|false $end Last page_id to process; false means max(page_id)
 *   as read from the page table
 */
function doArticles( $start, $end = false ) {
	$fname = 'DumpHTML::doArticles';

	$this->setupGlobals();

	if ( $end === false ) {
		$dbr =& wfGetDB( DB_SLAVE );
		$end = $dbr->selectField( 'page', 'max(page_id)', false, $fname );
	}

	# A progress line is started afresh every ten reports
	$newlineInterval = REPORTING_INTERVAL * 10;

	$id = $start;
	while ( $id <= $end ) {
		wfWaitForSlaves( 20 );
		if ( $id % REPORTING_INTERVAL == 0 ) {
			print "Processing ID: $id\r";
		}
		if ( $id % $newlineInterval == 0 ) {
			print "\n";
		}

		# IDs with no page behind them yield no title and are passed over
		$title = Title::newFromID( $id );
		if ( $title && $title->getNamespace() != NS_CATEGORY ) {
			$this->doArticle( $title );
		}
		$id++;
	}
	print "\n";
}
74
/**
 * Write the main page, followed by the Special:Categories page.
 */
function doSpecials() {
	$this->doMainPage();

	# doMainPage() sets the globals up for depth 0, so set them up again
	# with the default depth before rendering anything else
	$this->setupGlobals();

	$categoriesTitle = Title::makeTitle( NS_SPECIAL, 'Categories' );
	print "Special:Categories...";
	$this->doArticle( $categoriesTitle );
	print "\n";
}
83
/**
 * Write the main page as index.html in the destination directory.
 *
 * @return bool True on success, false if the page could not be rendered or
 *   index.html could not be opened for writing
 */
function doMainPage() {
	global $wgMakeDumpLinks;

	print "Making index.html ";

	// Set up globals with no ../../.. in the link URLs
	$this->setupGlobals( 0 );

	// But still use that directory style. This has to be the configured
	// depth rather than a fixed number, otherwise the links on the main page
	// point at a different directory layout than the one the articles are
	// written with.
	$wgMakeDumpLinks = $this->depth;

	$title = Title::newMainPage();
	$text = $this->getArticleHTML( $title );
	if ( $text === false ) {
		// getArticleHTML() returns false when it is given no title
		print "\nCan't render the main page\n";
		return false;
	}

	$file = fopen( "{$this->dest}/index.html", "w" );
	if ( !$file ) {
		print "\nCan't open index.html for writing\n";
		return false;
	}
	fwrite( $file, $text );
	fclose( $file );
	print "\n";
	return true;
}
107
/**
 * Write image description pages: first for local images that have no
 * description article of their own, then for every commons image found in
 * the shared static directory.
 */
function doImageDescriptions() {
	$fname = 'DumpHTML::doImageDescriptions';

	# Use the configured tree depth, so that these pages are written with the
	# same directory layout as every other page
	$this->setupGlobals();

	/**
	 * Dump image description pages that don't have an associated article, but do
	 * have a local image
	 */
	$dbr =& wfGetDB( DB_SLAVE );
	$res = $dbr->select( 'image', array( 'img_name' ), false, $fname );

	$i = 0;
	print "Writing image description pages for local images\n";
	$num = $dbr->numRows( $res );
	while ( $row = $dbr->fetchObject( $res ) ) {
		wfWaitForSlaves( 10 );
		if ( !( ++$i % REPORTING_INTERVAL ) ) {
			print "Done $i of $num\r";
		}
		$title = Title::makeTitle( NS_IMAGE, $row->img_name );
		if ( $title->getArticleID() ) {
			// Already done by dumpHTML
			continue;
		}
		$this->doArticle( $title );
	}
	print "\n";

	/**
	 * Dump images which only have a real description page on commons
	 */
	print "Writing description pages for commons images\n";
	$i = 0;
	for ( $hash = 0; $hash < 256; $hash++ ) {
		$dir = sprintf( "%01x/%02x", intval( $hash / 16 ), $hash );
		$patterns = array(
			"{$this->sharedStaticPath}/$dir/*",
			"{$this->sharedStaticPath}/thumb/$dir/*"
		);

		# copyImages() stores the full-size image next to every thumbnail, so
		# the same name normally turns up under both patterns; render it once
		$done = array();
		foreach ( $patterns as $pattern ) {
			$paths = glob( $pattern );
			if ( !is_array( $paths ) ) {
				# glob() returns false on error
				continue;
			}

			foreach ( $paths as $path ) {
				$file = basename( $path );
				if ( isset( $done[$file] ) ) {
					continue;
				}
				$done[$file] = true;

				if ( !( ++$i % REPORTING_INTERVAL ) ) {
					print "$i\r";
				}

				$title = Title::makeTitle( NS_IMAGE, $file );
				$this->doArticle( $title );
			}
		}
	}
	print "\n";
}
162
/**
 * Write a category page for every distinct category named in the
 * categorylinks table.
 */
function doCategories() {
	$fname = 'DumpHTML::doCategories';
	$this->setupGlobals();

	$dbr =& wfGetDB( DB_SLAVE );
	$categorylinks = $dbr->tableName( 'categorylinks' );
	print "Selecting categories...";
	# Use the name returned by tableName() rather than the bare literal, so
	# that the query also works on wikis with a table prefix
	$sql = "SELECT DISTINCT cl_to FROM $categorylinks";
	$res = $dbr->query( $sql, $fname );

	print "\nWriting " . $dbr->numRows( $res ). " category pages\n";
	$i = 0;
	while ( $row = $dbr->fetchObject( $res ) ) {
		wfWaitForSlaves( 10 );
		if ( !( ++$i % REPORTING_INTERVAL ) ) {
			print "$i\r";
		}
		$title = Title::makeTitle( NS_CATEGORY, $row->cl_to );
		$this->doArticle( $title );
	}
	print "\n";
}
185
/**
 * Write a page for every row of the page table that is flagged as a
 * redirect.
 */
function doRedirects() {
	print "Doing redirects...\n";
	$fname = 'DumpHTML::doRedirects';
	$this->setupGlobals();
	$dbr =& wfGetDB( DB_SLAVE );

	$res = $dbr->select( 'page', array( 'page_namespace', 'page_title' ),
		array( 'page_is_redirect' => 1 ), $fname );
	$num = $dbr->numRows( $res );
	print "$num redirects to do...\n";

	$i = 0;
	while ( $row = $dbr->fetchObject( $res ) ) {
		$title = Title::makeTitle( $row->page_namespace, $row->page_title );
		if ( !( ++$i % ( REPORTING_INTERVAL * 10 ) ) ) {
			print "Done $i of $num\n";
		}
		$this->doArticle( $title );
	}
}
207
/**
 * Write an article specified by title: render it, copy the images it
 * refers to, and save the HTML under the destination directory.
 * Does nothing when the page cannot be rendered.
 *
 * @param Title $title
 */
function doArticle( $title ) {
	$text = $this->getArticleHTML( $title );
	if ( $text === false ) {
		return;
	}

	# Parse the XHTML to find the images, and copy them
	$this->copyImages( $this->findImages( $text ) );

	# Write to file
	$this->writeArticle( $title, $text );
}
226
/**
 * Save $text under the destination directory, at the hashed filename of
 * $title. Missing parent directories are created first. If the file cannot
 * be opened a message is printed and nothing is written.
 *
 * @param Title $title
 * @param string $text
 */
function writeArticle( &$title, $text ) {
	$fullName = "{$this->dest}/" . $title->getHashedFilename();

	# Make sure the directory we are about to write into exists
	wfMkdirParents( dirname( $fullName ), 0755 );

	$file = fopen( $fullName, 'w' );
	if ( $file ) {
		fwrite( $file, $text );
		fclose( $file );
	} else {
		print("Can't open file $fullName for writing\n");
	}
}
244
/**
 * Set up globals required for parsing.
 *
 * Rewrites the path globals so that generated links are relative to a page
 * sitting $depth directories below the destination directory, and installs
 * a fresh user object using the 'htmldump' skin. May be called repeatedly.
 *
 * @param int|null $depth Directory depth to generate links for; NULL means
 *   the configured $this->depth
 */
function setupGlobals( $depth = NULL ) {
	global $wgUser, $wgMakeDumpLinks, $wgStylePath, $wgArticlePath;
	global $wgUploadPath, $wgLogo, $wgMaxCredits, $wgSharedUploadPath;
	global $wgHideInterlanguageLinks, $wgUploadDirectory, $wgThumbnailScriptPath;
	global $wgSharedThumbnailScriptPath, $wgEnableParserCache, $wgMathPath;

	# Values as configured before the first call. $wgLogo and $wgUploadPath
	# are both overwritten below, so the originals are needed to redo the
	# logo rewrite correctly on every later call.
	static $saved = false, $oldLogo = NULL, $oldUploadPath = NULL;

	if ( is_null( $depth ) ) {
		$wgMakeDumpLinks = $this->depth;
	} else {
		$wgMakeDumpLinks = $depth;
	}

	# NOTE(review): the original assigned this to an undeclared, and therefore
	# local, $wgScriptPath. It is kept local here so the global is left
	# untouched as before -- confirm whether the global was meant to change.
	if ( $this->alternateScriptPath ) {
		if ( $wgMakeDumpLinks == 0 ) {
			$scriptPath = '.';
		} else {
			$scriptPath = '..' . str_repeat( '/..', $wgMakeDumpLinks - 1 );
		}
	} else {
		$scriptPath = '..' . str_repeat( '/..', $wgMakeDumpLinks );
	}

	$wgArticlePath = str_repeat( '../', $wgMakeDumpLinks ) . '$1';

	# Logo image
	# Allow for repeated setup
	if ( $saved ) {
		$wgLogo = $oldLogo;
	} else {
		$oldLogo = $wgLogo;
		$oldUploadPath = $wgUploadPath;
		$saved = true;
	}

	# Compare against the upload path as originally configured; by the second
	# call $wgUploadPath already holds the rewritten, relative path.
	if ( $oldUploadPath != '' && strpos( $wgLogo, $oldUploadPath ) === 0 ) {
		# If it's in the upload directory, rewrite it to the new upload directory
		$wgLogo = "$scriptPath/{$this->imageRel}/" . substr( $wgLogo, strlen( $oldUploadPath ) + 1 );
	} elseif ( substr( $wgLogo, 0, 1 ) == '/' ) {
		# This is basically heuristic
		# Rewrite an absolute logo path to one relative to the script path
		$wgLogo = $scriptPath . $wgLogo;
	}

	$wgStylePath = "$scriptPath/skins";
	$wgUploadPath = "$scriptPath/{$this->imageRel}";
	$wgSharedUploadPath = "$wgUploadPath/shared";
	$wgMaxCredits = -1;
	$wgHideInterlanguageLinks = !$this->interwiki;
	$wgThumbnailScriptPath = $wgSharedThumbnailScriptPath = false;
	$wgEnableParserCache = false;
	$wgMathPath = "$scriptPath/math";

	$wgUser = new User;
	$wgUser->setOption( 'skin', 'htmldump' );
	$wgUser->setOption( 'editsection', 0 );

	$this->sharedStaticPath = "$wgUploadDirectory/shared";
}
305
/**
 * Reads the content of a title object, executes the skin and captures the result.
 *
 * @param Title|null $title
 * @return string|false The generated HTML, or the stub page from
 *   getRedirect() when the page is a redirect; false when $title is null
 */
function getArticleHTML( &$title ) {
	global $wgOut, $wgTitle, $wgArticle, $wgUser, $wgUseCategoryMagic, $wgLinkCache;

	$wgTitle = $title;
	if ( is_null( $wgTitle ) ) {
		return false;
	}

	$ns = $wgTitle->getNamespace();
	if ( $ns == NS_SPECIAL ) {
		# Start from an empty output object. Without this the special page
		# is rendered into whatever $wgOut the previous page left behind.
		$wgOut = new OutputPage;
		$wgOut->setParserOptions( new ParserOptions );

		SpecialPage::executePath( $wgTitle );
	} else {
		if ( $ns == NS_IMAGE ) {
			$wgArticle = new ImagePage( $wgTitle );
		} elseif ( $wgUseCategoryMagic && $ns == NS_CATEGORY ) {
			$wgArticle = new CategoryPage( $wgTitle );
		} else {
			$wgArticle = new Article( $wgTitle );
		}

		# Redirects get a small stub page instead of a rendered article
		$rt = Title::newFromRedirect( $wgArticle->fetchContent() );
		if ( is_object( $rt ) ) {
			return $this->getRedirect( $rt );
		}

		$wgOut = new OutputPage;
		$wgOut->setParserOptions( new ParserOptions );
		$wgLinkCache = new LinkCache;

		$wgArticle->view();
	}

	# Run the skin with output buffering on, to capture the page as a string
	$sk =& $wgUser->getSkin();
	ob_start();
	$sk->outputPage( $wgOut );
	$text = ob_get_contents();
	ob_end_clean();

	return $text;
}
346
/**
 * Build a minimal XHTML page that sends the browser on to a redirect target
 * with a meta refresh, and also offers a plain link to it.
 *
 * @param Title $rt The redirect target
 * @return string XHTML document
 */
function getRedirect( $rt ) {
	$url = $rt->escapeLocalURL();
	# The title is placed in element content, so characters such as & and <
	# have to be escaped to keep the document well-formed
	$text = htmlspecialchars( $rt->getPrefixedText() );
	return <<<ENDTEXT
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="Refresh" content="0;url=$url" />
</head>
<body>
<p>Redirecting to <a href="$url">$text</a></p>
</body>
</html>
ENDTEXT;
}
363
/**
 * Returns image paths used in an XHTML document.
 *
 * The document is run through an XML parser whose start-tag callback
 * collects the paths in the global $wgDumpImages.
 *
 * @param string $text XHTML document
 * @return array Collected image paths, stored as array keys
 */
function findImages( $text ) {
	global $wgOutputEncoding, $wgDumpImages;

	# Empty the collection the callback writes into
	$wgDumpImages = array();

	$xml = xml_parser_create( $wgOutputEncoding );
	xml_set_element_handler( $xml, 'wfDumpStartTagHandler', 'wfDumpEndTagHandler' );
	xml_parse( $xml, $text );
	xml_parser_free( $xml );

	return $wgDumpImages;
}
376
/**
 * Copy images (or create symlinks) from commons to a static directory.
 * This is necessary even if you intend to distribute all of commons, because
 * the directory contents is used to work out which image description pages
 * are needed.
 *
 * Also copies math images
 *
 * @param array $images Image paths as found in the page, stored as array
 *   keys (URL-encoded); the values are ignored
 */
function copyImages( $images ) {
	global $wgSharedUploadPath, $wgSharedUploadDirectory, $wgMathPath, $wgMathDirectory;

	# Find shared uploads and copy them into the static directory
	$sharedPathLength = strlen( $wgSharedUploadPath );
	$mathPathLength = strlen( $wgMathPath );
	foreach ( $images as $escapedImage => $dummy ) {
		$image = urldecode( $escapedImage );

		# An empty prefix would match every image, hence the length checks
		if ( $sharedPathLength && substr( $image, 0, $sharedPathLength ) === $wgSharedUploadPath ) {
			# Is it shared? Reconstruct full filename
			$rel = substr( $image, $sharedPathLength + 1 ); // +1 for slash
			$this->copySharedFile( "$wgSharedUploadDirectory/$rel", "{$this->sharedStaticPath}/$rel" );

			if ( substr( $rel, 0, 6 ) == 'thumb/' ) {
				# That was a thumbnail
				# We will also copy the real image, which lives at the same
				# path without the leading thumb/ and the trailing file name
				$parts = explode( '/', $rel );
				if ( count( $parts ) >= 4 ) {
					$rel = "{$parts[1]}/{$parts[2]}/{$parts[3]}";
					$this->copySharedFile( "$wgSharedUploadDirectory/$rel", "{$this->sharedStaticPath}/$rel" );
				}
			}
		} elseif ( $mathPathLength && substr( $image, 0, $mathPathLength ) === $wgMathPath ) {
			# Is it math?
			$rel = substr( $image, $mathPathLength + 1 ); // +1 for slash
			$source = "$wgMathDirectory/$rel";
			$dest = "{$this->dest}/math/$rel";
			if ( !file_exists( $dest ) ) {
				wfMkdirParents( dirname( $dest ), 0755 );
				copy( $source, $dest );
			}
		}
	}
}

/**
 * Make $staticLoc available in the static tree: as a symlink to $sourceLoc
 * where symlinks are available and copying is not forced, otherwise as a
 * copy. Does nothing if $staticLoc already exists.
 *
 * @param string $sourceLoc Path of the file in the shared upload directory
 * @param string $staticLoc Path the file should have in the static directory
 */
function copySharedFile( $sourceLoc, $staticLoc ) {
	if ( file_exists( $staticLoc ) ) {
		return;
	}

	wfMkdirParents( dirname( $staticLoc ), 0755 );
	if ( function_exists( 'symlink' ) && !$this->forceCopy ) {
		symlink( $sourceLoc, $staticLoc );
	} else {
		copy( $sourceLoc, $staticLoc );
	}
}
441 }
442
/**
 * XML parser start-element callback.
 * Remembers the SRC attribute of every IMG element as a key of the global
 * $wgDumpImages; all other elements are ignored.
 */
function wfDumpStartTagHandler( $parser, $name, $attribs ) {
	global $wgDumpImages;

	if ( $name != 'IMG' || !isset( $attribs['SRC'] ) ) {
		return;
	}

	$src = $attribs['SRC'];
	$wgDumpImages[$src] = true;
}
451
/**
 * XML parser end-element callback.
 * Nothing needs to happen when an element closes, so the body is empty.
 */
function wfDumpEndTagHandler( $parser, $name ) {
}
454
455 # vim: syn=php
456 ?>