From fa093edfc82b7c979a426a7a27bcc6b18512da04 Mon Sep 17 00:00:00 2001
From: Tim Starling
Date: Sat, 28 May 2005 07:05:28 +0000
Subject: [PATCH] Many changes. Work in progress, do not use.

---
 maintenance/dumpHTML.inc | 378 +++++++++++++++++++++++++++++++--------
 maintenance/dumpHTML.php |  41 ++++-
 2 files changed, 339 insertions(+), 80 deletions(-)

diff --git a/maintenance/dumpHTML.inc b/maintenance/dumpHTML.inc
index caf2763a35..7abdb1b9de 100644
--- a/maintenance/dumpHTML.inc
+++ b/maintenance/dumpHTML.inc
@@ -6,94 +6,177 @@
 
 define( 'REPORTING_INTERVAL', 10 );
 
-function dumpHTML( $dest, $start ) {
-	global $wgUser, $wgTitle, $wgArticle, $wgEnablePersistentLC, $wgLinkCache, $wgOut;
-	global $wgMakeDumpLinks, $wgStylePath, $wgArticlePath, $wgUploadPath, $wgLogo;
-	$wgMakeDumpLinks = true;
-	$wgScriptPath = "../../..";
-	$wgStylePath = "$wgScriptPath/skins";
-	$wgUploadPath = "$wgScriptPath/images";
-	$wgLogo = "$wgStylePath/common/images/wiki.png";
-	$wgArticlePath = '../../$1';
-	$dbr =& wfGetDB( DB_SLAVE );
-	$end = $dbr->selectField( 'page', 'max(page_id)', false );
-
-	/*global $wgValidSkinNames;
-	var_dump( $wgValidSkinNames );
-	exit;*/
-
-	print("Creating static HTML dump. Starting from page_id $start of $end.\n");
-
-	$wgUser = new User;
-	$wgUser->setOption( 'skin', 'htmldump' );
-	$sk =& $wgUser->getSkin();
-
-	if ( !is_dir( $dest ) ) {
-		if ( !mkdir( $dest, 0755 ) ) {
-			print("Can't make directory $dir, exiting\n");
-			return;
-		}
+require_once( 'includes/ImagePage.php' );
+require_once( 'includes/CategoryPage.php' );
+
+class DumpHTML {
+	var $dest, $interwiki, $depth, $sharedStaticPath;
+
+	function DumpHTML( $dest, $interwiki = true, $depth = 3 ) { // PHP4-style constructor: just stores the dump settings
+		$this->dest = $dest;
+		$this->interwiki = $interwiki;
+		$this->depth = $depth;
 	}
-
-	for ($id = $start; $id <= $end; $id++) {
-		if ( !($id % REPORTING_INTERVAL) ) {
-			print("$id\n");
-		}
+
+	/**
+	 * Write a set of articles specified by start and end page_id (inclusive; $end = false means max(page_id))
+	 * Skip categories and images, they will be done separately
+	 */
+	function doArticles( $start, $end = false ) {
+		$fname = 'DumpHTML::doArticles';
 
-		$wgOut = new OutputPage;
-		$wgOut->setArticleFlag( true );
-		$wgOut->setRobotpolicy( 'index,follow' );
+		$this->setupGlobals();
 
-		$wgTitle = Title::newFromID( $id );
-		if ( is_null( $wgTitle ) ) {
-			continue;
+		if ( $end === false ) {
+			$dbr =& wfGetDB( DB_SLAVE );
+			$end = $dbr->selectField( 'page', 'max(page_id)', false, $fname );
 		}
 
-		$wgArticle = new Article( $wgTitle );
-		$text = $wgArticle->getContent( true );
-		$wgLinkCache = new LinkCache;
-		$wgLinkCache->forUpdate( true );
 
-		global $wgLinkHolders;
-		$wgLinkHolders = array(
-			'namespaces' => array(),
-			'dbkeys' => array(),
-			'queries' => array(),
-			'texts' => array(),
-			'titles' => array()
-		);
-
-
-		# Parse the text and replace links with placeholders
-		$wgOut->setPageTitle( $wgTitle->getPrefixedText() );
-		$wgOut->addWikiText( $text );
-		$wgOut->transformBuffer();
+		for ($id = $start; $id <= $end; $id++) {
+			if ( !($id % REPORTING_INTERVAL) ) {
+				print("$id\n");
+			}
+			$title = Title::newFromID( $id );
+			if ( $title ) {
+				$ns = $title->getNamespace() ;
+				if ( $ns != NS_CATEGORY && $ns != NS_IMAGE ) {
+					$this->doArticle( $title );
+				}
+			}
+		}
+	}
+
+	function doSpecials() { // Dump the main page, then Special:Categories
+		$this->doMainPage();
+
+		$this->setupGlobals();
+		print "Special:Categories...";
+		$this->doArticle( Title::makeTitle( NS_SPECIAL, 'Categories' ) );
+		print "\n";
+	}
+
+	/** Write the main page as index.html; returns false if the file cannot be opened */
+	function doMainPage() {
+		global $wgMakeDumpLinks;
+
+		print "Making index.html ";
+
+		// Set up globals with no ../../.. in the link URLs
+		$this->setupGlobals( 0 );
+
+		// But still use that directory style
+		$wgMakeDumpLinks = 3;
 
-		# Execute skin to get complete HTML
-		ob_start();
-		$sk->outputPage( $wgOut );
-		$text = ob_get_contents();
-		ob_end_clean();
+		$title = Title::newMainPage();
+		$text = $this->getArticleHTML( $title );
+		$file = fopen( "{$this->dest}/index.html", "w" );
+		if ( !$file ) {
+			print "\nCan't open index.html for writing\n";
+			return false;
+		}
+		fwrite( $file, $text );
+		fclose( $file );
+		print "\n";
+	}
+
+	function doImageDescriptions() { // Dump image description pages: local images first, then shared (commons) ones
+		global $wgSharedUploadDirectory;
 
-		# Write to file
-		$fname = $wgTitle->getHashedFilename();
-		$bits = explode( '/', $fname );
-		$parentDir = "$dest/{$bits[0]}";
-		$fullDir = "$dest/{$bits[0]}/{$bits[1]}";
-		$fullName = "$dest/$fname";
-
-		if ( !is_dir( $parentDir ) ) {
-			if ( !mkdir( $parentDir, 0744 ) ) {
-				print("Can't write to directory $parentDir\n");
-				return;
+		$fname = 'DumpHTML::doImageDescriptions';
+
+		$this->setupGlobals( 3 );
+
+		/**
+		 * Dump image description pages that don't have an associated article, but do
+		 * have a local image
+		 */
+		$dbr =& wfGetDB( DB_SLAVE );
+		extract( $dbr->tableNames( 'image', 'page' ) );
+		$res = $dbr->select( 'image', array( 'img_name' ), false, $fname );
+
+		$i = 0;
+		print "Writing " . $dbr->numRows( $res ) . " image description pages for local images\n";
+		while ( $row = $dbr->fetchObject( $res ) ) {
+			if ( !( ++$i % REPORTING_INTERVAL ) ) {
+				print "$i\t{$row->img_name}\n";
+			}
+			$title = Title::makeTitle( NS_IMAGE, $row->img_name );
+			if ( $title->getArticleID() ) {
+				// Already done by dumpHTML
+				continue;
+			}
+			$this->doArticle( $title );
+		}
+		/**
+		 * Dump images which only have a real description page on commons
+		 */
+		print "Writing description pages for commons images\n";
+		$i = 0;
+		for ( $hash = 0; $hash < 256; $hash++ ) {
+			$dir = sprintf( "%01x/%02x", intval( $hash / 16 ), $hash );
+			$paths = glob( "{$this->sharedStaticPath}/$dir/*" );
+			$thumbs = glob( "{$this->sharedStaticPath}/thumb/$dir/*" );
+			$paths = array_merge( $paths ? $paths : array(), $thumbs ? $thumbs : array() ); // not +=: array union keeps only keys absent from $paths; glob() may also return false
+			foreach ( $paths as $path ) {
+				$file = basename( $path );
+				if ( !(++$i % REPORTING_INTERVAL ) ) {
+					print "$i\t$file\n";
+				}
+
+				$title = Title::makeTitle( NS_IMAGE, $file );
+				$this->doArticle( $title );
 			}
 		}
-		if ( !is_dir( $fullDir ) ) {
-			if ( !mkdir( $fullDir, 0744 ) ) {
-				print("Can't write to directory $fullDir\n");
-				return;
+	}
+
+	function doCategories() { // Dump one category page per distinct categorylinks.cl_to value
+		$fname = 'DumpHTML::doCategories';
+		$this->setupGlobals();
+
+		$dbr =& wfGetDB( DB_SLAVE );
+		$categorylinks = $dbr->tableName( 'categorylinks' );
+		print "Selecting categories...";
+		$sql = "SELECT DISTINCT cl_to FROM $categorylinks"; // use the name returned by tableName(), not a hard-coded one
+		$res = $dbr->query( $sql, $fname );
+
+		print "\nWriting " . $dbr->numRows( $res ). " category pages\n";
+		$i = 0;
+		while ( $row = $dbr->fetchObject( $res ) ) {
+			if ( !(++$i % REPORTING_INTERVAL ) ) {
+				print "$i\t{$row->cl_to}\n";
 			}
+			$title = Title::makeTitle( NS_CATEGORY, $row->cl_to );
+			$this->doArticle( $title );
+		}
+	}
+
+
+	/** Write an article specified by title: render it, copy the shared images it uses, save the HTML */
+	function doArticle( $title ) {
+		global $wgTitle, $wgSharedUploadPath, $wgSharedUploadDirectory;
+		global $wgUploadDirectory;
+
+		$text = $this->getArticleHTML( $title );
+		if ( $text === false ) {
+			return;
 		}
+
+		# Parse the XHTML to find the images
+		$images = $this->findImages( $text );
+		$this->copyImages( $images );
+
+		# Write to file
+		$this->writeArticle( $title, $text );
+	}
+
+	/** Write the given text to the file identified by the given title object */
+	function writeArticle( &$title, $text ) {
+		$filename = $title->getHashedFilename();
+		$fullName = "{$this->dest}/$filename";
+		$fullDir = dirname( $fullName );
+
+		wfMkdirParents( $fullDir, 0755 );
 
 		$file = fopen( $fullName, 'w' );
 		if ( !$file ) {
@@ -104,7 +187,148 @@ function dumpHTML( $dest, $start ) {
 		fwrite( $file, $text );
 		fclose( $file );
 	}
-}
+
+	/** Set up globals required for parsing; $depth overrides $this->depth as the link depth when not NULL */
+	function setupGlobals( $depth = NULL ) {
+		global $wgUser, $wgTitle, $wgMakeDumpLinks, $wgStylePath, $wgArticlePath;
+		global $wgUploadPath, $wgLogo, $wgMaxCredits, $wgSharedUploadPath;
+		global $wgHideInterlanguageLinks, $wgUploadDirectory, $wgThumbnailScriptPath;
+		global $wgSharedThumbnailScriptPath, $wgEnableParserCache;
+
+		if ( is_null( $depth ) ) {
+			$wgMakeDumpLinks = $this->depth;
+		} else {
+			$wgMakeDumpLinks = $depth;
+		}
+
+		$wgScriptPath = '..' . str_repeat( '/..', $wgMakeDumpLinks ); // local variable only: $wgScriptPath is not in the global lists above
+		$wgArticlePath = str_repeat( '../', $wgMakeDumpLinks ) . '$1';
+		$wgStylePath = "$wgScriptPath/skins";
+		$wgUploadPath = "$wgScriptPath/images";
+		$wgSharedUploadPath = "$wgUploadPath/shared";
+		$wgLogo = "$wgStylePath/common/images/wiki.png";
+		$wgMaxCredits = -1;
+		$wgHideInterlanguageLinks = !$this->interwiki; // was misspelled, so the declared global was never set
+		$wgThumbnailScriptPath = $wgSharedThumbnailScriptPath = false;
+		$wgEnableParserCache = false;
+
+		$wgUser = new User;
+		$wgUser->setOption( 'skin', 'htmldump' );
+		$wgUser->setOption( 'editsection', 0 );
+
+		$this->sharedStaticPath = "$wgUploadDirectory/shared";
+
+	}
+
+	/** Reads the content of a title object, executes the skin and captures the result; returns false for a null title */
+	function getArticleHTML( &$title ) {
+		global $wgOut, $wgTitle, $wgArticle, $wgUser, $wgUseCategoryMagic;
+
+		$wgOut = new OutputPage;
+		$wgOut->setParserOptions( new ParserOptions );
+
+		$wgTitle =& $title;
+		if ( is_null( $wgTitle ) ) {
+			return false;
+		}
+
+		$ns = $wgTitle->getNamespace();
+		if ( $ns == NS_SPECIAL ) {
+			SpecialPage::executePath( $wgTitle );
+		} else {
+			if ( $ns == NS_IMAGE ) {
+				$wgArticle = new ImagePage( $wgTitle );
+			} elseif ( $wgUseCategoryMagic && $ns == NS_CATEGORY ) {
+				$wgArticle = new CategoryPage( $wgTitle );
+			} else {
+				$wgArticle = new Article( $wgTitle );
+			}
+			$wgArticle->view();
+		}
+
+		$sk =& $wgUser->getSkin();
+		ob_start();
+		$sk->outputPage( $wgOut );
+		$text = ob_get_contents();
+		ob_end_clean();
+
+		return $text;
+	}
+
+	/** Returns image paths used in an XHTML document, as an array keyed by the IMG SRC value */
+	function findImages( $text ) {
+		global $wgOutputEncoding, $wgDumpImages;
+		$parser = xml_parser_create( $wgOutputEncoding );
+		xml_set_element_handler( $parser, 'wfDumpStartTagHandler', 'wfDumpEndTagHandler' );
+
+		$wgDumpImages = array();
+		xml_parse( $parser, $text );
+		xml_parser_free( $parser );
+
+		return $wgDumpImages;
+	}
+
+	/**
+	 * Copy images (or create symlinks) from commons to a static directory.
+	 * This is necessary even if you intend to distribute all of commons, because
+	 * the directory contents is used to work out which image description pages
+	 * are needed.
+	 */
+	function copyImages( $images ) {
+		global $wgSharedUploadPath, $wgSharedUploadDirectory;
+		# Find shared uploads and copy them into the static directory
+		$sharedPathLength = strlen( $wgSharedUploadPath );
+		foreach ( $images as $image => $dummy ) {
+			# Is it shared?
+			if ( substr( $image, 0, $sharedPathLength ) == $wgSharedUploadPath ) {
+				# Reconstruct full filename
+				$rel = substr( $image, $sharedPathLength + 1 ); // +1 for slash
+				$sourceLoc = "$wgSharedUploadDirectory/$rel";
+				$staticLoc = "{$this->sharedStaticPath}/$rel";
+				#print "Copying $sourceLoc to $staticLoc\n";
+				# Copy to static directory
+				if ( !file_exists( $staticLoc ) ) {
+					wfMkdirParents( dirname( $staticLoc ), 0755 );
+					if ( function_exists( 'symlink' ) ) {
+						symlink( $sourceLoc, $staticLoc ); // symlink( target, link ): the link is created in the static dir
+					} else {
+						copy( $sourceLoc, $staticLoc );
+					}
+				}
+
+				if ( substr( $rel, 0, 6 ) == 'thumb/' ) {
+					# That was a thumbnail
+					# We will also copy the real image
+					$parts = explode( '/', $rel );
+					$rel = "{$parts[1]}/{$parts[2]}/{$parts[3]}";
+					$sourceLoc = "$wgSharedUploadDirectory/$rel";
+					$staticLoc = "{$this->sharedStaticPath}/$rel";
+					#print "Copying $sourceLoc to $staticLoc\n";
+					if ( !file_exists( $staticLoc ) ) {
+						wfMkdirParents( dirname( $staticLoc ), 0755 );
+						if ( function_exists( 'symlink' ) ) {
+							symlink( $sourceLoc, $staticLoc ); // symlink( target, link ), same order as copy() below
+						} else {
+							copy( $sourceLoc, $staticLoc );
+						}
+					}
+				}
+			}
+		}
+	}
+}
+
+/** XML parser callback: records the SRC of every IMG element in $wgDumpImages */
+function wfDumpStartTagHandler( $parser, $name, $attribs ) {
+	global $wgDumpImages;
+
+	if ( $name == 'IMG' && isset( $attribs['SRC'] ) ) {
+		$wgDumpImages[$attribs['SRC']] = true;
+	}
+}
+
+/** XML parser callback: end tags are ignored */
+function wfDumpEndTagHandler( $parser, $name ) {}
 
 # vim: syn=php
 ?>
diff --git a/maintenance/dumpHTML.php b/maintenance/dumpHTML.php
index 6ba3fc58c9..4bdb424fd1 100644
--- a/maintenance/dumpHTML.php
+++ b/maintenance/dumpHTML.php
@@ -7,13 +7,13 @@
 /**
  */
 
-$optionsWithArgs = array( 's', 'd' );
+$optionsWithArgs = array( 's', 'd', 'e' );
 
 require_once( "commandLine.inc" );
 require_once( "dumpHTML.inc" );
 
 error_reporting( E_ALL & (~E_NOTICE) );
-
+define( 'CHUNK_SIZE', 50 );
 
 if ( !empty( $options['s'] ) ) {
 	$start = $options['s'];
@@ -21,13 +21,48 @@ if ( !empty( $options['s'] ) ) {
 	$start = 1;
 }
 
+if ( !empty( $options['e'] ) ) {
+	$end = $options['e'];
+} else {
+	$dbr =& wfGetDB( DB_SLAVE );
+	$end = $dbr->selectField( 'page', 'max(page_id)', false );
+}
+
 if ( !empty( $options['d'] ) ) {
 	$dest = $options['d'];
 } else {
 	$dest = 'static';
 }
 
-dumpHTML( $dest, $start );
+$d = new DumpHTML( $dest, true, 3 );
+
+if ( !empty( $options['special'] ) ) {
+	$d->doSpecials();
+} elseif ( !empty( $options['images'] ) ) {
+	$d->doImageDescriptions();
+} elseif ( !empty( $options['categories'] ) ) {
+	$d->doCategories();
+} else {
+	if ( $end - $start > CHUNK_SIZE * 2 ) {
+		// Split the problem into smaller chunks, run them in different PHP instances
+		// This is a memory/resource leak workaround
+		print("Creating static HTML dump. Starting from page_id $start of $end.\n");
+		chdir( "maintenance" );
+		for ( $chunkStart = intval( $start ); $chunkStart <= $end; $chunkStart += CHUNK_SIZE ) { // <= so a last chunk starting exactly at $end is not skipped
+			$chunkEnd = $chunkStart + CHUNK_SIZE - 1;
+			if ( $chunkEnd > $end ) {
+				$chunkEnd = $end;
+			}
+			passthru( 'php dumpHTML.php -s ' . intval( $chunkStart ) . ' -e ' . intval( $chunkEnd ) . ' -d ' . escapeshellarg( $dest ) ); // forward -d so the children write to the same destination
+		}
+		chdir( ".." );
+		$d->doImageDescriptions();
+		$d->doCategories();
+		$d->doMainPage();
+	} else {
+		$d->doArticles( $start, $end );
+	}
+}
 
 exit();
 
-- 
2.20.1