Basic static HTML dump generator, experimental
author Tim Starling <tstarling@users.mediawiki.org>
Sun, 27 Mar 2005 16:05:33 +0000 (16:05 +0000)
committer Tim Starling <tstarling@users.mediawiki.org>
Sun, 27 Mar 2005 16:05:33 +0000 (16:05 +0000)
includes/DefaultSettings.php
includes/Title.php
maintenance/dumpHTML.inc [new file with mode: 0644]
maintenance/dumpHTML.php [new file with mode: 0644]

index 4e4aeae..61c2c92 100644 (file)
@@ -102,6 +102,11 @@ $wgTmpDirectory     = "{$wgUploadDirectory}/tmp";
 $wgUploadBaseUrl    = "";
 /**#@-*/
 
+/**
+ * Link articles by hashed static HTML paths. Set internally by dumpHTML(); do not set.
+ */
+$wgMakeDumpLinks = false;
+
 /**
  * To set 'pretty' URL paths for actions other than
  * plain page views, add to this array. For instance:
index e6454a3..63390a1 100644 (file)
@@ -616,10 +616,9 @@ class Title {
         * @access public
         */
        function getFullURL( $query = '' ) {
-               global $wgContLang, $wgArticlePath, $wgServer, $wgScript;
+               global $wgContLang, $wgServer, $wgScript;
 
                if ( '' == $this->mInterwiki ) {
-                       $p = $wgArticlePath;
                        return $wgServer . $this->getLocalUrl( $query );
                } else {
                        $baseUrl = $this->getInterwikiLink( $this->mInterwiki );
@@ -644,6 +643,35 @@ class Title {
                }
        }
 
+       /**
+        * Get a relative two-level directory ("x/y") for the HTML version of this
+        * article, from the first two bytes of the prefixed DB key (left-padded with
+        * spaces if shorter). Bytes unsafe in a path component (controls, space,
+        * non-ASCII, and . / \ : * ? " < > |) become two hex digits; others are lower-cased.
+        * @return string
+        */
+       function getHashedDirectory() {
+               $dbkey = sprintf( "%2s", $this->getPrefixedDBkey() );
+               $parts = array();
+               for ( $i = 0; $i < 2; $i++ ) {
+                       $c = $dbkey[$i];
+                       # '/' or '.' as a component would break the fixed two-level layout
+                       if ( ord( $c ) > 32 && ord( $c ) < 127 && strpos( './\\:*?"<>|', $c ) === false ) {
+                               $parts[] = strtolower( $c );
+                       } else {
+                               $parts[] = sprintf( "%02X", ord( $c ) );
+                       }
+               }
+               return implode( '/', $parts );
+       }
+       
+       /** Get the relative path of this article's HTML file: hashed directory plus a filesystem-friendly name */
+       function getHashedFilename() {
+               $unsafe = array( '/', '\\', ':', '*', '?', '"', '<', '>', '|' );
+               $friendlyName = str_replace( $unsafe, '_', $this->getPrefixedDBkey() );
+               return $this->getHashedDirectory() . "/$friendlyName.html";
+       }
+       
        /**
         * Get a URL with no fragment or server name
         * @param string $query an optional query string; if not specified,
@@ -652,14 +680,16 @@ class Title {
         * @access public
         */
        function getLocalURL( $query = '' ) {
-               global $wgLang, $wgArticlePath, $wgScript;
+               global $wgLang, $wgArticlePath, $wgScript, $wgMakeDumpLinks;
                
                if ( $this->isExternal() ) {
                        return $this->getFullURL();
                }
-
+               
                $dbkey = wfUrlencode( $this->getPrefixedDBkey() );
-               if ( $query == '' ) {
+               if ( $wgMakeDumpLinks ) {
+                       $url = str_replace( '$1', wfUrlencode( $this->getHashedFilename() ), $wgArticlePath );
+               } elseif ( $query == '' ) {
                        $url = str_replace( '$1', $dbkey, $wgArticlePath );
                } else {
                        if( preg_match( '/^(.*&|)action=([^&]*)(&(.*)|)$/', $query, $matches ) ) {
diff --git a/maintenance/dumpHTML.inc b/maintenance/dumpHTML.inc
new file mode 100644 (file)
index 0000000..7c145a9
--- /dev/null
@@ -0,0 +1,107 @@
+<?php
+
+define( 'REPORTING_INTERVAL', 10 ); # dumpHTML() prints the current cur_id whenever it is a multiple of this
+
+/**
+ * Write a static HTML rendering of every article in the cur table to disk.
+ * Each page is parsed, rendered with the 'htmldump' skin and saved under $dest
+ * at the relative path returned by Title::getHashedFilename(). Progress and
+ * errors are printed; any filesystem error aborts the dump.
+ *
+ * @param string $dest  directory to write into, created if missing
+ * @param int    $start cur_id to start from
+ */
+function dumpHTML( $dest, $start ) {
+       global $wgUser, $wgTitle, $wgArticle, $wgLinkCache, $wgOut, $wgLinkHolders;
+       global $wgMakeDumpLinks, $wgStylePath, $wgArticlePath, $wgUploadPath, $wgLogo;
+
+       # Switch links to hashed file names; paths are relative to a page two levels below $dest
+       $wgMakeDumpLinks = true;
+       $wgScriptPath = "../../.."; # local only; the global of that name is not changed
+       $wgStylePath = "$wgScriptPath/skins";
+       $wgUploadPath = "$wgScriptPath/images";
+       $wgLogo = "$wgStylePath/common/images/wiki.png";
+       $wgArticlePath = '../../$1';
+
+       $dbr =& wfGetDB( DB_SLAVE );
+       $end = $dbr->selectField( 'cur', 'max(cur_id)', false );
+
+       print("Creating static HTML dump. Starting from cur_id $start of $end.\n");
+
+       $wgUser = new User;
+       $wgUser->setOption( 'skin', 'htmldump' );
+       $sk =& $wgUser->getSkin();
+
+       # Directories need the execute bit to be traversable, hence 0755 rather than 0644
+       if ( !is_dir( $dest ) && !mkdir( $dest, 0755 ) ) {
+               print("Can't make directory $dest, exiting\n");
+               return;
+       }
+
+       for ($id = $start; $id <= $end; $id++) {
+               if ( !($id % REPORTING_INTERVAL) ) {
+                       print("$id\n");
+               }
+
+               $wgOut = new OutputPage;
+               $wgOut->setArticleFlag( true );
+               $wgOut->setRobotpolicy( 'index,follow' );
+
+               # Skip IDs for which no Title can be loaded
+               $wgTitle = Title::newFromID( $id );
+               if ( is_null( $wgTitle ) ) {
+                       continue;
+               }
+
+               $wgArticle = new Article( $wgTitle );
+               $text = $wgArticle->getContent( true );
+               $wgLinkCache = new LinkCache;
+               $wgLinkCache->forUpdate( true );
+
+               # Reset the global link holder arrays for this page
+               $wgLinkHolders = array(
+                       'namespaces' => array(),
+                       'dbkeys' => array(),
+                       'queries' => array(),
+                       'texts' => array(),
+                       'titles' => array()
+               );
+
+               # Parse the text and replace links with placeholders
+               $wgOut->setPageTitle( $wgTitle->getPrefixedText() );
+               $wgOut->addWikiText( $text );
+               $wgOut->transformBuffer();
+
+               # Execute skin to get complete HTML
+               ob_start();
+               $sk->outputPage( $wgOut );
+               $text = ob_get_contents();
+               ob_end_clean();
+
+               # Write to file, creating both levels of the hashed directory first
+               $fname = $wgTitle->getHashedFilename();
+               $bits = explode( '/', $fname );
+               $parentDir = "$dest/{$bits[0]}";
+               $fullDir = "$parentDir/{$bits[1]}";
+               $fullName = "$dest/$fname";
+
+               foreach ( array( $parentDir, $fullDir ) as $dir ) {
+                       if ( !is_dir( $dir ) && !mkdir( $dir, 0755 ) ) {
+                               print("Can't write to directory $dir\n");
+                               return;
+                       }
+               }
+
+               $file = fopen( $fullName, 'w' );
+               if ( !$file ) {
+                       print("Can't open file $fullName for writing\n");
+                       return;
+               }
+
+               fwrite( $file, $text );
+               fclose( $file );
+       }
+}
+
+# vim: syn=php
+?>
diff --git a/maintenance/dumpHTML.php b/maintenance/dumpHTML.php
new file mode 100644 (file)
index 0000000..6ba3fc5
--- /dev/null
@@ -0,0 +1,34 @@
+<?php
+/**
+ * @todo document
+ * @package MediaWiki
+ * @subpackage Maintenance
+ */
+
+/** */
+
+# Command-line entry point for the static HTML dump. Options are read from
+# $options, which is presumably filled in by commandLine.inc (not shown here):
+#   s  cur_id to start from (default 1)
+#   d  destination directory (default 'static')
+
+# Set before commandLine.inc is loaded; presumably marks s and d as taking a value
+$optionsWithArgs = array( 's', 'd' );
+
+require_once( "commandLine.inc" );
+require_once( "dumpHTML.inc" );
+
+# Report everything except notices
+error_reporting( E_ALL & (~E_NOTICE) );
+
+# First cur_id to dump; a missing or empty option falls back to 1
+$start = empty( $options['s'] ) ? 1 : $options['s'];
+
+# Output directory; a missing or empty option falls back to 'static'
+$dest = empty( $options['d'] ) ? 'static' : $options['d'];
+
+dumpHTML( $dest, $start );
+
+exit();
+
+?>