* Completely rewrote this to use OO
author Ævar Arnfjörð Bjarmason <avar@users.mediawiki.org>
Thu, 3 Nov 2005 00:23:07 +0000 (00:23 +0000)
committer Ævar Arnfjörð Bjarmason <avar@users.mediawiki.org>
Thu, 3 Nov 2005 00:23:07 +0000 (00:23 +0000)
* Support nonstandard namespaces

maintenance/generateSitemap.php

index c72bd62..6240804 100644 (file)
 <?php
 /**
+ * Creates a Google sitemap for the site
+ *
  * @package MediaWiki
  * @subpackage Maintenance
  *
- * Creates a Google sitemap.
- * https://www.google.com/webmasters/sitemaps/docs/en/about.html
+ * @copyright Copyright © 2005, Ævar Arnfjörð Bjarmason
+ * @copyright Copyright © 2005, Jens Frank <jeluf@gmx.de>
+ * @copyright Copyright © 2005, Brion Vibber <brion@pobox.com>
+ *
+ * @link https://www.google.com/webmasters/sitemaps/docs/en/about.html
+ * @link http://www.google.com/schemas/sitemap/0.84/sitemap.xsd
+ *
+ * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
  */
 
-# Copyright (C) 2005 Jens Frank <jeluf@gmx.de>, Brion Vibber <brion@pobox.com>
-# http://www.mediawiki.org/
-# 
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or 
-# (at your option) any later version.
-# 
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-# 
-# You should have received a copy of the GNU General Public License along
-# with this program; if not, write to the Free Software Foundation, Inc.,
-# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-# http://www.gnu.org/copyleft/gpl.html
-
-if ( $argc < 2) {
-       print "Usage: php generateSitemap.php servername [options]\n";
-       print " servername is the name of the website, e.g. mywiki.mydomain.org\n";
-       exit ;
+// Declare 'host' as an option that carries a value. This is set before
+// commandLine.inc is included; presumably that file reads it while filling
+// $options -- TODO confirm against commandLine.inc.
+$optionsWithArgs = array( 'host' );
+/* */
+require_once 'commandLine.inc';
+
+// --host is mandatory: print the usage line and exit with status 1 when it
+// is missing, otherwise expose the value as $_SERVER['HOSTNAME'].
+if ( ! isset( $options['host'] ) ) {
+       echo "Usage: php generateSitemap.php --host=hostname\n";
+       exit(1);
+} else {
+       $_SERVER['HOSTNAME'] = $options['host'];
 }
-$_SERVER['HOSTNAME'] = $argv[1];
-print $argv[1] . "\n";
-
-
-/** */
-require_once( "commandLine.inc" );
- print "DB name: $wgDBname\n";
- print "DB user: $wgDBuser\n";
-
-$priorities = array (
-        NS_MAIN             => 0.9,
-        NS_TALK             => 0.4,
-        NS_USER             => 0.3,
-        NS_USER_TALK        => 0.3,
-        NS_PROJECT          => 0.5,
-        NS_PROJECT_TALK     => 0.2,
-        NS_IMAGE            => 0.2,
-        NS_IMAGE_TALK       => 0.1,
-        NS_MEDIAWIKI        => 0.1,
-        NS_MEDIAWIKI_TALK   => 0.1,
-        NS_TEMPLATE         => 0.1,
-        NS_TEMPLATE_TALK    => 0.1,
-        NS_HELP             => 0.3,
-        NS_HELP_TALK        => 0.1,
-        NS_CATEGORY         => 0.3,
-        NS_CATEGORY_TALK    => 0.1,
-);
-
-$dbr =& wfGetDB( DB_SLAVE );
-$page = $dbr->tableName( 'page' );
-
-$findex = fopen( "sitemap-index-$wgDBname.xml", "wb" );
-fwrite( $findex, '<?xml version="1.0" encoding="UTF-8"?>' . "\n" . 
-'<sitemapindex xmlns="http://www.google.com/schemas/sitemap/0.84">' . "\n" );
-
-foreach ( $priorities as $ns => $priority) {
-       $sql = "SELECT page_namespace,page_title,page_is_redirect,page_touched FROM $page WHERE page_namespace = $ns";
-       print "DB query : $sql\nprocessing ...";
-       $res = $dbr->query( $sql );
-       print " done\n";
-
-       $gzfile = false;
-       $rowcount=0;
-       $sitemapcount=0;
-       while ( $row = $dbr->fetchObject( $res ) ) {
-               if ( $rowcount % 9000 == 0 ) {
-                       if ( $gzfile !== false ) {
-                               gzwrite( $gzfile, '</urlset>' );
-                               gzclose( $gzfile );
+
+// Build the generator for the requested host and write the sitemap files.
+$gs = new GenerateSitemap( $options['host'] );
+$gs->main();
+
+class GenerateSitemap {
+       var $host;
+       var $cutoff = 9000;
+       var $priorities = array(
+               // Custom main namespaces
+               -2                      => '0.5',
+               // Custom talk namespaces
+               -1                      => '0.1',       
+               NS_MAIN                 => '1.0',
+               NS_TALK                 => '0.1',
+               NS_USER                 => '0.5',
+               NS_USER_TALK            => '0.1',
+               NS_PROJECT              => '0.5',
+               NS_PROJECT_TALK         => '0.5',
+               NS_IMAGE                => '0.5',
+               NS_IMAGE_TALK           => '0.1',
+               NS_MEDIAWIKI            => '0.0',
+               NS_MEDIAWIKI_TALK       => '0.0',
+               NS_TEMPLATE             => '0.0',
+               NS_TEMPLATE_TALK        => '0.0',
+               NS_HELP                 => '0.5',
+               NS_HELP_TALK            => '0.1',
+               NS_CATEGORY             => '0.5',
+               NS_CATEGORY_TALK        => '0.1',
+       );
+       var $namespaces = array();
+       var $dbr;
+       var $file, $findex;
+       var $stderr;
+       
+       /**
+        * Constructor: stores the host and opens everything the run needs.
+        *
+        * Opens php://stderr for debug output, obtains a DB_SLAVE database
+        * handle through wfGetDB(), collects the namespaces via
+        * generateNamespaces() and opens the sitemap index file for writing.
+        *
+        * @param string $host Host name the sitemap is generated for
+        */
+       function GenerateSitemap( $host ) {
+               global $wgDBname;
+
+               $this->host = $host;
+               $this->stderr = fopen( 'php://stderr', 'wt' );
+
+               // generateNamespaces() queries through $this->dbr, so the handle has to exist first
+               $this->dbr =& wfGetDB( DB_SLAVE );
+               $this->generateNamespaces();
+
+               $indexPath = "sitemap-index-$wgDBname.xml";
+               $this->findex = fopen( $indexPath, 'wb' );
+       }
+
+       /**
+        * Append every distinct page_namespace value found in the page table
+        * to $this->namespaces, ordered by namespace number.
+        */
+       function generateNamespaces() {
+               $queryOptions = array(
+                       'GROUP BY' => 'page_namespace',
+                       'ORDER BY' => 'page_namespace',
+               );
+               $result = $this->dbr->select(
+                       'page',
+                       array( 'page_namespace' ),
+                       array(),
+                       'GenerateSitemap::generateNamespaces',
+                       $queryOptions
+               );
+
+               while ( $row = $this->dbr->fetchObject( $result ) ) {
+                       $this->namespaces[] = $row->page_namespace;
+               }
+       }
+
+       /**
+        * Look up the sitemap priority for a namespace.
+        *
+        * @param int $namespace Namespace number
+        * @return string The entry from $this->priorities when one exists,
+        *                otherwise the result of guessPriority()
+        */
+       function priority( $namespace ) {
+               if ( isset( $this->priorities[$namespace] ) ) {
+                       return $this->priorities[$namespace];
+               }
+
+               return $this->guessPriority( $namespace );
+       }
+
+       /**
+        * Pick a fallback priority for a namespace that has no explicit entry.
+        *
+        * @param int $namespace Namespace number
+        * @return string $this->priorities[-1] when Namespace::isTalk() reports
+        *                a talk namespace, $this->priorities[-2] otherwise
+        */
+       function guessPriority( $namespace ) {
+               $fallbackKey = Namespace::isTalk( $namespace ) ? -1 : -2;
+
+               return $this->priorities[$fallbackKey];
+       }
+
+       /**
+        * Select the rows of the page table that belong to one namespace.
+        *
+        * @param int $namespace Namespace number to restrict the query to
+        * @return mixed Whatever $this->dbr->select() returns for the query
+        */
+       function getPageRes( $namespace ) {
+               $fields = array(
+                       'page_namespace',
+                       'page_title',
+                       'page_is_redirect',
+                       'page_touched',
+               );
+               $conds = array( 'page_namespace' => $namespace );
+
+               return $this->dbr->select( 'page', $fields, $conds, 'GenerateSitemap::getPageRes' );
+       }
+
+       /**
+        * Write the sitemap index and the gzipped sitemap files.
+        *
+        * Emits the index header to $this->findex, then walks every namespace
+        * in $this->namespaces. The pages of a namespace are written as <url>
+        * entries into files named sitemap-<db>-NS<namespace>-<n>.xml.gz; a new
+        * file is started on the first row and again after every $this->cutoff
+        * rows, and each file is registered in the index. Finally the index
+        * footer is written and the index file is closed.
+        */
+       function main() {
+               global $wgDBname;
+
+               fwrite( $this->findex, $this->openIndex() );
+               
+               foreach ( $this->namespaces as $namespace ) {
+                       $res = $this->getPageRes( $namespace );
+                       $this->file = false;
+                       // $i counts the rows handled for this namespace, $smcount the files opened for it
+                       $i = $smcount = 0;
+                       
+                       while ( $row = $this->dbr->fetchObject( $res ) ) {
+                               // Roll over to a new file on the first row and after every $this->cutoff rows
+                               if ( $i % $this->cutoff == 0 ) {
+                                       // Finish the previous file of this namespace, if there is one
+                                       if ( $this->file !== false ) {
+                                               gzwrite( $this->file, $this->closeFile() );
+                                               gzclose( $this->file );
+                                       }
+                                       ++$smcount;
+                                       $filename = "sitemap-$wgDBname-NS$namespace-$smcount.xml.gz";
+                                       $this->file = gzopen( $filename, 'wb' );
+                                       $this->debug( $namespace );
+                                       gzwrite( $this->file, $this->openFile() );
+                                       // Every sitemap file gets its own entry in the index
+                                       fwrite( $this->findex, $this->indexEntry( $filename ) );
+                                       $this->debug( "\t$filename" );
+                               }
+                               ++$i;
+                               $title = Title::makeTitle( $row->page_namespace, $row->page_title );
+                               $date = $this->ISO8601( $row->page_touched );
+                               gzwrite( $this->file, $this->fileEntry( $title->getFullURL(), $date, $this->priority( $namespace ) ) );
+                       }
+                       // Close the last file opened for this namespace
+                       if ( $this->file ) {
+                               gzwrite( $this->file, $this->closeFile() );
+                               gzclose( $this->file );
                        }
-                       $sitemapcount ++;
-                       $fname = "sitemap-{$wgDBname}-NS{$ns}-{$sitemapcount}.xml.gz";
-                       $gzfile = gzopen( $fname, "wb" );
-                       gzwrite( $gzfile, '<?xml version="1.0" encoding="UTF-8"?>' . "\n" . 
-                               '<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">' . "\n" );
-                       fwrite( $findex, "\t<sitemap>\n\t\t<loc>$wgServer/$fname</loc>\n\t</sitemap>\n" );
-                       print "$fname\n";
                }
-               $rowcount ++;
-               $nt = Title::makeTitle( $row->page_namespace, $row->page_title );
-               $date = substr($row->page_touched, 0, 4). '-' .
-                       substr($row->page_touched, 4, 2). '-' .
-                       substr($row->page_touched, 6, 2);
-               gzwrite( $gzfile, "\t<url>\n\t\t<loc>" . $nt->getFullURL() . 
-                               "</loc>\n\t\t<lastmod>$date</lastmod>\n" .
-                               "\t\t<priority>$priority</priority>\n" .
-                               "\t</url>\n" );
-       }
-       if ( $gzfile ) {
-               gzwrite( $gzfile, "</urlset>\n" );
-               gzclose( $gzfile );
-       }
-       print "\n";
-}
-fwrite( $findex, "</sitemapindex>\n" );
-fclose( $findex );
+               fwrite( $this->findex, $this->closeIndex() );
+               fclose( $this->findex );
+       }
+
+       /**
+        * Return the XML declaration line, terminated by a newline.
+        *
+        * @return string
+        */
+       function xmlHead() {
+               $declaration = '<?xml version="1.0" encoding="UTF-8"?>';
+
+               return $declaration . "\n";
+       }
+
+       /**
+        * Return the URI used as the xmlns value of the generated documents.
+        *
+        * @return string
+        */
+       function xmlSchema() {
+               $namespaceUri = 'http://www.google.com/schemas/sitemap/0.84';
+
+               return $namespaceUri;
+       }
+
+       /**
+        * Return the opening of the sitemap index document: the XML
+        * declaration followed by the <sitemapindex> start tag.
+        *
+        * @return string
+        */
+       function openIndex() {
+               $head = $this->xmlHead();
+               $schema = $this->xmlSchema();
+
+               return $head . '<sitemapindex xmlns="' . $schema . '">' . "\n";
+       }
+
+       /**
+        * Return the <sitemap> element that registers one sitemap file in the
+        * index.
+        *
+        * @param string $filename Name of the sitemap file; it is appended to
+        *                         $wgServer to form the <loc> URL
+        * @return string
+        */
+       function indexEntry( $filename ) {
+               global $wgServer;
+
+               // The <loc> element must be closed with </loc>, otherwise the
+               // index is not well-formed XML.
+               return
+                       "\t<sitemap>\n" .
+                       "\t\t<loc>$wgServer/$filename</loc>\n" .
+                       "\t</sitemap>\n";
+       }
 
+       /**
+        * Return the closing tag of the sitemap index document.
+        *
+        * @return string
+        */
+       function closeIndex() {
+               $endTag = "</sitemapindex>\n";
+
+               return $endTag;
+       }
+
+       /**
+        * Return the opening of a sitemap file: the XML declaration followed
+        * by the <urlset> start tag.
+        *
+        * @return string
+        */
+       function openFile() {
+               $head = $this->xmlHead();
+               $schema = $this->xmlSchema();
+
+               return $head . '<urlset xmlns="' . $schema . '">' . "\n";
+       }
+       
+       /**
+        * Return the <url> element for one page.
+        *
+        * The arguments are inserted as given; no escaping is applied here.
+        *
+        * @param string $url      Value of the <loc> element
+        * @param string $date     Value of the <lastmod> element
+        * @param string $priority Value of the <priority> element
+        * @return string
+        */
+       function fileEntry( $url, $date, $priority ) {
+               $lines = array(
+                       "\t<url>\n",
+                       "\t\t<loc>$url</loc>\n",
+                       "\t\t<lastmod>$date</lastmod>\n",
+                       "\t\t<priority>$priority</priority>\n",
+                       "\t</url>\n",
+               );
+
+               return implode( '', $lines );
+       }
+
+       /**
+        * Return the closing tag of a sitemap file.
+        *
+        * @return string
+        */
+       function closeFile() {
+               $endTag = "</urlset>\n";
+
+               return $endTag;
+       }
+       
+       /**
+        * Return the first ten characters of the timestamp after converting
+        * it with wfTimestamp( TS_DB, ... ).
+        *
+        * NOTE(review): presumably TS_DB yields "YYYY-MM-DD HH:MM:SS", so the
+        * result is the date part only -- confirm against wfTimestamp().
+        *
+        * @param string $timestamp Timestamp as stored in page_touched
+        * @return string
+        */
+       function ISO8601( $timestamp ) {
+               // 4 (year) + 1 (separator) + 2 (month) + 1 (separator) + 2 (day)
+               $dateLength = 10;
+               $converted = wfTimestamp( TS_DB, $timestamp );
+
+               return substr( $converted, 0, $dateLength );
+       }
+
+       /**
+        * Write a message followed by a newline to the stderr handle opened
+        * by the constructor.
+        *
+        * @param string $str Message to print
+        */
+       function debug( $str ) {
+               $line = $str . "\n";
+               fwrite( $this->stderr, $line );
+       }
+}
 
 ?>