Generator for Google sitemaps
authorJens Frank <jeluf@users.mediawiki.org>
Sat, 22 Oct 2005 10:40:49 +0000 (10:40 +0000)
committerJens Frank <jeluf@users.mediawiki.org>
Sat, 22 Oct 2005 10:40:49 +0000 (10:40 +0000)
maintenance/generateSitemap.php [new file with mode: 0644]

diff --git a/maintenance/generateSitemap.php b/maintenance/generateSitemap.php
new file mode 100644 (file)
index 0000000..9daa3db
--- /dev/null
@@ -0,0 +1,116 @@
+<?php
+/**
+ * @package MediaWiki
+ * @subpackage Maintenance
+ *
+ * Creates a Google sitemap.
+ * https://www.google.com/webmasters/sitemaps/docs/en/about.html
+ */
+
+# Copyright (C) 2005 Jens Frank <jeluf@gmx.de>, Brion Vibber <brion@pobox.com>
+# http://www.mediawiki.org/
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or 
+# (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+# http://www.gnu.org/copyleft/gpl.html
+
+if ( $argc < 2) {
+       print "Usage: php generateSitemap.sql servername [options]\n";
+       print " servername is the name of the website, e.g. mywiki.mydomain.org\n";
+       exit ;
+}
+$_SERVER['HOSTNAME'] = $argv[1];
+print $argv[1] . "\n";
+
+
+/** */
+require_once( "commandLine.inc" );
+ print "DB name: $wgDBname\n";
+ print "DB user: $wgDBuser\n";
+# print "DB password: $wgDBpassword\n";
+
+
+$priorities = array (
+        NS_MAIN             => 0.9,
+        NS_TALK             => 0.4,
+        NS_USER             => 0.3,
+        NS_USER_TALK        => 0.3,
+        NS_PROJECT          => 0.5,
+        NS_PROJECT_TALK     => 0.2,
+        NS_IMAGE            => 0.2,
+        NS_IMAGE_TALK       => 0.1,
+        NS_MEDIAWIKI        => 0.1,
+        NS_MEDIAWIKI_TALK   => 0.1,
+        NS_TEMPLATE         => 0.1,
+        NS_TEMPLATE_TALK    => 0.1,
+        NS_HELP             => 0.3,
+        NS_HELP_TALK        => 0.1,
+        NS_CATEGORY         => 0.3,
+        NS_CATEGORY_TALK    => 0.1,
+);
+
+$dbr =& wfGetDB( DB_SLAVE );
+$page = $dbr->tableName( 'page' );
+$rev = $dbr->tableName( 'revision' );
+
+$findex = fopen( "sitemap_index.xml", "wb" );
+fwrite( $findex, '<?xml version="1.0" encoding="UTF-8"?>
+   <sitemapindex xmlns="http://www.google.com/schemas/sitemap/0.84">
+   ' );
+
+foreach ( $priorities as $ns => $priority) {
+       $sql = "SELECT page_namespace,page_title,page_is_redirect,rev_timestamp  FROM $page, $rev ".
+               "WHERE page_namespace = $ns AND page_latest = rev_id ";
+       print "DB query : $sql\nprocessing ...";
+       $res = $dbr->query( $sql );
+       print " done\n";
+
+       $gzfile = false;
+       $rowcount=0;
+       $sitemapcount=0;
+       while ( $row = $dbr->fetchObject( $res ) ) {
+               if ( $rowcount % 9000 == 0 ) {
+                       if ( $gzfile !== false ) {
+                               gzwrite( $gzfile, '</urlset>' );
+                               gzclose( $gzfile );
+                       }
+                       $sitemapcount ++;
+                       $fname = "sitemap-NS".$ns."-".$sitemapcount.".xml.gz";
+                       $gzfile = gzopen( $fname, "wb" );
+                       gzwrite( $gzfile, '<?xml version="1.0" encoding="UTF-8"?>
+                                       < urlset xmlns="http://www.google.com/schemas/sitemap/0.84">' );
+                       fwrite( $findex, '<sitemap><loc>'.$wgServer.'/'.$fname."</loc></sitemap>\n" );
+                       print "$fname\n";
+               }
+               $rowcount ++;
+               $nt = Title::makeTitle( $row->page_namespace, $row->page_title );
+               $date = substr($row->rev_timestamp, 0, 4). '-' .
+                       substr($row->rev_timestamp, 4, 2). '-' .
+                       substr($row->rev_timestamp, 6, 2);
+               gzwrite( $gzfile, "<url>\n  <loc>" . $nt->getFullURL() . 
+                               "</loc>\n  <lastmod>".$date."</lastmod>\n  " .
+                               '<priority>' . $priority . '</priority>' .
+                               "\n</url>\n" );
+       }
+       if ( $gzfile ) {
+               gzwrite( $gzfile, "</urlset>\n" );
+               gzclose( $gzfile );
+       }
+       print "\n";
+}
+fwrite( $findex, "</sitemapindex>\n" );
+fclose( $findex );
+
+
+?>