From f3348367296bcc2f2ec26e27d16f88123128aa5b Mon Sep 17 00:00:00 2001 From: =?utf8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Thu, 3 Nov 2005 00:23:07 +0000 Subject: [PATCH] * Completely rewrote this to use OO * Support nonstandard namespaces --- maintenance/generateSitemap.php | 287 +++++++++++++++++++++----------- 1 file changed, 189 insertions(+), 98 deletions(-) diff --git a/maintenance/generateSitemap.php b/maintenance/generateSitemap.php index c72bd626da..6240804a4e 100644 --- a/maintenance/generateSitemap.php +++ b/maintenance/generateSitemap.php @@ -1,111 +1,202 @@ + * @copyright Copyright © 2005, Brion Vibber + * + * @link https://www.google.com/webmasters/sitemaps/docs/en/about.html + * @link http://www.google.com/schemas/sitemap/0.84/sitemap.xsd + * + * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later */ -# Copyright (C) 2005 Jens Frank , Brion Vibber -# http://www.mediawiki.org/ -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program; if not, write to the Free Software Foundation, Inc., -# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -# http://www.gnu.org/copyleft/gpl.html - -if ( $argc < 2) { - print "Usage: php generateSitemap.php servername [options]\n"; - print " servername is the name of the website, e.g. mywiki.mydomain.org\n"; - exit ; +$optionsWithArgs = array( 'host' ); +/* */ +require_once 'commandLine.inc'; + +if ( ! isset( $options['host'] ) ) { + echo "Usage: php generateSitemap.php --host=hostname\n"; + exit(1); +} else { + $_SERVER['HOSTNAME'] = $options['host']; } -$_SERVER['HOSTNAME'] = $argv[1]; -print $argv[1] . "\n"; - - -/** */ -require_once( "commandLine.inc" ); - print "DB name: $wgDBname\n"; - print "DB user: $wgDBuser\n"; - -$priorities = array ( - NS_MAIN => 0.9, - NS_TALK => 0.4, - NS_USER => 0.3, - NS_USER_TALK => 0.3, - NS_PROJECT => 0.5, - NS_PROJECT_TALK => 0.2, - NS_IMAGE => 0.2, - NS_IMAGE_TALK => 0.1, - NS_MEDIAWIKI => 0.1, - NS_MEDIAWIKI_TALK => 0.1, - NS_TEMPLATE => 0.1, - NS_TEMPLATE_TALK => 0.1, - NS_HELP => 0.3, - NS_HELP_TALK => 0.1, - NS_CATEGORY => 0.3, - NS_CATEGORY_TALK => 0.1, -); - -$dbr =& wfGetDB( DB_SLAVE ); -$page = $dbr->tableName( 'page' ); - -$findex = fopen( "sitemap-index-$wgDBname.xml", "wb" ); -fwrite( $findex, '' . "\n" . -'' . "\n" ); - -foreach ( $priorities as $ns => $priority) { - $sql = "SELECT page_namespace,page_title,page_is_redirect,page_touched FROM $page WHERE page_namespace = $ns"; - print "DB query : $sql\nprocessing ..."; - $res = $dbr->query( $sql ); - print " done\n"; - - $gzfile = false; - $rowcount=0; - $sitemapcount=0; - while ( $row = $dbr->fetchObject( $res ) ) { - if ( $rowcount % 9000 == 0 ) { - if ( $gzfile !== false ) { - gzwrite( $gzfile, '' ); - gzclose( $gzfile ); + +$gs = new GenerateSitemap( $options['host'] ); +$gs->main(); + +class GenerateSitemap { + var $host; + var $cutoff = 9000; + var $priorities = array( + // Custom main namespaces + -2 => '0.5', + // Custom talk namesspaces + -1 => '0.1', + NS_MAIN => '1.0', + NS_TALK => '0.1', + NS_USER => '0.5', + NS_USER_TALK => '0.1', + NS_PROJECT => '0.5', + NS_PROJECT_TALK => '0.5', + NS_IMAGE => '0.5', + NS_IMAGE_TALK => '0.1', + NS_MEDIAWIKI => '0.0', + NS_MEDIAWIKI_TALK => '0.0', + NS_TEMPLATE => '0.0', + NS_TEMPLATE_TALK => '0.0', + NS_HELP => '0.5', + NS_HELP_TALK => '0.1', + NS_CATEGORY => '0.5', + NS_CATEGORY_TALK => '0.1', + ); + var $namespaces = array(); + var $dbr; + var $file, $findex; + var $stderr; + + function GenerateSitemap( $host ) { + global $wgDBname; + + $this->stderr = fopen( 'php://stderr', 'wt' ); + + $this->host = $host; + $this->dbr =& wfGetDB( DB_SLAVE ); + $this->generateNamespaces(); + $this->findex = fopen( "sitemap-index-$wgDBname.xml", 'wb' ); + } + + function generateNamespaces() { + $fname = 'GenerateSitemap::generateNamespaces'; + + $res = $this->dbr->select( 'page', + array( 'page_namespace' ), + array(), + $fname, + array( + 'GROUP BY' => 'page_namespace', + 'ORDER BY' => 'page_namespace', + ) + ); + + while ( $row = $this->dbr->fetchObject( $res ) ) + $this->namespaces[] = $row->page_namespace; + } + + function priority( $namespace ) { + return isset( $this->priorities[$namespace] ) ? $this->priorities[$namespace] : $this->guessPriority( $namespace ); + } + + function guessPriority( $namespace ) { + return Namespace::isTalk( $namespace ) ? $this->priorities[-1] : $this->priorities[-2]; + } + + function getPageRes( $namespace ) { + $fname = 'GenerateSitemap::getPageRes'; + + return $this->dbr->select( 'page', + array( + 'page_namespace', + 'page_title', + 'page_is_redirect', + 'page_touched', + ), + array( 'page_namespace' => $namespace ), + $fname + ); + } + + function main() { + global $wgDBname; + + fwrite( $this->findex, $this->openIndex() ); + + foreach ( $this->namespaces as $namespace ) { + $res = $this->getPageRes( $namespace ); + $this->file = false; + $i = $smcount = 0; + + while ( $row = $this->dbr->fetchObject( $res ) ) { + if ( $i % $this->cutoff == 0 ) { + if ( $this->file !== false ) { + gzwrite( $this->file, $this->closeFile() ); + gzclose( $this->file ); + } + ++$smcount; + $filename = "sitemap-$wgDBname-NS$namespace-$smcount.xml.gz"; + $this->file = gzopen( $filename, 'wb' ); + $this->debug( $namespace ); + gzwrite( $this->file, $this->openFile() ); + fwrite( $this->findex, $this->indexEntry( $filename ) ); + $this->debug( "\t$filename" ); + } + ++$i; + $title = Title::makeTitle( $row->page_namespace, $row->page_title ); + $date = $this->ISO8601( $row->page_touched ); + gzwrite( $this->file, $this->fileEntry( $title->getFullURL(), $date, $this->priority( $namespace ) ) ); + } + if ( $this->file ) { + gzwrite( $this->file, $this->closeFile() ); + gzclose( $this->file ); } - $sitemapcount ++; - $fname = "sitemap-{$wgDBname}-NS{$ns}-{$sitemapcount}.xml.gz"; - $gzfile = gzopen( $fname, "wb" ); - gzwrite( $gzfile, '' . "\n" . - '' . "\n" ); - fwrite( $findex, "\t\n\t\t$wgServer/$fname\n\t\n" ); - print "$fname\n"; } - $rowcount ++; - $nt = Title::makeTitle( $row->page_namespace, $row->page_title ); - $date = substr($row->page_touched, 0, 4). '-' . - substr($row->page_touched, 4, 2). '-' . - substr($row->page_touched, 6, 2); - gzwrite( $gzfile, "\t\n\t\t" . $nt->getFullURL() . - "\n\t\t$date\n" . - "\t\t$priority\n" . - "\t\n" ); - } - if ( $gzfile ) { - gzwrite( $gzfile, "\n" ); - gzclose( $gzfile ); - } - print "\n"; -} -fwrite( $findex, "\n" ); -fclose( $findex ); + fwrite( $this->findex, $this->closeIndex() ); + fclose( $this->findex ); + } + + function xmlHead() { + return '' . "\n"; + } + + function xmlSchema() { + return 'http://www.google.com/schemas/sitemap/0.84'; + } + + function openIndex() { + return $this->xmlHead() . '' . "\n"; + } + + function indexEntry( $filename ) { + global $wgServer; + + return + "\t\n" . + "\t\t$wgServer/$filename\n" . + "\t\n"; + } + function closeIndex() { + return "\n"; + } + + function openFile() { + return $this->xmlHead() . '' . "\n"; + } + + function fileEntry( $url, $date, $priority ) { + return + "\t\n" . + "\t\t$url\n" . + "\t\t$date\n" . + "\t\t$priority\n" . + "\t\n"; + } + + function closeFile() { + return "\n"; + } + + function ISO8601( $timestamp ) { + return substr( wfTimestamp( TS_DB, $timestamp ), 0, 4 + 1 + 2 + 1 + 2 ); + } + + function debug( $str ) { + fwrite( $this->stderr, "$str\n" ); + } +} ?> -- 2.20.1