* @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
*/
-$optionsWithArgs = array( 'host', 'path' );
+if ( isset( $argv[1] ) )
+ $_SERVER['SERVER_NAME'] = $argv[1];
+
+$optionsWithArgs = array( 'path' );
/* */
require_once 'commandLine.inc';
-if ( ! isset( $options['host'] ) ) {
- echo "Usage: php generateSitemap.php --host=hostname [--path=/pa/th/]\n";
- exit(1);
-} else {
- $_SERVER['HOSTNAME'] = $options['host'];
-}
+define( 'GS_MAIN', -2 );
+define( 'GS_TALK', -1 );
-$gs = new GenerateSitemap( $options['host'], $options['path'] );
+$gs = new GenerateSitemap( @$options['path'] );
$gs->main();
class GenerateSitemap {
- var $host;
- var $cutoff = 9000;
+ /**
+ * The number of entries to save in each sitemap file
+ *
+ * @var int
+ */
+ var $limit;
+
+ /**
+ * Key => value entries of namespaces and their priorities
+ *
+ * @var array
+ */
var $priorities = array(
// Custom main namespaces
- -2 => '0.5',
+ GS_MAIN => '0.5',
// Custom talk namesspaces
- -1 => '0.1',
+ GS_TALK => '0.1',
+ // MediaWiki standard namespaces
NS_MAIN => '1.0',
NS_TALK => '0.1',
NS_USER => '0.5',
NS_CATEGORY => '0.5',
NS_CATEGORY_TALK => '0.1',
);
+
+ /**
+ * A one-dimensional array of namespaces in the wiki
+ *
+ * @var array
+ */
var $namespaces = array();
+
+ /**
+ * A database slave object
+ *
+ * @var object
+ */
var $dbr;
- var $path, $file, $findex;
- var $stderr;
- function GenerateSitemap( $host, $path ) {
+ /**
+ * A resource pointing to the sitemap index file
+ *
+ * @var resource
+ */
+ var $findex;
+
+
+ /**
+ * A resource pointing to a sitemap file
+ *
+ * @var resource
+ */
+ var $file;
+
+ /**
+ * A resource pointing to php://stderr
+ *
+ * @var resource
+ */
+ var $stderr;
+
+ /**
+ * The path to prepend to the filenames of the generated files
+ *
+ * Declared explicitly because the constructor assigns it; the old
+ * combined "var $path, $file, $findex;" declaration is removed by
+ * this change.
+ *
+ * @var string
+ */
+ var $path;
+
+ /**
+ * Constructor
+ *
+ * Opens php://stderr for debug output, fetches a DB_SLAVE connection,
+ * collects the namespaces, computes the initial per-file limit for
+ * NS_MAIN and opens the sitemap index file for writing.
+ *
+ * @param string $path The path to prepend to the filenames, used to
+ * save them somewhere else than in the root directory
+ */
+ function GenerateSitemap( $path ) {
global $wgDBname;
$this->path = isset( $path ) ? $path : '';
$this->stderr = fopen( 'php://stderr', 'wt' );
- $this->host = $host;
$this->dbr =& wfGetDB( DB_SLAVE );
$this->generateNamespaces();
+ $this->generateLimit( NS_MAIN );
$this->findex = fopen( "{$this->path}sitemap-index-$wgDBname.xml", 'wb' );
}
+ /**
+ * Generate a one-dimensional array of existing namespaces
+ */
function generateNamespaces() {
$fname = 'GenerateSitemap::generateNamespaces';
$this->namespaces[] = $row->page_namespace;
}
+ /**
+ * Get the priority of a given namespace
+ *
+ * @param int $namespace The namespace to get the priority for
+ *
+ * @return string
+ */
+
function priority( $namespace ) {
return isset( $this->priorities[$namespace] ) ? $this->priorities[$namespace] : $this->guessPriority( $namespace );
}
+ /**
+ * If the namespace isn't listed on the priority list return the
+ * default priority for the namespace, varies depending on whether it's
+ * a talkpage or not.
+ *
+ * @param int $namespace The namespace to get the priority for
+ *
+ * @return string
+ */
function guessPriority( $namespace ) {
- return Namespace::isMain( $namespace ) ? $this->priorities[-2] : $this->priorities[-1];
+ return Namespace::isMain( $namespace ) ? $this->priorities[GS_MAIN] : $this->priorities[GS_TALK];
}
+ /**
+ * Return a database result resource for all the pages in a given namespace
+ *
+ * @param int $namespace Limit the query to this namespace
+ *
+ * @return resource
+ */
function getPageRes( $namespace ) {
$fname = 'GenerateSitemap::getPageRes';
);
}
+ /**
+ * Main loop
+ *
+ * @access public
+ */
function main() {
global $wgDBname;
$this->debug( $namespace );
while ( $row = $this->dbr->fetchObject( $res ) ) {
- if ( $i % $this->cutoff == 0 ) {
+ if ( $i % $this->limit === 0 ) {
if ( $this->file !== false ) {
gzwrite( $this->file, $this->closeFile() );
gzclose( $this->file );
}
- $filename = "{$this->path}sitemap-$wgDBname-NS_$namespace-$smcount.xml.gz";
+ $this->generateLimit( $namespace );
+ $filename = "sitemap-$wgDBname-NS_$namespace-$smcount.xml.gz";
++$smcount;
- $this->file = gzopen( $filename, 'wb' );
+ $this->file = gzopen( $this->path . $filename, 'wb' );
gzwrite( $this->file, $this->openFile() );
fwrite( $this->findex, $this->indexEntry( $filename ) );
$this->debug( "\t$filename" );
fclose( $this->findex );
}
+ /**
+ * Return the XML required to open an XML file
+ *
+ * @static
+ *
+ * @return string
+ */
function xmlHead() {
return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
}
+ /**
+ * Return the XML schema being used
+ *
+ * @static
+ *
+ * @return string
+ */
function xmlSchema() {
return 'http://www.google.com/schemas/sitemap/0.84';
}
+ /**
+ * Return the XML required to open a sitemap index file
+ *
+ * @return string
+ */
function openIndex() {
return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
}
+ /**
+ * Return the XML for a single sitemap indexfile entry
+ *
+ * The location is built from $wgServer and $wgScriptPath followed by
+ * the given filename.
+ *
+ * @static
+ *
+ * @param string $filename The filename of the sitemap file
+ *
+ * @return string
+ */
function indexEntry( $filename ) {
- global $wgServer;
+ global $wgServer, $wgScriptPath;
return
"\t<sitemap>\n" .
- "\t\t<loc>$wgServer/$filename</log>\n" .
+ "\t\t<loc>$wgServer$wgScriptPath/$filename</loc>\n" .
"\t</sitemap>\n";
}
+ /**
+ * Return the XML required to close a sitemap index file
+ *
+ * @static
+ *
+ * @return string
+ */
function closeIndex() {
return "</sitemapindex>\n";
}
-
+
+ /**
+ * Return the XML required to open a sitemap file
+ *
+ * @return string
+ */
function openFile() {
return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
}
-
+
+ /**
+ * Return the XML for a single sitemap entry
+ *
+ * @static
+ *
+ * @param string $url An RFC 2396 compliant URL
+ * @param string $date An ISO 8601 date
+ * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize
+ *
+ * @return string
+ */
function fileEntry( $url, $date, $priority ) {
return
"\t<url>\n" .
"\t</url>\n";
}
+ /**
+ * Return the XML required to close sitemap file
+ *
+ * @static
+ * @return string
+ */
function closeFile() {
return "</urlset>\n";
}
-
+
+ /**
+ * Write a string to stderr followed by a UNIX newline
+ */
function debug( $str ) {
fwrite( $this->stderr, "$str\n" );
}
+
+ /**
+ * Calculate how many urls may be written to a single sitemap file and
+ * store the result in $this->limit.
+ *
+ * The sitemap specification allows no more than 50,000 urls per file
+ * and also caps the uncompressed file size. This function budgets
+ * 2^20 bytes (1 MiB) per file and assumes every entry is as long as
+ * the worst case of 63 four byte characters and 1 three byte character
+ * in the title (63*4+1*3 = 255).
+ *
+ * @param int $namespace The namespace used to build the worst case title
+ */
+ function generateLimit( $namespace ) {
+ $title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );
+
+ $olen = strlen( $this->openFile() );
+ $elen = strlen( $this->fileEntry( $title->getFullUrl(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), '1.0' ) );
+ $clen = strlen( $this->closeFile() );
+
+ // Bytes left for entries once the opening and closing XML are
+ // accounted for
+ $budget = pow( 2, 20 ) - $olen - $clen;
+
+ // Number of worst case entries that fit in the budget; a zero
+ // length entry can never exhaust the budget, so only the url
+ // ceiling applies in that case
+ $fit = $elen > 0 ? (int)floor( $budget / $elen ) : 50000;
+
+ // Always allow at least one entry per file and never exceed the
+ // 50,000 urls per file ceiling
+ $this->limit = max( 1, min( 50000, $fit ) );
+ }
}
?>