From: Ævar Arnfjörð Bjarmason Date: Thu, 3 Nov 2005 04:23:02 +0000 (+0000) Subject: * Fixed regression: It's now possible to specify a custom hostname again X-Git-Tag: 1.6.0~1234 X-Git-Url: https://git.cyclocoop.org/%7B%24www_url%7Dadmin/compta/banques/?a=commitdiff_plain;h=966696d382000c22b8be028b9846ec274316ab47;p=lhc%2Fweb%2Fwiklou.git * Fixed regression: It's now possible to specify a custom hostname again * Added magic code to make sure that the size of the sitemap file never exceeds that allowed by the standard * Documented every function * Made --path really work --- diff --git a/maintenance/generateSitemap.php b/maintenance/generateSitemap.php index b8328085d7..2e933d898a 100644 --- a/maintenance/generateSitemap.php +++ b/maintenance/generateSitemap.php @@ -15,28 +15,38 @@ * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later */ -$optionsWithArgs = array( 'host', 'path' ); +if ( isset( $argv[1] ) ) + $_SERVER['SERVER_NAME'] = $argv[1]; + +$optionsWithArgs = array( 'path' ); /* */ require_once 'commandLine.inc'; -if ( ! 
isset( $options['host'] ) ) { - echo "Usage: php generateSitemap.php --host=hostname [--path=/pa/th/]\n"; - exit(1); -} else { - $_SERVER['HOSTNAME'] = $options['host']; -} +define( 'GS_MAIN', -2 ); +define( 'GS_TALK', -1 ); -$gs = new GenerateSitemap( $options['host'], $options['path'] ); +$gs = new GenerateSitemap( @$options['path'] ); $gs->main(); class GenerateSitemap { - var $host; - var $cutoff = 9000; + /** + * The number of entries to save in each sitemap file + * + * @var int + */ + var $limit; + + /** + * Key => value entries of namespaces and their priorities + * + * @var array + */ var $priorities = array( // Custom main namespaces - -2 => '0.5', + GS_MAIN => '0.5', // Custom talk namesspaces - -1 => '0.1', + GS_TALK => '0.1', + // MediaWiki standard namespaces NS_MAIN => '1.0', NS_TALK => '0.1', NS_USER => '0.5', @@ -54,23 +64,64 @@ class GenerateSitemap { NS_CATEGORY => '0.5', NS_CATEGORY_TALK => '0.1', ); + + /** + * A one-dimensional array of namespaces in the wiki + * + * @var array + */ var $namespaces = array(); + + /** + * A database slave object + * + * @var object + */ var $dbr; - var $path, $file, $findex; - var $stderr; - function GenerateSitemap( $host, $path ) { + /** + * A resource pointing to the sitemap index file + * + * @var resource + */ + var $findex; + + + /** + * A resource pointing to a sitemap file + * + * @var resource + */ + var $file; + + /** + * A resource pointing to php://stderr + * + * @var resource + */ + var $stderr; + + /** + * Constructor + * + * @param string $path The path to prepend to the filenames, used to + * save them somewhere else than in the root directory + */ + function GenerateSitemap( $path ) { global $wgDBname; $this->path = isset( $path ) ? 
$path : ''; $this->stderr = fopen( 'php://stderr', 'wt' ); - $this->host = $host; $this->dbr =& wfGetDB( DB_SLAVE ); $this->generateNamespaces(); + $this->generateLimit( NS_MAIN ); $this->findex = fopen( "{$this->path}sitemap-index-$wgDBname.xml", 'wb' ); } + /** + * Generate a one-dimensional array of existing namespaces + */ function generateNamespaces() { $fname = 'GenerateSitemap::generateNamespaces'; @@ -88,14 +139,38 @@ class GenerateSitemap { $this->namespaces[] = $row->page_namespace; } + /** + * Get the priority of a given namespace + * + * @param int $namespace The namespace to get the priority for + + + * @return string + */ + function priority( $namespace ) { return isset( $this->priorities[$namespace] ) ? $this->priorities[$namespace] : $this->guessPriority( $namespace ); } + /** + * If the namespace isn't listed on the priority list return the + * default priority for the namespace, varies depending on whether it's + * a talkpage or not. + * + * @param int $namespace The namespace to get the priority for + * + * @return string + */ function guessPriority( $namespace ) { - return Namespace::isMain( $namespace ) ? $this->priorities[-2] : $this->priorities[-1]; + return Namespace::isMain( $namespace ) ? 
$this->priorities[GS_MAIN] : $this->priorities[GS_TALK]; } + /** + * Return a database result of all the pages in a given namespace + * + * @param int $namespace Limit the query to this namespace + * + * @return resource + */ function getPageRes( $namespace ) { $fname = 'GenerateSitemap::getPageRes'; @@ -111,6 +186,11 @@ class GenerateSitemap { ); } + /** + * Main loop + * + * @access public + */ function main() { global $wgDBname; @@ -123,14 +203,15 @@ class GenerateSitemap { $this->debug( $namespace ); while ( $row = $this->dbr->fetchObject( $res ) ) { - if ( $i % $this->cutoff == 0 ) { + if ( $i % $this->limit === 0 ) { if ( $this->file !== false ) { gzwrite( $this->file, $this->closeFile() ); gzclose( $this->file ); } - $filename = "{$this->path}sitemap-$wgDBname-NS_$namespace-$smcount.xml.gz"; + $this->generateLimit( $namespace ); + $filename = "sitemap-$wgDBname-NS_$namespace-$smcount.xml.gz"; ++$smcount; - $this->file = gzopen( $filename, 'wb' ); + $this->file = gzopen( $this->path . $filename, 'wb' ); gzwrite( $this->file, $this->openFile() ); fwrite( $this->findex, $this->indexEntry( $filename ) ); $this->debug( "\t$filename" ); @@ -149,35 +230,87 @@ class GenerateSitemap { fclose( $this->findex ); } + /** + * Return the XML required to open an XML file + * + * @static + * + * @return string + */ function xmlHead() { return '' . "\n"; } + /** + * Return the XML schema being used + * + * @static + * + * @return string + */ function xmlSchema() { return 'http://www.google.com/schemas/sitemap/0.84'; } + /** + * Return the XML required to open a sitemap index file + * + * @return string + */ function openIndex() { return $this->xmlHead() . '' . "\n"; } + /** + * Return the XML for a single sitemap index file entry + * + * @static + * + * @param string $filename The filename of the sitemap file + * + * @return string + */ function indexEntry( $filename ) { - global $wgServer; + global $wgServer, $wgScriptPath; return "\t\n" . - "\t\t$wgServer/$filename\n" . 
+ "\t\t$wgServer$wgScriptPath/$filename\n" . "\t\n"; } + /** + * Return the XML required to close a sitemap index file + * + * @static + * + * @return string + */ function closeIndex() { return "\n"; } - + + /** + * Return the XML required to open a sitemap file + * + * @return string + */ function openFile() { return $this->xmlHead() . '' . "\n"; } - + + /** + * Return the XML for a single sitemap entry + * + * @static + * + * @param string $url An RFC 2396 compliant URL + * @param string $date An ISO 8601 date + * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize + * + * + * @return string + */ function fileEntry( $url, $date, $priority ) { return "\t\n" . @@ -187,13 +320,42 @@ class GenerateSitemap { "\t\n"; } + /** + * Return the XML required to close a sitemap file + * + * @static + * @return string + */ function closeFile() { return "\n"; } - + + /** + * Write a string to stderr followed by a UNIX newline + */ function debug( $str ) { fwrite( $this->stderr, "$str\n" ); } + + /** + * According to the sitemap specification each sitemap must contain no + * more than 50,000 urls and no more than 10MB (10 * 2^20 bytes); the loop + * below uses a stricter 2^20 byte (1MB) cap. This function calculates how + * many urls fit in each file assuming the worst case of 63 four byte and + * 1 three byte character in the title (63*4+1*3 = 255 bytes) + */ + function generateLimit( $namespace ) { + $title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" ); + + $olen = strlen( $this->openFile() ); + $elen = strlen( $this->fileEntry( $title->getFullUrl(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), '1.0' ) ); + $clen = strlen( $this->closeFile() ); + + for ( $i = 1, $etot = $elen; ( $olen + $clen + $etot + $elen ) <= pow( 2, 20 ); ++$i ) + $etot += $elen; + + $this->limit = $i; + } } ?>