* Renamed --path to --fspath
[lhc/web/wiklou.git] / maintenance / generateSitemap.php
1 <?php
// Pseudo-namespace ids used only as keys into GenerateSitemap::$priorities:
// they hold the fallback priority that guessPriority() returns for a
// namespace that has no explicit entry (GS_MAIN for "main" namespaces,
// GS_TALK for everything else).
// NOTE(review): -2 / -1 look like the ids MediaWiki reserves for
// NS_MEDIA / NS_SPECIAL -- confirm neither can ever reach priority().
define( 'GS_MAIN', -2 );
define( 'GS_TALK', -1 );
4 /**
5 * Creates a Google sitemap for the site
6 *
7 * @package MediaWiki
8 * @subpackage Maintenance
9 *
10 * @copyright Copyright © 2005, Ævar Arnfjörð Bjarmason
11 * @copyright Copyright © 2005, Jens Frank <jeluf@gmx.de>
12 * @copyright Copyright © 2005, Brion Vibber <brion@pobox.com>
13 *
14 * @link http://www.google.com/webmasters/sitemaps/docs/en/about.html
15 * @link http://www.google.com/schemas/sitemap/0.84/sitemap.xsd
16 *
17 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
18 */
19
class GenerateSitemap {
	/**
	 * The maximum number of URLs a single sitemap file may list
	 *
	 * @var int
	 */
	var $urlLimit = 50000;

	/**
	 * The maximum size of a single (uncompressed) sitemap file in bytes,
	 * 10MB
	 *
	 * @var int
	 */
	var $sizeLimit = 10485760;

	/**
	 * The path to prepend to the filename
	 *
	 * @var string
	 */
	var $fspath;

	/**
	 * The path to append to the domain name
	 *
	 * @var string
	 */
	var $path;

	/**
	 * Whether or not to use compression
	 *
	 * @var bool
	 */
	var $compress;

	/**
	 * The number of entries to save in each sitemap file, computed by
	 * generateLimit()
	 *
	 * @var int
	 */
	var $limit;

	/**
	 * Key => value entries of namespaces and their priorities
	 *
	 * @var array
	 */
	var $priorities = array(
		// Custom main namespaces
		GS_MAIN => '0.5',
		// Custom talk namespaces
		GS_TALK => '0.1',
		// MediaWiki standard namespaces
		NS_MAIN => '1.0',
		NS_TALK => '0.1',
		NS_USER => '0.5',
		NS_USER_TALK => '0.1',
		NS_PROJECT => '0.5',
		NS_PROJECT_TALK => '0.1',
		NS_IMAGE => '0.5',
		NS_IMAGE_TALK => '0.1',
		NS_MEDIAWIKI => '0.0',
		NS_MEDIAWIKI_TALK => '0.1',
		NS_TEMPLATE => '0.0',
		NS_TEMPLATE_TALK => '0.1',
		NS_HELP => '0.5',
		NS_HELP_TALK => '0.1',
		NS_CATEGORY => '0.5',
		NS_CATEGORY_TALK => '0.1',
	);

	/**
	 * A one-dimensional array of namespaces in the wiki
	 *
	 * @var array
	 */
	var $namespaces = array();

	/**
	 * When this sitemap batch was generated
	 *
	 * @var string
	 */
	var $startts;

	/**
	 * A database slave object
	 *
	 * @var object
	 */
	var $dbr;

	/**
	 * A resource pointing to the sitemap index file
	 *
	 * @var resource
	 */
	var $findex;

	/**
	 * A resource pointing to a sitemap file, or false while none is open
	 *
	 * @var resource
	 */
	var $file;

	/**
	 * A resource pointing to php://stderr
	 *
	 * @var resource
	 */
	var $stderr;

	/**
	 * Constructor
	 *
	 * Loads the list of namespaces, computes an initial per-file limit and
	 * opens the sitemap index file for writing; the script is aborted if
	 * the index file cannot be opened.
	 *
	 * @param string $fspath The path to prepend to the filenames, used to
	 *     save them somewhere else than in the root directory
	 * @param string $path The path to append to the domain name
	 * @param bool $compress Whether to compress the sitemap files
	 */
	function GenerateSitemap( $fspath, $path, $compress ) {
		global $wgDBname, $wgScriptPath;

		$this->fspath = isset( $fspath ) ? $fspath : '';
		$this->path = isset( $path ) ? $path : $wgScriptPath;
		$this->compress = $compress;

		$this->startts = wfTimestamp( TS_ISO_8601, wfTimestampNow() );

		$this->stderr = fopen( 'php://stderr', 'wt' );
		$this->dbr =& wfGetDB( DB_SLAVE );
		$this->generateNamespaces();
		$this->generateLimit( NS_MAIN );

		$indexFile = "{$this->fspath}sitemap-index-$wgDBname.xml";
		$this->findex = fopen( $indexFile, 'wb' );
		if ( $this->findex === false ) {
			// Without the index there is nothing useful left to do
			$this->fatal( "Unable to open $indexFile for writing" );
		}
	}

	/**
	 * Generate a one-dimensional array of existing namespaces
	 *
	 * Fills $this->namespaces with every distinct page_namespace value
	 * found in the page table, in ascending order.
	 */
	function generateNamespaces() {
		$fname = 'GenerateSitemap::generateNamespaces';

		$res = $this->dbr->select( 'page',
			array( 'page_namespace' ),
			array(),
			$fname,
			array(
				'GROUP BY' => 'page_namespace',
				'ORDER BY' => 'page_namespace',
			)
		);

		while ( $row = $this->dbr->fetchObject( $res ) ) {
			$this->namespaces[] = $row->page_namespace;
		}
	}

	/**
	 * Get the priority of a given namespace
	 *
	 * @param int $namespace The namespace to get the priority for
	 *
	 * @return string
	 */
	function priority( $namespace ) {
		if ( isset( $this->priorities[$namespace] ) ) {
			return $this->priorities[$namespace];
		}
		return $this->guessPriority( $namespace );
	}

	/**
	 * If the namespace isn't listed on the priority list return the
	 * default priority for the namespace, varies depending on whether it's
	 * a talkpage or not.
	 *
	 * @param int $namespace The namespace to get the priority for
	 *
	 * @return string
	 */
	function guessPriority( $namespace ) {
		// Dispatched by name: `Namespace` is a reserved word from PHP 5.3
		// on, so a literal Namespace::isMain() call would keep this whole
		// file from parsing there. The method invoked is the same.
		$isMain = call_user_func( array( 'Namespace', 'isMain' ), $namespace );

		return $isMain ? $this->priorities[GS_MAIN] : $this->priorities[GS_TALK];
	}

	/**
	 * Return a database result of all the pages in a given namespace
	 *
	 * @param int $namespace Limit the query to this namespace
	 *
	 * @return resource
	 */
	function getPageRes( $namespace ) {
		$fname = 'GenerateSitemap::getPageRes';

		return $this->dbr->select( 'page',
			array(
				'page_namespace',
				'page_title',
				'page_is_redirect',
				'page_touched',
			),
			array( 'page_namespace' => $namespace ),
			$fname
		);
	}

	/**
	 * Main loop
	 *
	 * Writes the sitemap index and, for every namespace, one or more
	 * sitemap files holding at most $this->limit entries each.
	 *
	 * @access public
	 */
	function main() {
		fwrite( $this->findex, $this->openIndex() );

		foreach ( $this->namespaces as $namespace ) {
			$res = $this->getPageRes( $namespace );
			$this->file = false;
			$i = $smcount = 0;

			$this->debug( $namespace );

			// The limit only depends on the namespace, so work it out once
			// here rather than on every file rollover.
			$this->generateLimit( $namespace );

			while ( $row = $this->dbr->fetchObject( $res ) ) {
				if ( $i % $this->limit === 0 ) {
					// Current file is full (or none is open yet): roll over
					if ( $this->file !== false ) {
						$this->write( $this->file, $this->closeFile() );
						$this->close( $this->file );
					}
					$filename = $this->sitemapFilename( $namespace, $smcount++ );
					$this->file = $this->open( $this->fspath . $filename, 'wb' );
					if ( $this->file === false ) {
						$this->fatal( "Unable to open {$this->fspath}$filename for writing" );
					}
					$this->write( $this->file, $this->openFile() );
					fwrite( $this->findex, $this->indexEntry( $filename ) );
					$this->debug( "\t$filename" );
				}
				++$i;
				$title = Title::makeTitle( $row->page_namespace, $row->page_title );
				$date = wfTimestamp( TS_ISO_8601, $row->page_touched );
				$this->write( $this->file, $this->fileEntry( $title->getFullURL(), $date, $this->priority( $namespace ) ) );
			}

			if ( $this->file !== false ) {
				$this->write( $this->file, $this->closeFile() );
				$this->close( $this->file );
			}
		}

		fwrite( $this->findex, $this->closeIndex() );
		fclose( $this->findex );
	}

	/**
	 * gzopen() / fopen() wrapper
	 *
	 * @param string $file The file to open
	 * @param string $flags The mode passed on to gzopen() / fopen()
	 *
	 * @return resource
	 */
	function open( $file, $flags ) {
		return $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags );
	}

	/**
	 * gzwrite() / fwrite() wrapper
	 *
	 * @param resource $handle A handle returned by open()
	 * @param string $str The data to write
	 */
	function write( &$handle, $str ) {
		if ( $this->compress ) {
			gzwrite( $handle, $str );
		} else {
			fwrite( $handle, $str );
		}
	}

	/**
	 * gzclose() / fclose() wrapper
	 *
	 * @param resource $handle A handle returned by open()
	 */
	function close( &$handle ) {
		if ( $this->compress ) {
			gzclose( $handle );
		} else {
			fclose( $handle );
		}
	}

	/**
	 * Get a sitemap filename
	 *
	 * @param int $namespace The namespace
	 * @param int $count The count
	 *
	 * @return string
	 */
	function sitemapFilename( $namespace, $count ) {
		global $wgDBname;

		$ext = $this->compress ? '.gz' : '';

		return "sitemap-$wgDBname-NS_$namespace-$count.xml$ext";
	}

	/**
	 * Return the XML required to open an XML file
	 *
	 * @static
	 *
	 * @return string
	 */
	function xmlHead() {
		return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
	}

	/**
	 * Return the XML schema being used
	 *
	 * @static
	 *
	 * @return string
	 */
	function xmlSchema() {
		return 'http://www.google.com/schemas/sitemap/0.84';
	}

	/**
	 * Return the XML required to open a sitemap index file
	 *
	 * @return string
	 */
	function openIndex() {
		return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the XML for a single sitemap indexfile entry
	 *
	 * @param string $filename The filename of the sitemap file
	 *
	 * @return string
	 */
	function indexEntry( $filename ) {
		global $wgServer;

		// Entity-escape the URL so that it is always valid XML text
		$loc = htmlspecialchars( "$wgServer{$this->path}/$filename" );

		return
			"\t<sitemap>\n" .
			"\t\t<loc>$loc</loc>\n" .
			"\t\t<lastmod>{$this->startts}</lastmod>\n" .
			"\t</sitemap>\n";
	}

	/**
	 * Return the XML required to close a sitemap index file
	 *
	 * @static
	 *
	 * @return string
	 */
	function closeIndex() {
		return "</sitemapindex>\n";
	}

	/**
	 * Return the XML required to open a sitemap file
	 *
	 * @return string
	 */
	function openFile() {
		return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the XML for a single sitemap entry
	 *
	 * @static
	 *
	 * @param string $url An RFC 2396 compliant URL (unescaped; it is
	 *     entity-escaped here)
	 * @param string $date A ISO 8601 date
	 * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize
	 *
	 * @return string
	 */
	function fileEntry( $url, $date, $priority ) {
		// URLs may carry characters such as ampersands that must not
		// appear raw inside an XML element
		$loc = htmlspecialchars( $url );

		return
			"\t<url>\n" .
			"\t\t<loc>$loc</loc>\n" .
			"\t\t<lastmod>$date</lastmod>\n" .
			"\t\t<priority>$priority</priority>\n" .
			"\t</url>\n";
	}

	/**
	 * Return the XML required to close sitemap file
	 *
	 * @static
	 *
	 * @return string
	 */
	function closeFile() {
		return "</urlset>\n";
	}

	/**
	 * Write a string to stderr followed by a UNIX newline
	 *
	 * @param string $str The message
	 */
	function debug( $str ) {
		fwrite( $this->stderr, "$str\n" );
	}

	/**
	 * Report an unrecoverable error on stderr and stop the script with a
	 * non-zero exit status
	 *
	 * @param string $message The error message
	 */
	function fatal( $message ) {
		$this->debug( $message );
		exit( 1 );
	}

	/**
	 * According to the sitemap specification each sitemap must contain no
	 * more than 50,000 urls and no more than 10MB (10 * 2^20 bytes), this
	 * function calculates how many urls we can have in each file assuming
	 * that we have the worst case of 63 four byte characters and 1 three
	 * byte character in the title (63*4+1*3 = 255)
	 *
	 * The result is stored in $this->limit and is always at least 1.
	 *
	 * @param int $namespace The namespace the worst-case title is built in
	 */
	function generateLimit( $namespace ) {
		$title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );

		$olen = strlen( $this->openFile() );
		$elen = strlen( $this->fileEntry( $title->getFullURL(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), '1.0' ) );
		$clen = strlen( $this->closeFile() );

		// Number of worst-case entries that fit between the opening and
		// closing markup without exceeding the size limit
		$fit = (int)floor( ( $this->sizeLimit - $olen - $clen ) / $elen );

		$this->limit = max( 1, min( $this->urlLimit, $fit ) );
	}
}
429
// Command-line entry point.
// --help is answered before the MediaWiki environment is loaded.
if ( in_array( '--help', $argv ) ) {
	die( implode( "\n", array(
		"Usage: php generateSitemap.php [host] [options]",
		"\thost = hostname",
		"\toptions:",
		"\t\t--help\tshow this message",
		"\t\t--fspath\tThe file system path to save to, e.g /tmp/sitemap/",
		"\t\t--path\tThe http path to use, e.g. /wiki",
		"\t\t--compress=[yes|no]\tcompress the sitemap files, default yes",
	) ) . "\n" );
}

// A first argument that is not an option is taken as the host name; it has
// to be in place before commandLine.inc is included.
if ( isset( $argv[1] ) && strpos( $argv[1], '--' ) !== 0 ) {
	$_SERVER['SERVER_NAME'] = $argv[1];
}

// Options that take a value; must be set before commandLine.inc is included.
$optionsWithArgs = array( 'fspath', 'path', 'compress' );
require_once 'commandLine.inc';

// Options that were not given are passed on as null; compression stays on
// unless --compress=no was given explicitly.
$sitemapFsPath = isset( $options['fspath'] ) ? $options['fspath'] : null;
$sitemapUrlPath = isset( $options['path'] ) ? $options['path'] : null;
$sitemapCompress = !isset( $options['compress'] ) || $options['compress'] !== 'no';

$gs = new GenerateSitemap( $sitemapFsPath, $sitemapUrlPath, $sitemapCompress );
$gs->main();
449 ?>