PHPDoc comments and placeholder. Part of the "maintenance" subpackage; archived/deprecated code.
[lhc/web/wiklou.git] / maintenance / rebuildlinks.inc
<?php
/**
 * Functions for rebuilding the link tracking tables; must
 * be included within a script that also includes the Setup.
 * See @see rebuildlinks.php, for example.
 *
 * @deprecated
 * @todo document
 * @package MediaWiki
 * @subpackage Maintenance
 */

/** */
# Hard stop: this code predates the current database schema.  Everything
# below this line (rebuildLinkTables() and the helper classes) is dead
# code until this die() is removed after the schema update.
die( "rebuildLinks.inc needs to be updated for the new schema\n" );

#

# Buffer this many rows before inserting them all in one sweep. More
# than about 1000 will probably not increase speed significantly on
# most setups.
/* private */ $rowbuf_size = 1000; // 1000 rows ~40 kB
22
/**
 * Rebuild the links, brokenlinks and imagelinks tables from scratch by
 * scanning the wikitext of every row in cur.
 *
 * WARNING: destructive and slow.  It DELETEs all existing rows in the
 * three link tables, locks the tables for the whole run, and gives the
 * operator only a 15-second grace period to abort before starting.
 *
 * Uses globals: $wgLang (namespace names), $rowbuf_size (insert batch
 * size).  $wgLinkCache is declared global but never referenced below --
 * NOTE(review): looks vestigial; confirm before removing.
 *
 * @deprecated unreachable while the die() at the top of this file stands
 */
function rebuildLinkTables()
{
	error_reporting (E_ALL);
	global $wgLang, $wgLinkCache, $rowbuf_size;

	print "This script may take several hours to complete. If you abort during that time,\n";
	print "your wiki will be in an inconsistent state. If you are going to abort, this is\n";
	print "the time to do it.\n\n";
	print "Press control-c to abort (will proceed automatically in 15 seconds)\n";
	sleep(15);

	$count = 0;
	print "Rebuilding link tables.\n";

	print "Setting AUTOCOMMIT=1\n";
	wfQuery("SET SESSION AUTOCOMMIT=1", DB_MASTER);

	# Narrow temporary copy of cur with an index on (namespace, title),
	# so the per-page existence lookups below don't scan full rows.
	print "Extracting often used data from cur (may take a few minutes)\n";
	$sql = "CREATE TEMPORARY TABLE cur_fast SELECT cur_namespace, cur_title, cur_id FROM cur";
	wfQuery( $sql, DB_MASTER );
	$sql = "ALTER TABLE cur_fast ADD INDEX(cur_namespace, cur_title)";
	wfQuery( $sql, DB_MASTER );

	print "Locking tables\n";
	$sql = "LOCK TABLES cur READ, cur_fast READ, interwiki READ, user_newtalk READ, " .
		"links WRITE, brokenlinks WRITE, imagelinks WRITE";
	wfQuery( $sql, DB_MASTER );


	# Wipe the three link tables; they are repopulated below.
	print "Deleting old data in links table.\n";
	$sql = "DELETE FROM links";
	wfQuery( $sql, DB_MASTER );

	print "Deleting old data in brokenlinks table.\n";
	$sql = "DELETE FROM brokenlinks";
	wfQuery( $sql, DB_MASTER );

	print "Deleting old data in imagelinks table.\n";
	$sql = "DELETE FROM imagelinks";
	wfQuery( $sql, DB_MASTER );

	print "Finding number of articles to process... ";
	$sql = "SELECT COUNT(*) as count FROM cur";
	$res = wfQuery( $sql, DB_SLAVE );
	$obj = wfFetchObject( $res );
	$total = $obj->count;
	print "$total\n";

	# Actually fetches both the lowest and highest cur_id; they bound
	# the id ranges the SelectPulser iterates over.
	print "Finding highest article id\n";
	$sql = "SELECT MIN(cur_id) AS min, MAX(cur_id) AS max FROM cur";
	$res = wfQuery( $sql, DB_SLAVE );
	$obj = wfFetchObject( $res );

	# Stream cur rows 100 ids at a time instead of buffering the table.
	$cur_pulser = new SelectPulser("SELECT cur_id,cur_namespace,cur_title,cur_text " .
		"FROM cur WHERE cur_id ",
		$obj->min, $obj->max, 100);

	# One batching insert buffer per destination table ($rowbuf_size
	# rows per INSERT statement).
	$brokenlinks_inserter = new InsertBuffer(
		"INSERT IGNORE INTO brokenlinks (bl_from,bl_to) VALUES " , $rowbuf_size);

	$links_inserter = new InsertBuffer(
		"INSERT IGNORE INTO links (l_from,l_to) VALUES ", $rowbuf_size);

	$imagelinks_inserter = new InsertBuffer("INSERT IGNORE INTO imagelinks ".
		"(il_from,il_to) VALUES ", $rowbuf_size);

	print "Starting processing\n";

	# Localized image-namespace prefix ("Image:" etc.); $inslen includes
	# the trailing colon.  Used below to route links to imagelinks.
	$ins = $wgLang->getNsText( Namespace::getImage() );
	$inslen = strlen($ins)+1;

	$tc = Title::legalChars();

	# link text -> Title object cache, shared across pages.
	$titleCache = new MRUCache( 10000 );
	$titlecount = 0;
	$start_time = time();

	while ( $row = $cur_pulser->next() ) {

		$from_id = intval($row->cur_id);
		$ns = $wgLang->getNsText( $row->cur_namespace );
		$from_full_title = $row->cur_title;
		if ( "" != $ns ) {
			$from_full_title = "$ns:{$from_full_title}";
		}
		$from_full_title_with_slashes = addslashes( $from_full_title );
		$text = $row->cur_text;

		# Extract every [[link]] / [[link|label]] target from the page text.
		$numlinks = preg_match_all( "/\\[\\[([{$tc}]+)(]|\\|)/", $text,
			$m, PREG_PATTERN_ORDER );

		$seen_dbtitles = array(); // seen links (normalized and with ns, see below)
		$titles_ready_for_insertion = array();
		$titles_needing_curdata = array();      // titles whose article id must be looked up
		$titles_needing_curdata_pos = array();  // dbkey.ns -> index into the array above
		$links_corresponding_to_titles = array();

		for ( $i = 0 ; $i < $numlinks; ++$i ) {
			$link = $m[1][$i];
			if( preg_match( '/^(http|https|ftp|mailto|news):/', $m[1][$i] ) ) {
				# an URL link; not for us!
				continue;
			}

			# FIXME: Handle subpage links
			$nt = $titleCache->get( $link );
			if( $nt != false ){
				// Only process each unique link once per page
				// NOTE(review): dbkey.namespace concatenation can collide
				// (e.g. "Foo1"+ns 2 vs "Foo"+ns 12) -- worth a separator.
				$nt_key = $nt->getDBkey() . $nt->getNamespace();
				if( isset( $seen_dbtitles[$nt_key] ) )
					continue;
				$seen_dbtitles[$nt_key] = 1;

				$titles_ready_for_insertion[] = $nt;
			} else {
				$nt = Title::newFromText( $link );
				if (! $nt) {
					// Invalid link, probably something like "[[ ]]"
					continue;
				}

				// Only process each unique link once per page
				$nt_key = $nt->getDBkey() . $nt->getNamespace();
				if( isset( $seen_dbtitles[$nt_key] ) )
					continue;
				$seen_dbtitles[$nt_key] = 1;

				if( $nt->getInterwiki() != "" ) {
					# Interwiki links are not stored in the link tables
					continue;
				}
				if( $nt->getNamespace() == Namespace::getSpecial() ) {
					# Special links not stored in link tables
					continue;
				}
				if( $nt->getNamespace() == Namespace::getMedia() ) {
					# treat media: links as image: links
					$nt = Title::makeTitle( Namespace::getImage(), $nt->getDBkey() );
				}
				$nt->mArticleID = 0; // assume broken link until proven otherwise

				# Queue for the batched cur_fast lookup below.
				$pos = array_push($titles_needing_curdata, $nt) - 1;
				$titles_needing_curdata_pos[$nt->getDBkey() . $nt->getNamespace()] = $pos;
				$links_corresponding_to_titles[] = $link;
				unset( $link ); // useless outside this loop, but tempting
			}
		}


		# Resolve article ids for all uncached titles in one query, then
		# feed the results into the title cache.
		if ( count( $titles_needing_curdata ) > 0 ){
			$parts = array();
			foreach ($titles_needing_curdata as $nt ) {
				$parts[] = " (cur_namespace = " . $nt->getNamespace() . " AND " .
					"cur_title='" . wfStrencode( $nt->getDBkey() ) . "')";
			}
			$sql = "SELECT cur_namespace, cur_title, cur_id FROM cur_fast WHERE " .
				implode(" OR ", $parts);
			$res = wfQuery( $sql, DB_MASTER );
			while($row = wfFetchObject( $res ) ){
				$pos = $titles_needing_curdata_pos[$row->cur_title . $row->cur_namespace];
				$titles_needing_curdata[$pos]->mArticleID = intval($row->cur_id);
			}
			for( $k = 0; $k < count( $titles_needing_curdata ) ; $k++) {
				$tmplink = $links_corresponding_to_titles[$k];
				$titleCache->set( $tmplink, $titles_needing_curdata[$k] );
				$titles_ready_for_insertion[] = $titles_needing_curdata[$k];
			}
		}

		# Route each resolved title: image links -> imagelinks, missing
		# pages (id 0) -> brokenlinks, existing pages -> links.
		foreach ( $titles_ready_for_insertion as $nt ) {
			$dest_noslashes = $nt->getPrefixedDBkey();
			$dest = addslashes( $dest_noslashes );
			$dest_id = $nt->getArticleID();
			$from = $from_full_title_with_slashes;

			# print "\nLINK '$from_full_title' ($from_id) -> '$dest' ($dest_id)\n";

			if ( 0 == strncmp( "$ins:", $dest_noslashes, $inslen ) ) {
				$iname = addslashes( substr( $dest_noslashes, $inslen ) );
				$imagelinks_inserter->insert( "('{$from}','{$iname}')" );
			} else if ( 0 == $dest_id ) {
				$brokenlinks_inserter->insert( "({$from_id},'{$dest}')" );
			} else {
				$links_inserter->insert( "('{$from}',{$dest_id})" );
			}
			$titlecount++;
		}

		# Progress output: a dot every 20 pages, stats every 1000.
		if ( ( $count % 20 ) == 0 )
			print ".";

		if ( ( ++$count % 1000 ) == 0 ) {
			$dt = time() - $start_time;
			$start_time = time();
			$rps = persec(1000, $dt);
			$tps = persec($titlecount, $dt);
			$titlecount = 0;
			print "\n$count of $total articles scanned ({$rps} articles ".
				"and {$tps} titles per second)\n";
			print "Title cache hits: " . $titleCache->getPerformance() . "%\n";

		}

	}

	# Push any rows still sitting in the insert buffers.
	print "\nFlushing insertion buffers...";
	$imagelinks_inserter->flush();
	$links_inserter->flush();
	$brokenlinks_inserter->flush();
	print "ok\n";

	print "$count articles scanned.\n";

	$sql = "UNLOCK TABLES";
	wfQuery( $sql, DB_MASTER );
	print "Done\n";
}
240
/**
 * Format a rate of $n events over $t seconds for progress output.
 * Returns the integer events-per-second, or the strings "zero" when
 * nothing happened and "lots of" when no time elapsed.
 */
/* private */ function persec($n, $t){
	if ($n == 0) {
		return "zero";
	}
	return ($t == 0) ? "lots of" : intval($n / $t);
}
248
249 /**
250 * InsertBuffer increases performance slightly by inserting many rows
251 * at once. The gain is small (<5%) when running against a local, idle
252 * database, but may be significant in other circumstances. It also
253 * limits the number of inserted rows uppwards, which should avoid
254 * problems with huge articles and certain mysql settings that limits
255 * the size of queries. It's also convenient.
256 *
257 * @deprecated
258 * @package MediaWiki
259 * @subpackage Maintenance
260 */
class InsertBuffer {
	/* private */ var $mBuf, $mSql, $mBufcount, $mMaxsize;

	/**
	 * Fixed: renamed PHP 4-style constructor (function InsertBuffer) to
	 * __construct -- old-style constructors are removed in PHP 8, so the
	 * object was otherwise left uninitialized by "new InsertBuffer(...)".
	 *
	 * @param $sql     Statement prefix, e.g. "INSERT IGNORE INTO t (a,b) VALUES "
	 * @param $bufsize Maximum buffered rows before an automatic flush
	 */
	function __construct( $sql, $bufsize ){
		$this->mSql = $sql;
		$this->mBuf = array();
		$this->mBufcount = 0;
		$this->mMaxsize = $bufsize;
	}

	/**
	 * Queue one "(v1,v2,...)" tuple; flushes automatically once the
	 * buffer exceeds the configured maximum size.
	 */
	function insert( $value ){
		// print $this->mSql . " -> " . $value . "\n";
		$this->mBuf[] = $value;
		$this->mBufcount++;
		if($this->mBufcount > $this->mMaxsize){
			$this->flush();
		}
	}

	/**
	 * Write all buffered tuples as a single multi-row INSERT and reset
	 * the buffer.  No-op when the buffer is empty.
	 */
	function flush(){
		if( $this->mBufcount > 0 ){
			$sql = $this->mSql . implode(",", $this->mBuf);
			wfQuery( $sql, DB_MASTER );
			$this->mBuf = array();
			$this->mBufcount = 0;
			// print "Wrote query of size " . strlen( $sql ) . "\n";
		}
	}

}
291
292 /**
293 * Select parts from a large table by using the "BETWEEN X AND Y"
294 * operator on the id column. Avoids buffering the whole thing in
295 * RAM. It's also convenient.
296 *
297 * @deprecated
298 * @package MediaWiki
299 * @subpackage Maintenance
300 */
class SelectPulser {
	/* private */ var $mSql, $mSetsize, $mPos, $mMax, $mSet;

	/**
	 * Fixed: renamed PHP 4-style constructor (function SelectPulser) to
	 * __construct -- old-style constructors are removed in PHP 8, so the
	 * object was otherwise left uninitialized by "new SelectPulser(...)".
	 *
	 * @param $sql     Query fragment ending just before the id column
	 *                 condition, e.g. "SELECT ... WHERE cur_id "
	 * @param $min     Lowest id to fetch (inclusive)
	 * @param $max     Highest id to fetch (inclusive)
	 * @param $setsize Width of each BETWEEN range
	 */
	function __construct( $sql, $min, $max, $setsize) {
		$this->mSql = $sql;
		$this->mSet = array();
		$this->mPos = $min;
		$this->mMax = $max;
		$this->mSetsize = $setsize;
	}

	/**
	 * Return the next row object, fetching the next id range on demand.
	 * Returns false once the whole [min, max] range is exhausted.
	 */
	function next(){
		$result = current( $this->mSet );
		next( $this->mSet );
		if( false !== $result ){
			return $result;
		}
		// Current set exhausted: pull id ranges until one yields rows
		// (gaps in the id sequence can make a range come back empty).
		while( $this->mPos <= $this->mMax ){
			$this->mSet = array();
			$sql = $this->mSql . " BETWEEN " . $this->mPos .
				" AND " . ($this->mPos + $this->mSetsize - 1);
			$this->mPos += $this->mSetsize;

			$res = wfQuery( $sql, DB_SLAVE );
			while ( $row = wfFetchObject( $res ) ) {
				$this->mSet[] = $row;
			}
			wfFreeResult( $res );
			if( count( $this->mSet ) > 0 ){
				return $this->next();
			}
		}
		return false;
	}
}
336
337 /**
338 * A simple MRU for general cacheing.
339 * @deprecated
340 * @todo document
341 * @package MediaWiki
342 * @subpackage Maintenance
343 */
class MRUCache {
	/* private */ var $mMru, $mCache, $mSize, $mPurgefreq, $nexti;
	/* private */ var $hits, $misses;

	/**
	 * Fixed: renamed PHP 4-style constructor (function MRUCache) to
	 * __construct -- old-style constructors are removed in PHP 8, so the
	 * object was otherwise left uninitialized by "new MRUCache(...)".
	 * Also initializes $hits/$misses, which were previously incremented
	 * from an undefined (null) starting value.
	 *
	 * @param $size      Maximum number of cached entries kept after a purge
	 * @param $purgefreq Purge every this many set() calls; -1 means 1/10 of $size
	 */
	function __construct( $size, $purgefreq = -1 ) {
		// purgefreq is 1/10 of $size if not stated
		$purgefreq = ($purgefreq == -1 ? intval($size/10) : $purgefreq);
		$purgefreq = ($purgefreq <= 0 ? 1 : $purgefreq);

		$this->mSize = $size;
		$this->mMru = array();
		$this->mCache = array();
		$this->mPurgefreq = $purgefreq;
		$this->nexti = 1;
		$this->hits = 0;
		$this->misses = 0;
		print "purgefreq = " . $this->mPurgefreq . "\n";
	}

	/**
	 * Look up $key; returns the cached value (and refreshes its recency
	 * stamp) or false on a miss.  Hit/miss counters feed getPerformance().
	 */
	function get( $key ){
		if ( ! array_key_exists( $key, $this->mCache) ){
			$this->misses++;
			return false;
		}
		$this->hits++;
		$this->mMru[$key] = $this->nexti++;
		return $this->mCache[$key];
	}

	/**
	 * Store $value under $key and mark it most-recently-used.  Every
	 * mPurgefreq insertions the cache is trimmed back to mSize entries.
	 */
	function set( $key, $value ){
		$this->mMru[$key] = $this->nexti++;
		$this->mCache[$key] = $value;

		if($this->nexti % $this->mPurgefreq == 0)
			$this->purge();
	}

	/**
	 * Evict the least-recently-used entries until only mSize remain.
	 */
	function purge(){
		$to_remove = count( $this->mMru ) - $this->mSize;
		if( $to_remove <= 0 ){
			return;
		}
		// Oldest recency stamps sort first; splice them off and drop
		// the matching cache entries.
		asort( $this->mMru );
		$removed = array_splice( $this->mMru, 0, $to_remove );
		foreach( array_keys( $removed ) as $key ){
			unset( $this->mCache[$key] );
		}
	}

	/**
	 * Hit rate as an integer percentage (0 when nothing was looked up).
	 */
	function getPerformance(){
		$tot = $this->hits + $this->misses;
		if($tot > 0)
			return intval(100.0 * $this->hits / $tot);
		else
			return 0;
	}
}
399
400 ?>