Fix namespace bug resulting in serious data corruption; dump an obsolete global,...
[lhc/web/wiklou.git] / maintenance / rebuildlinks.inc
1 <?
2
3 # Functions for rebuilding the link tracking tables; must
4 # be included within a script that also includes the Setup.
5 # See rebuildlinks.php, for example.
6 #
7
# Number of rows each InsertBuffer created by rebuildLinkTables() collects
# before writing them all in one sweep. More than about 1000 will probably
# not increase speed significantly on most setups.
/* private */ $rowbuf_size = 1000; // 1000 rows ~40 kB
12
# Rebuild the links, brokenlinks and imagelinks tables from scratch.
#
# The three tables are emptied, then every row of cur is scanned for
# [[...]] links. Each distinct link target on a page is stored in
# exactly one table:
#   - imagelinks  for targets in the image namespace (media: links are
#                 treated as image: links),
#   - links       for targets that exist in cur,
#   - brokenlinks for targets that do not.
# Interwiki links, special-page links and URL-like links are not stored.
#
# All involved tables stay locked for the whole run. Takes no arguments
# and returns nothing; progress is printed to standard output.
function rebuildLinkTables()
{
	error_reporting (E_ALL);
	global $wgLang, $rowbuf_size;

	print "This script may take several hours to complete. If you abort during that time,\n";
	print "your wiki will be in an inconsistent state. If you are going to abort, this is\n";
	print "the time to do it.\n\n";
	print "Press control-c to abort (will proceed automatically in 15 seconds)\n";
	sleep(15);

	$count = 0;
	print "Rebuilding link tables.\n";

	print "Setting AUTOCOMMIT=1\n";
	wfQuery("SET SESSION AUTOCOMMIT=1", DB_WRITE);

	print "Locking tables\n";
	$sql = "LOCK TABLES cur READ, interwiki READ, user_newtalk READ, " .
		"links WRITE, brokenlinks WRITE, imagelinks WRITE";
	wfQuery( $sql, DB_WRITE );

	print "Deleting old data in links table.\n";
	$sql = "DELETE FROM links";
	wfQuery( $sql, DB_WRITE );

	print "Deleting old data in brokenlinks table.\n";
	$sql = "DELETE FROM brokenlinks";
	wfQuery( $sql, DB_WRITE );

	print "Deleting old data in imagelinks table.\n";
	$sql = "DELETE FROM imagelinks";
	wfQuery( $sql, DB_WRITE );

	print "Finding number of articles to process... ";
	$sql = "SELECT COUNT(*) as count FROM cur";
	$res = wfQuery( $sql, DB_READ );
	$obj = wfFetchObject( $res );
	$total = $obj->count;
	print "$total\n";

	print "Finding highest article id\n";
	$sql = "SELECT MIN(cur_id) AS min, MAX(cur_id) AS max FROM cur";
	$res = wfQuery( $sql, DB_READ );
	$obj = wfFetchObject( $res );

	# MIN()/MAX() are NULL when cur is empty; intval() turns that into a
	# harmless 0..0 range instead of a malformed BETWEEN clause.
	$cur_pulser = new SelectPulser("SELECT cur_id,cur_namespace,cur_title,cur_text " .
		"FROM cur WHERE cur_id ",
		intval( $obj->min ), intval( $obj->max ), 100);

	$brokenlinks_inserter = new InsertBuffer(
		"INSERT IGNORE INTO brokenlinks (bl_from,bl_to) VALUES " , $rowbuf_size);

	$links_inserter = new InsertBuffer(
		"INSERT IGNORE INTO links (l_from,l_to) VALUES ", $rowbuf_size);

	$imagelinks_inserter = new InsertBuffer("INSERT IGNORE INTO imagelinks ".
		"(il_from,il_to) VALUES ", $rowbuf_size);

	print "Starting processing\n";

	# Text of the image namespace plus the length of "<name>:"; used
	# below to recognise image targets by their prefixed key.
	$ins = $wgLang->getNsText( Namespace::getImage() );
	$inslen = strlen($ins)+1;

	$tc = Title::legalChars();

	# Maps raw link text to an already resolved Title (including its
	# article id) so frequent targets are looked up in cur only once.
	$titleCache = new MRUCache( 10000 );
	$titlecount = 0;
	$start_time = time();

	while ( $row = $cur_pulser->next() ) {

		$from_id = intval($row->cur_id);
		$ns = $wgLang->getNsText( $row->cur_namespace );
		$from_full_title = $row->cur_title;
		if ( "" != $ns ) {
			$from_full_title = "$ns:{$from_full_title}";
		}
		$from_full_title_with_slashes = addslashes( $from_full_title );
		$text = $row->cur_text;

		$numlinks = preg_match_all( "/\\[\\[([{$tc}]+)(]|\\|)/", $text,
			$m, PREG_PATTERN_ORDER );

		$seen_dbtitles = array(); // storable targets already handled on this page
		$titles_ready_for_insertion = array();
		$titles_needing_curdata = array();
		$titles_needing_curdata_pos = array(); // title key => index in $titles_needing_curdata
		$links_corresponding_to_titles = array();

		for ( $i = 0 ; $i < $numlinks; ++$i ) {
			$link = $m[1][$i];
			if( preg_match( '/^(http|https|ftp|mailto|news):/', $link ) ) {
				# an URL link; not for us!
				continue;
			}

			# FIXME: Handle subpage links
			$nt = $titleCache->get( $link );
			if( $nt != false ){
				// Only process each unique link once per page
				$nt_key = rebuildLinksTitleKey( $nt->getNamespace(), $nt->getDBkey() );
				if( isset( $seen_dbtitles[$nt_key] ) )
					continue;
				$seen_dbtitles[$nt_key] = 1;

				$titles_ready_for_insertion[] = $nt;
			} else {
				$nt = Title::newFromText( $link );
				if (! $nt) {
					print "\nInvalid link in page '{$from_full_title}': '$link'\n";
					continue;
				}

				# Filter and normalise BEFORE the duplicate check, so
				# that a link which is thrown away here can never mask
				# a storable link that happens to produce the same key.
				if( $nt->getInterwiki() != "" ) {
					# Interwiki links are not stored in the link tables
					continue;
				}
				if( $nt->getNamespace() == Namespace::getSpecial() ) {
					# Special links not stored in link tables
					continue;
				}
				if( $nt->getNamespace() == Namespace::getMedia() ) {
					# treat media: links as image: links
					$nt = Title::makeTitle( Namespace::getImage(), $nt->getDBkey() );
				}

				// Only process each unique link once per page
				$nt_key = rebuildLinksTitleKey( $nt->getNamespace(), $nt->getDBkey() );
				if( isset( $seen_dbtitles[$nt_key] ) )
					continue;
				$seen_dbtitles[$nt_key] = 1;

				$nt->mArticleID = 0; // assume broken link until proven otherwise

				$pos = array_push($titles_needing_curdata, $nt) - 1;
				$titles_needing_curdata_pos[$nt_key] = $pos;
				$links_corresponding_to_titles[] = $link;
				unset( $link ); // useless outside this loop, but tempting
			}
		}


		if ( count( $titles_needing_curdata ) > 0 ){
			# Resolve all not-yet-cached targets of this page with a
			# single query against cur.
			$parts = array();
			foreach ($titles_needing_curdata as $nt ) {
				$parts[] = " (cur_namespace = " . $nt->getNamespace() . " AND " .
					"cur_title='" . wfStrencode( $nt->getDBkey() ) . "')";
			}
			$sql = "SELECT cur_namespace, cur_title, cur_id FROM cur WHERE " . implode(" OR ", $parts);
			$res = wfQuery( $sql, DB_WRITE );
			while( $found = wfFetchObject( $res ) ){
				$found_key = rebuildLinksTitleKey( $found->cur_namespace, $found->cur_title );
				if( !isset( $titles_needing_curdata_pos[$found_key] ) ) {
					# The database returned a row whose title is not
					# byte-identical to anything we asked for
					# (presumably a collation effect); ignore it rather
					# than corrupt the title list.
					continue;
				}
				$pos = $titles_needing_curdata_pos[$found_key];
				$titles_needing_curdata[$pos]->mArticleID = intval($found->cur_id);
			}
			wfFreeResult( $res );

			for( $k = 0; $k < count( $titles_needing_curdata ) ; $k++) {
				$tmplink = $links_corresponding_to_titles[$k];
				$titleCache->set( $tmplink, $titles_needing_curdata[$k] );
				$titles_ready_for_insertion[] = $titles_needing_curdata[$k];
			}
		}

		foreach ( $titles_ready_for_insertion as $nt ) {
			$dest_noslashes = $nt->getPrefixedDBkey();
			$dest = addslashes( $dest_noslashes );
			$dest_id = $nt->getArticleID();
			$from = $from_full_title_with_slashes;

			# print "\nLINK '$from_full_title' ($from_id) -> '$dest' ($dest_id)\n";

			if ( 0 == strncmp( "$ins:", $dest_noslashes, $inslen ) ) {
				# Image target: stored without the namespace prefix.
				$iname = addslashes( substr( $dest_noslashes, $inslen ) );
				$imagelinks_inserter->insert( "('{$from}','{$iname}')" );
			} else if ( 0 == $dest_id ) {
				$brokenlinks_inserter->insert( "({$from_id},'{$dest}')" );
			} else {
				$links_inserter->insert( "('{$from}',{$dest_id})" );
			}
			$titlecount++;
		}

		if ( ( $count % 20 ) == 0 )
			print ".";

		if ( ( ++$count % 1000 ) == 0 ) {
			$dt = time() - $start_time;
			$start_time = time();
			$rps = persec(1000, $dt);
			$tps = persec($titlecount, $dt);
			$titlecount = 0;
			print "\n$count of $total articles scanned ({$rps} articles ".
				"and {$tps} titles per second)\n";
			print "Title cache hits: " . $titleCache->getPerformance() . "%\n";

		}

	}

	print "\nFlushing insertion buffers...";
	$imagelinks_inserter->flush();
	$links_inserter->flush();
	$brokenlinks_inserter->flush();
	print "ok\n";

	print "$count articles scanned.\n";

	$sql = "UNLOCK TABLES";
	wfQuery( $sql, DB_WRITE );
	print "Done\n";
}

# Build the array key that identifies a title within one page scan.
# The numeric namespace comes first and is followed by a colon, so two
# different (namespace, key) pairs can never produce the same string;
# plain concatenation of key and namespace could ("Foo1" in namespace 0
# versus "Foo" in namespace 10).
/* private */ function rebuildLinksTitleKey( $namespace, $dbkey )
{
	return intval( $namespace ) . ":" . $dbkey;
}
222
# Turn "$n events in $t seconds" into a rate for the progress report.
# Gives the string "zero" when nothing was counted, the string "lots of"
# when no time elapsed, and otherwise the rate truncated to an integer.
/* private */ function persec($n, $t){
	if ( 0 == $n ) {
		return "zero";
	} elseif ( 0 == $t ) {
		return "lots of";
	}
	$rate = $n / $t;
	return intval( $rate );
}
230
# InsertBuffer increases performance slightly by inserting many rows
# at once. The gain is small (<5%) when running against a local, idle
# database, but may be significant in other circumstances. It also
# puts an upper limit on the number of rows per query, which should
# avoid problems with huge articles and certain MySQL settings that
# limit the size of queries. It's also convenient.
#
# Values are appended to the statement verbatim, so the caller must
# pass complete, already escaped "(...)" tuples.

class InsertBuffer {
	/* private */ var $mBuf, $mSql, $mBufcount, $mMaxsize;

	# $sql     - statement prefix up to and including "VALUES "
	# $bufsize - number of rows to collect before writing them out
	function InsertBuffer( $sql, $bufsize ){
		$this->mSql = $sql;
		$this->mBuf = array();
		$this->mBufcount = 0;
		$this->mMaxsize = $bufsize;
	}

	# Queue one value tuple; writes the whole buffer as soon as it
	# holds $bufsize rows.
	function insert( $value ){
		// print $this->mSql . " -> " . $value . "\n";
		$this->mBuf[] = $value;
		$this->mBufcount++;
		# ">=" rather than ">": a query never carries more than
		# $bufsize rows.
		if( $this->mBufcount >= $this->mMaxsize ){
			$this->flush();
		}
	}

	# Write all queued rows with a single statement and empty the
	# buffer. Does nothing when the buffer is empty.
	function flush(){
		if( $this->mBufcount > 0 ){
			$sql = $this->mSql . implode(",", $this->mBuf);
			wfQuery( $sql, DB_WRITE );
			$this->mBuf = array();
			$this->mBufcount = 0;
			// print "Wrote query of size " . strlen( $sql ) . "\n";
		}
	}

}
268
# Select parts from a large table by using the "BETWEEN X AND Y"
# operator on the id column. Avoids buffering the whole thing in
# RAM. It's also convenient.

class SelectPulser {
	/* private */ var $mSql, $mSetsize, $mPos, $mMax, $mSet;

	# $sql     - query ending with the id column name; " BETWEEN x AND y"
	#            is appended to it for every window
	# $min     - lowest id to fetch
	# $max     - highest id to fetch
	# $setsize - width of one id window
	function SelectPulser( $sql, $min, $max, $setsize) {
		$this->mSql = $sql;
		$this->mSet = array();
		# Force integers: MIN()/MAX() of an empty table arrive as NULL
		# and would otherwise yield a malformed "BETWEEN  AND 99".
		$this->mPos = intval( $min );
		$this->mMax = intval( $max );
		# A window narrower than one id would never advance.
		$setsize = intval( $setsize );
		$this->mSetsize = ( $setsize < 1 ? 1 : $setsize );
	}

	# Return the next row object, or false once every id up to the
	# maximum has been covered. Windows without rows are skipped.
	function next(){
		while( true ){
			$row = current( $this->mSet );
			if( false !== $row ){
				next( $this->mSet );
				return $row;
			}
			if( $this->mPos > $this->mMax ){
				return false;
			}

			# Current window is used up; load the next one.
			$first = $this->mPos;
			$last = $first + $this->mSetsize - 1;
			$this->mPos += $this->mSetsize;

			$res = wfQuery( $this->mSql . " BETWEEN " . $first . " AND " . $last, DB_READ );
			$this->mSet = array();
			while ( $fetched = wfFetchObject( $res ) ) {
				$this->mSet[] = $fetched;
			}
			wfFreeResult( $res );
		}
	}
}
308
# A simple cache for general use that keeps the most recently used
# entries. Every get() and set() stamps the key with an increasing
# counter; purge() throws away the entries with the oldest stamps.

class MRUCache {
	/* private */ var $mMru, $mCache, $mSize, $mPurgefreq, $nexti;
	/* private */ var $hits, $misses;

	# $size      - number of entries left in the cache after a purge
	# $purgefreq - a purge is attempted in set() whenever the use
	#              counter is a multiple of this; defaults to $size/10
	function MRUCache( $size, $purgefreq = -1 ) {
		// purgefreq is 1/10 of $size if not stated
		$purgefreq = ($purgefreq == -1 ? intval($size/10) : $purgefreq);
		$purgefreq = ($purgefreq <= 0 ? 1 : $purgefreq);

		$this->mSize = $size;
		$this->mMru = array();
		$this->mCache = array();
		$this->mPurgefreq = $purgefreq;
		$this->nexti = 1;
		$this->hits = 0;
		$this->misses = 0;
		print "purgefreq = " . $this->mPurgefreq . "\n";
	}

	# Return the value stored under $key, or false if there is none.
	# A hit also marks the key as freshly used.
	function get( $key ){
		if ( ! array_key_exists( $key, $this->mCache) ){
			$this->misses++;
			return false;
		}
		$this->hits++;
		$this->mMru[$key] = $this->nexti++;
		return $this->mCache[$key];
	}

	# Store $value under $key and mark the key as freshly used. Since
	# purging only happens at certain counter values, the cache may
	# hold more than $size entries between purges.
	function set( $key, $value ){
		$this->mMru[$key] = $this->nexti++;
		$this->mCache[$key] = $value;

		if($this->nexti % $this->mPurgefreq == 0)
			$this->purge();
	}

	# Shrink the cache to $size entries by dropping the least recently
	# used ones.
	function purge(){
		$to_remove = count( $this->mMru ) - $this->mSize;
		if( $to_remove <= 0 ){
			return;
		}
		asort( $this->mMru );
		# Remove key by key instead of using array_splice(): PHP stores
		# numeric-looking keys such as "1984" as integers, and
		# array_splice() renumbers integer keys, which would leave the
		# matching values in the cache forever.
		$victims = array_slice( array_keys( $this->mMru ), 0, $to_remove );
		foreach( $victims as $key ){
			unset( $this->mMru[$key] );
			unset( $this->mCache[$key] );
		}
	}

	# Percentage of get() calls so far that were hits, as an integer.
	function getPerformance(){
		$tot = $this->hits + $this->misses;
		if($tot > 0)
			return intval(100.0 * $this->hits / $tot);
		else
			return 0;
	}
}
366
367 ?>