Speed improvements
[lhc/web/wiklou.git] / maintenance / rebuildlinks.inc
1 <?
2
3 # Functions for rebuilding the link tracking tables; must
4 # be included within a script that also includes the Setup.
5 # See rebuildlinks.php, for example.
6 #
7
8 # Turn this on if you've got memory to burn
9 $wgUseMemoryTables = false;
10
11 # Buffer this many rows before inserting them all in one sweep. More
12 # than about 1000 will probably not increase speed significantly on
13 # most setups.
14 /* private */ $rowbuf_size = 1000; // 1000 rows ~40 kB
15
16 function rebuildLinkTables()
17 {
18 error_reporting (E_ALL);
19 global $wgLang, $wgUseMemoryTables, $wgLinkCache, $rowbuf_size;
20
21 print "This script may take several hours to complete. If you abort during that time,\n";
22 print "your wiki will be in an inconsistent state. If you are going to abort, this is\n";
23 print "the time to do it.\n\n";
24 print "Press control-c to abort (will proceed automatically in 15 seconds)\n";
25 sleep(15);
26
27 $count = 0;
28 print "Rebuilding link tables.\n";
29
30 print "Setting AUTOCOMMIT=1\n";
31 wfQuery("SET SESSION AUTOCOMMIT=1", DB_WRITE);
32
33 print "Locking tables\n";
34 $sql = "LOCK TABLES cur READ, interwiki READ, user_newtalk READ, " .
35 "links WRITE, brokenlinks WRITE, imagelinks WRITE";
36 wfQuery( $sql, DB_WRITE );
37
38 print "Deleting old data in links table.\n";
39 $sql = "DELETE FROM links";
40 wfQuery( $sql, DB_WRITE );
41
42 print "Deleting old data in brokenlinks table.\n";
43 $sql = "DELETE FROM brokenlinks";
44 wfQuery( $sql, DB_WRITE );
45
46 print "Deleting old data in imagelinks table.\n";
47 $sql = "DELETE FROM imagelinks";
48 wfQuery( $sql, DB_WRITE );
49
50 print "Finding number of articles to process... ";
51 $sql = "SELECT COUNT(*) as count FROM cur";
52 $res = wfQuery( $sql, DB_READ );
53 $obj = wfFetchObject( $res );
54 $total = $obj->count;
55 print "$total\n";
56
57 print "Finding highest article id\n";
58 $sql = "SELECT MIN(cur_id) AS min, MAX(cur_id) AS max FROM cur";
59 $res = wfQuery( $sql, DB_READ );
60 $obj = wfFetchObject( $res );
61
62 $cur_pulser = new SelectPulser("SELECT cur_id,cur_namespace,cur_title,cur_text " .
63 "FROM cur WHERE cur_id ",
64 $obj->min, $obj->max, 100);
65
66 $brokenlinks_inserter = new InsertBuffer(
67 "INSERT IGNORE INTO brokenlinks (bl_from,bl_to) VALUES " , $rowbuf_size);
68
69 $links_inserter = new InsertBuffer(
70 "INSERT IGNORE INTO links (l_from,l_to) VALUES ", $rowbuf_size);
71
72 $imagelinks_inserter = new InsertBuffer("INSERT IGNORE INTO imagelinks ".
73 "(il_from,il_to) VALUES ", $rowbuf_size);
74
75 print "Starting processing\n";
76
77 $ins = $wgLang->getNsText( Namespace::getImage() );
78 $inslen = strlen($ins)+1;
79
80 $tc = Title::legalChars();
81
82 $titleCache = new MRUCache( 10000 );
83 $titlecount = 0;
84 $start_time = time();
85
86 while ( $row = $cur_pulser->next() ) {
87
88 $from_id = intval($row->cur_id);
89 $ns = $wgLang->getNsText( $row->cur_namespace );
90 $from_full_title = $row->cur_title;
91 if ( "" != $ns ) {
92 $from_full_title = "$ns:{$from_full_title}";
93 }
94 $from_full_title_with_slashes = addslashes( $from_full_title );
95 $text = $row->cur_text;
96
97 $numlinks = preg_match_all( "/\\[\\[([{$tc}]+)(]|\\|)/", $text,
98 $m, PREG_PATTERN_ORDER );
99
100 $seen_links = array(); // seen links in this article
101 $titles_ready_for_insertion = array();
102 $titles_needing_curdata = array();
103 $titles_needing_curdata_pos = array();
104 $links_corresponding_to_titles = array();
105
106 for ( $i = 0 ; $i < $numlinks; ++$i ) {
107 $link = $m[1][$i];
108
109 // We're only interested in the link once per article
110 if( isset( $seen_links[$link] ) )
111 continue;
112 $seen_links[$link] = 1;
113
114 if( preg_match( '/^(http|https|ftp|mailto|news):/', $m[1][$i] ) ) {
115 # an URL link; not for us!
116 continue;
117 }
118
119 # FIXME: Handle subpage links
120 $nt = $titleCache->get( $link );
121 if( $nt != false ){
122 $titles_ready_for_insertion[] = $nt;
123 } else {
124 $nt = Title::newFromText( $link );
125 if (! $nt) {
126 print "\nerror in '$ns:{$from_full_title}': '$link'\n";
127 continue;
128 }
129 if( $nt->getInterwiki() != "" ) {
130 # Interwiki links are not stored in the link tables
131 continue;
132 }
133 if( $nt->getNamespace() == Namespace::getSpecial() ) {
134 # Special links not stored in link tables
135 continue;
136 }
137 if( $nt->getNamespace() == Namespace::getMedia() ) {
138 # treat media: links as image: links
139 $nt = Title::makeTitle( Namespace::getImage(), $nt->getDBkey() );
140 }
141 $nt->mArticleID = 0; // assume broken link until proven otherwise
142
143 $pos = array_push($titles_needing_curdata, $nt) - 1;
144 $titles_needing_curdata_pos[$nt->getDBkey()] = $pos;
145 $links_corresponding_to_titles[] = $link;
146 unset( $link ); // useless outside this loop, but tempting
147 }
148 }
149
150
151 if ( count( $titles_needing_curdata ) > 0 ){
152 $parts = array();
153 foreach ($titles_needing_curdata as $nt ) {
154 $parts[] = " (cur_namespace = " . $nt->getNamespace() . " AND " .
155 "cur_title='" . wfStrencode( $nt->getDBkey() ) . "' AND ".
156 "cur_namespace=" . intval( $nt->getNamespace() ) . ")";
157 }
158 $sql = "SELECT cur_title, cur_id FROM cur WHERE " . implode(" OR ", $parts);
159 $res = wfQuery( $sql, DB_WRITE );
160 while($row = wfFetchObject( $res ) ){
161 $pos = $titles_needing_curdata_pos[$row->cur_title];
162 $titles_needing_curdata[$pos]->mArticleID = intval($row->cur_id);
163 }
164 for( $k = 0; $k < count( $titles_needing_curdata ) ; $k++) {
165 $tmplink = $links_corresponding_to_titles[$k];
166 $titleCache->set( $tmplink, $titles_needing_curdata[$k] );
167 $titles_ready_for_insertion[] = $titles_needing_curdata[$k];
168 }
169 }
170
171 foreach ( $titles_ready_for_insertion as $nt ) {
172 $dest = addslashes( $nt->getPrefixedDBkey() );
173 $dest_id = $nt->getArticleID();
174 $from = $from_full_title_with_slashes;
175
176 # print "\nLINK '$from_full_title' ($from_id) -> '$dest' ($dest_id)\n";
177 if ( 0 == strncmp( "$ins:", $from_full_title, $inslen ) ) {
178 $iname = addslashes( substr( $from_full_title, $inslen ) );
179 $imagelinks_inserter->insert( "('{$from}','{$iname}')" );
180 } else if ( 0 == $dest_id ) {
181 $brokenlinks_inserter->insert( "({$from_id},'{$dest}')" );
182 } else {
183 $links_inserter->insert( "('{$from}',{$dest_id})" );
184 }
185 $titlecount++;
186 }
187
188 if ( ( $count % 20 ) == 0 )
189 print ".";
190
191 if ( ( ++$count % 1000 ) == 0 ) {
192 $dt = time() - $start_time;
193 $start_time = time();
194 $rps = persec(1000, $dt);
195 $tps = persec($titlecount, $dt);
196 $titlecount = 0;
197 print "\n$count of $total articles scanned ({$rps} articles ".
198 "and {$tps} titles per second)\n";
199 print "Title cache hits: " . $titleCache->getPerformance() . "%\n";
200
201 }
202
203 }
204
205 print "\nFlushing insertion buffers...";
206 $imagelinks_inserter->flush();
207 $links_inserter->flush();
208 $brokenlinks_inserter->flush();
209 print "ok\n";
210
211 print "$count articles scanned.\n";
212
213 $sql = "UNLOCK TABLES";
214 wfQuery( $sql, DB_WRITE );
215 print "Done\n";
216 }
217
218 /* private */ function persec($n, $t){
219 if($n == 0)
220 return "zero";
221 if($t == 0)
222 return "lots of";
223 return intval($n/$t);
224 }
225
226 # InsertBuffer increases performance slightly by inserting many rows
227 # at once. The gain is small (<5%) when running against a local, idle
228 # database, but may be significant in other circumstances. It also
229 # limits the number of inserted rows uppwards, which should avoid
230 # problems with huge articles and certain mysql settings that limits
231 # the size of queries. It's also convenient.
232
233 class InsertBuffer {
234 /* private */ var $mBuf, $mSql, $mBufcount, $mMaxsize;
235
236 function InsertBuffer( $sql, $bufsize ){
237 $this->mSql = $sql;
238 $this->mBuf = array();
239 $this->mBufcount = 0;
240 $this->mMaxsize = $bufsize;
241 }
242
243 function insert( $value ){
244 // print $this->mSql . " -> " . $value . "\n";
245 $this->mBuf[] = $value;
246 $this->mBufcount++;
247 if($this->mBufcount > $this->mMaxsize){
248 $this->flush();
249 }
250 }
251
252 function flush(){
253 if( $this->mBufcount > 0 ){
254 $sql = $this->mSql . implode(",", $this->mBuf);
255 wfQuery( $sql, DB_WRITE );
256 $this->mBuf = array();
257 $this->mBufcount = 0;
258 // print "Wrote query of size " . strlen( $sql ) . "\n";
259 }
260 }
261
262 }
263
264 # Select parts from a large table by using the "BETWEEN X AND Y"
265 # operator on the id column. Avoids buffering the whole thing in
266 # RAM. It's also convenient.
267
268 class SelectPulser {
269 /* private */ var $mSql, $mSetsize, $mPos, $mMax, $mSet;
270
271 function SelectPulser( $sql, $min, $max, $setsize) {
272 $this->mSql = $sql;
273 $this->mSet = array();
274 $this->mPos = $min;
275 $this->mMax = $max;
276 $this->mSetsize = $setsize;
277 }
278
279 function next(){
280 $result = current( $this->mSet );
281 next( $this->mSet );
282 if( false !== $result ){
283 return $result;
284 }
285 while( $this->mPos <= $this->mMax ){
286 $this->mSet = array();
287 $sql = $this->mSql . " BETWEEN " . $this->mPos .
288 " AND " . ($this->mPos + $this->mSetsize - 1);
289 $this->mPos += $this->mSetsize;
290
291 $res = wfQuery( $sql, DB_READ );
292 while ( $row = wfFetchObject( $res ) ) {
293 $this->mSet[] = $row;
294 }
295 wfFreeResult( $res );
296 if( count( $this->mSet ) > 0 ){
297 return $this->next();
298 }
299 }
300 return false;
301 }
302 }
303
304 # A simple MRU for general cacheing.
305
306 class MRUCache {
307 /* private */ var $mMru, $mCache, $mSize, $mPurgefreq, $nexti;
308 /* private */ var $hits, $misses;
309
310 function MRUCache( $size, $purgefreq = -1 ) {
311 // purgefreq is 1/10 of $size if not stated
312 $purgefreq = ($purgefreq == -1 ? intval($size/10) : $purgefreq);
313 $purgefreq = ($purgefreq <= 0 ? 1 : $purgefreq);
314
315 $this->mSize = $size;
316 $this->mMru = array();
317 $this->mCache = array();
318 $this->mPurgefreq = $purgefreq;
319 $this->nexti = 1;
320 print "purgefreq = " . $this->mPurgefreq . "\n";
321 }
322
323 function get( $key ){
324 if ( ! array_key_exists( $key, $this->mCache) ){
325 $this->misses++;
326 return false;
327 }
328 $this->hits++;
329 $this->mMru[$key] = $this->nexti++;
330 return $this->mCache[$key];
331 }
332
333 function set( $key, $value ){
334 $this->mMru[$key] = $this->nexti++;
335 $this->mCache[$key] = $value;
336
337 if($this->nexti % $this->mPurgefreq == 0)
338 $this->purge();
339 }
340
341 function purge(){
342 $to_remove = count( $this->mMru ) - $this->mSize;
343 if( $to_remove <= 0 ){
344 return;
345 }
346 asort( $this->mMru );
347 $removed = array_splice( $this->mMru, 0, $to_remove );
348 foreach( array_keys( $removed ) as $key ){
349 unset( $this->mCache[$key] );
350 }
351 }
352
353 function getPerformance(){
354 $tot = $this->hits + $this->misses;
355 if($tot > 0)
356 return intval(100.0 * $this->hits / $tot);
357 else
358 return 0;
359 }
360 }
361
362 ?>