Some adjustments
author Mr. E23 <e23@users.mediawiki.org>
Mon, 24 Nov 2003 23:29:35 +0000 (23:29 +0000)
committer Mr. E23 <e23@users.mediawiki.org>
Mon, 24 Nov 2003 23:29:35 +0000 (23:29 +0000)
maintenance/rebuildlinks.inc
maintenance/rebuildlinks.php

index 8be4336..253643e 100644 (file)
 # Turn this on if you've got memory to burn
 $wgUseMemoryTables = false;
 
-function rebuildLinkTablesPass1()
+# Buffer this many rows before inserting them all in one sweep. More
+# than about 1000 will probably not increase speed significantly on
+# most setups.
+/* private */ $rowbuf_size = 2000; // 2000 rows ~ 80 kB
+
+function rebuildLinkTables()
 {
-       global $wgLang, $wgUseMemoryTables;
+       global $wgLang, $wgUseMemoryTables, $rowbuf_size;
+
+       print "This script may take many hours to complete. If you abort during that time,\n";
+       print "your wiki will be in an inconsistent state and you may have problems running\n";
+       print "this script again. If you are going to abort, this is the time to do it.\n\n";
+       print "Press control-c to abort (will proceed automatically in 15 seconds)\n";
+       sleep(15);
+
        $count = 0;
-       print "Rebuilding link tables (pass 1).\n";
+       print "Rebuilding link tables.\n";
 
-       $sql = "DROP TABLE IF EXISTS rebuildlinks";
+       print "Setting AUTOCOMMIT=1\n";
+       wfQuery("SET SESSION AUTOCOMMIT=1", DB_WRITE);
+
+       print "Deleting old data in links table.\n";
+       $sql = "DELETE FROM links";
        wfQuery( $sql, DB_WRITE );
 
-       $sql = "CREATE TABLE rebuildlinks (
-  rl_f_id int(8) unsigned NOT NULL default 0,
-  rl_f_title varchar(255) binary NOT NULL default '',
-  rl_to varchar(255) binary NOT NULL default '',
-  INDEX rl_to (rl_to) )";
-       if( $wgUseMemoryTables ) $sql .= " TYPE=heap";
+       print "Deleting old data in brokenlinks table.\n";
+       $sql = "DELETE FROM brokenlinks";
+       wfQuery( $sql, DB_WRITE );
+
+       print "Deleting old data in imagelinks table.\n";
+       $sql = "DELETE FROM imagelinks";
        wfQuery( $sql, DB_WRITE );
 
-       $sql = "LOCK TABLES cur READ, rebuildlinks WRITE, interwiki READ, user_newtalk READ";
+       print "\nAdding temporary unique index on links, brokenlinks and imagelinks.\n";
+       print "->If build aborts now, you probably aborted a previous build. If that is\n";
+       print "  the case, you can clean up the remains with the following SQL commands,\n";
+       print "  and then try again.\n";
+       print "  ALTER TABLE links DROP INDEX tmp_unique;\n";
+       print "  ALTER TABLE brokenlinks DROP INDEX tmp_unique;\n";
+       print "  ALTER TABLE imagelinks DROP INDEX tmp_unique;\n\n";
+
+       $sql = "ALTER TABLE links ADD UNIQUE tmp_unique (l_from, l_to)";
+       wfQuery( $sql, DB_WRITE );
+       $sql = "ALTER TABLE brokenlinks ADD UNIQUE tmp_unique (bl_from, bl_to)";
+       wfQuery( $sql, DB_WRITE );
+       $sql = "ALTER TABLE imagelinks ADD UNIQUE tmp_unique (il_from, il_to(244))";
        wfQuery( $sql, DB_WRITE );
+       print "Temporary unique index added ok. Forget what I said.\n\n";
 
-       $sql = "DELETE FROM rebuildlinks";
+       print "Locking tables\n";
+       $sql = "LOCK TABLES cur READ, interwiki READ, user_newtalk READ, " .
+               "links WRITE, brokenlinks WRITE, imagelinks WRITE";
        wfQuery( $sql, DB_WRITE );
 
-       $sql = "SELECT cur_id,cur_namespace,cur_title,cur_text FROM cur";
-       $res = wfQuery( $sql, DB_WRITE );
-       $total = wfNumRows( $res );
+       print "Finding number of articles to process\n";
+       $sql = "SELECT COUNT(*) as count FROM cur";
+       $res = wfQuery( $sql, DB_READ );
+       $obj = wfFetchObject( $res );
+       $total = $obj->count;
+
+       print "Finding highest article id\n";
+       $sql = "SELECT MIN(cur_id) AS min, MAX(cur_id) AS max FROM cur";
+       $res = wfQuery( $sql, DB_READ );
+       $obj = wfFetchObject( $res );
+
+       $cur_pulser = new SelectPulser("SELECT cur_id,cur_namespace,cur_title,cur_text " .
+               "FROM cur WHERE cur_id ", 
+               $obj->min, $obj->max, $rowbuf_size);
+
+       $brokenlinks_inserter = new InsertBuffer(
+               "INSERT IGNORE INTO brokenlinks (bl_from,bl_to) VALUES " , $rowbuf_size);
+
+       $links_inserter = new InsertBuffer(
+               "INSERT IGNORE INTO links (l_from,l_to) VALUES ", $rowbuf_size);
+
+       $imagelinks_inserter = new InsertBuffer("INSERT IGNORE INTO imagelinks ".
+               "(il_from,il_to) VALUES ", $rowbuf_size);
+
+       print "Starting processing\n";
+
+       $ins = $wgLang->getNsText( Namespace::getImage() );
+       $inslen = strlen($ins)+1;
 
        $tc = Title::legalChars();
-       while ( $row = wfFetchObject( $res ) ) {
-               $id = $row->cur_id;
+       
+       $start_time = time();
+       while ( $row = $cur_pulser->next() ) {
+               $from_id = $row->cur_id;
                $ns = $wgLang->getNsText( $row->cur_namespace );
-               if ( "" == $ns ) {
-                       $title = addslashes( $row->cur_title );
-               } else {
-                       $title = addslashes( "$ns:{$row->cur_title}" );
+
+               $raw_title = $row->cur_title;
+               if ( "" != $ns ) {
+                       $raw_title = "$ns:{$raw_title}";
                }
+               $title = addslashes( $raw_title );
                $text = $row->cur_text;
+
                $numlinks = preg_match_all( "/\\[\\[([{$tc}]+)(]|\\|)/", $text,
                  $m, PREG_PATTERN_ORDER );
 
-               if ( 0 != $numlinks ) {
-                       $first = true;
-                       $sql = "INSERT INTO rebuildlinks (rl_f_id,rl_f_title,rl_to) VALUES ";
-                       for ( $i = 0; $i < $numlinks; ++$i ) {
-                               if( preg_match( '/^(http|https|ftp|mailto|news):/', $m[1][$i] ) ) {
-                                       # an URL link; not for us!
-                                       continue;
-                               }
-                               # FIXME: Handle subpage links
-                               $nt = Title::newFromText( $m[1][$i] );
-                               if (! $nt)
-                               {
-                                       $txt = $m[1][$i];
-                                       print "error in '$ns:{$row->cur_title}' :\t'$txt'\n";
-                                       continue;
-                               }
-                               if( $nt->getInterwiki() != "" ) {
-                                       # Interwiki links are not stored in the link tables
-                                       continue;
-                               }
-                               if( $nt->getNamespace() == Namespace::getSpecial() ) {
-                                       # Special links not stored in link tables
-                                       continue;
-                               }
-                               if( $nt->getNamespace() == Namespace::getMedia() ) {
-                                       # treat media: links as image: links
-                                       $nt = Title::makeTitle( Namespace::getImage(), $nt->getDBkey() );
-                               }
-
-                               if (!$first)
-                                       $sql .= ",";
-                               else
-                                       $first = false;
-
-                               $dest = addslashes( $nt->getPrefixedDBkey() );
-                               $sql .= "({$id},'{$title}','{$dest}')";
+               for ( $i = 0; $i < $numlinks; ++$i ) {
+                       if( preg_match( '/^(http|https|ftp|mailto|news):/', $m[1][$i] ) ) {
+                               # an URL link; not for us!
+                               continue;
+                       }
+
+                       # FIXME: Handle subpage links
+                       $nt = Title::newFromText( $m[1][$i] );
+                               
+                       if (! $nt)
+                       {
+                               $txt = $m[1][$i];
+                               print "error in '$ns:{$row->cur_title}' :\t'$txt'\n";
+                               continue;
+                       }
+                       if( $nt->getInterwiki() != "" ) {
+                               # Interwiki links are not stored in the link tables
+                               continue;
                        }
+                       if( $nt->getNamespace() == Namespace::getSpecial() ) {
+                               # Special links not stored in link tables
+                               continue;
+                       }
+                       if( $nt->getNamespace() == Namespace::getMedia() ) {
+                               # treat media: links as image: links
+                               $nt = Title::makeTitle( Namespace::getImage(), $nt->getDBkey() );
+                       }
+
+                       $dest = addslashes( $nt->getPrefixedDBkey() );
+                       $dest_id = $nt->getArticleID();
 
-                       if (! $first) { wfQuery( $sql, DB_WRITE  ); }
+                       if ( 0 == strncmp( "$ins:", $raw_title, $inslen ) ) { 
+                               $iname = addslashes( substr( $raw_title, $inslen ) );
+                               $imagelinks_inserter->insert( "('{$title}','{$iname}')" );
+                       } else if ( 0 == $dest_id ) {
+                               $brokenlinks_inserter->insert( "({$from_id},'{$dest}')" );
+                       } else {
+                               $links_inserter->insert( "('{$title}',{$dest_id})" );
+                       }
                }
+
+               if ( ( $count % 10 ) == 0 )
+                       print "."; 
+
                if ( ( ++$count % 1000 ) == 0 ) {
-                       print "$count of $total articles scanned.\n";
+                       $dt = time() - $start_time;
+                       $start_time = time();
+                       $rps = ($dt == 0 ? "lots of" : intval(1000/$dt));
+                       print "\n$count of $total articles scanned ({$rps} articles per second)\n";
                }
+
        }
-       print "$total articles scanned.\n";
-       mysql_free_result( $res );
 
-       $sql = "UNLOCK TABLES";
-       wfQuery( $sql, DB_WRITE );
-}
+       $imagelinks_inserter->flush();
+       $links_inserter->flush();
+       $brokenlinks_inserter->flush();
 
-function rebuildLinkTablesPass2()
-{
-       global $wgLang;
-       $count = 0;
-       print "Rebuilding link tables (pass 2).\n";
+       print "$total articles scanned.\n";
 
-       $sql = "LOCK TABLES cur READ, rebuildlinks READ, interwiki READ, " .
-         "links WRITE, brokenlinks WRITE, imagelinks WRITE";
+       print "Removing temporary unique indexes from tables links, brokenlinks and imagelinks.\n";
+       $sql = "ALTER TABLE links DROP INDEX tmp_unique";
        wfQuery( $sql, DB_WRITE );
-
-       $sql = "DELETE FROM links";
+       $sql = "ALTER TABLE brokenlinks DROP INDEX tmp_unique";
        wfQuery( $sql, DB_WRITE );
-
-       $sql = "DELETE FROM brokenlinks";
+       $sql = "ALTER TABLE imagelinks DROP INDEX tmp_unique";
        wfQuery( $sql, DB_WRITE );
 
-       $sql = "DELETE FROM imagelinks";
+       $sql = "UNLOCK TABLES";
        wfQuery( $sql, DB_WRITE );
+       print "Done\n";
+}
 
-       $ins = $wgLang->getNsText( Namespace::getImage() );
-       $inslen = strlen($ins)+1;
-       $sql = "SELECT rl_f_title,rl_to FROM rebuildlinks " .
-         "WHERE rl_to LIKE '$ins:%'";
-       $res = wfQuery( $sql, DB_WRITE );
-
-       $sql = "INSERT INTO imagelinks (il_from,il_to) VALUES ";
-       $first = true;
-       while ( $row = wfFetchObject( $res ) )
-       {
-               $iname = addslashes( substr( $row->rl_to, $inslen ) );
-               $pname = addslashes( $row->rl_f_title );
-
-               if ( ! $first )
-                       $sql .= ",";
-               else
-                       $first = false;
-
-               $sql .= "('{$pname}','{$iname}')";
+# InsertBuffer increases performance slightly by inserting many rows
+# at once. The gain is small (<5%) when running against a local, idle
+# database, but may be significant in other circumstances. It also
+# limits the number of inserted rows upwards, which should avoid
+# problems with huge articles and certain mysql settings that limit
+# the size of queries. It's also convenient.
+
+class InsertBuffer {
+       /* private */ var $mBuf, $mSql, $mBufcount, $mMaxsize;
+       
+       function InsertBuffer( $sql, $bufsize ){
+               $this->mSql = $sql;
+               $this->mBuf = array();
+               $this->mBufcount = 0;
+               $this->mMaxsize = $bufsize;
        }
-       wfFreeResult( $res );
-       if ( ! $first ) { wfQuery( $sql, DB_WRITE ); }
 
-       $sql = "SELECT DISTINCT rl_to FROM rebuildlinks ORDER BY rl_to";
-       $res = wfQuery( $sql, DB_WRITE );
-       $count = 0;
-       $total = wfNumRows( $res );
-
-       while ( $row = wfFetchObject( $res ) ) {
-               if ( 0 == strncmp( "$ins:", $row->rl_to, $inslen ) ) { continue; }
-
-               $nt = Title::newFromDBkey( $row->rl_to );
-               if (! $nt)
-               {
-                       print "error pass2: '{$row->rl_to}'\n";
-                       continue;
+       function insert( $value ){
+               // print $this->mSql . " -> " . $value . "\n";
+               $this->mBuf[] = $value;
+               $this->mBufcount++;
+               if($this->mBufcount > $this->mMaxsize){
+                       $this->flush();
                }
-               $id = $nt->getArticleID();
-               $to = addslashes( $row->rl_to );
-
-               if ( 0 == $id ) {
-                       $sql = "SELECT DISTINCT rl_f_id FROM rebuildlinks WHERE rl_to='{$to}'";
-                       $res2 = wfQuery( $sql, DB_WRITE );
+       }
 
-                       $sql = "INSERT INTO brokenlinks (bl_from,bl_to) VALUES ";
-                       $first = true;
-                       while ( $row2 = wfFetchObject( $res2 ) )
-                       {
-                               if (! $first)
-                                       $sql .= ",";
-                               else
-                                       $first = false;
+       function flush(){
+               if( $this->mBufcount > 0 ){
+                       $sql = $this->mSql . implode(",", $this->mBuf);
+                       wfQuery( $sql, DB_WRITE );
+                       $this->mBuf = array();
+                       $this->mBufcount = 0;
+                       // print "Wrote query of size " . strlen( $sql ) . "\n";
+               }
+       }
+       
+}
 
-                               $from = $row2->rl_f_id;
-                               $sql .= "({$from},'{$to}')";
+# Select parts from a large table by using the "BETWEEN X AND Y"
+# operator on the id column. Avoids buffering the whole thing in
+# RAM. It's also convenient.  
+class SelectPulser {
+       /* private */ var $mSql, $mSetsize, $mPos, $mMax, $mSet;
+
+       function SelectPulser( $sql, $min, $max, $setsize) {
+               $this->mSql = $sql;
+               $this->mSet = array();
+               $this->mPos = $min;
+               $this->mMax = $max;
+               $this->mSetsize = $setsize;
+       }
+       
+       function next(){
+               $result = current( $this->mSet );
+               next( $this->mSet );
+               if( false !== $result ){
+                       return $result;
+               }
+               while( $this->mPos <= $this->mMax ){
+                       $this->mSet = array();
+                       $sql = $this->mSql . " BETWEEN " . $this->mPos .
+                               " AND " . ($this->mPos + $this->mSetsize - 1);
+                       $this->mPos += $this->mSetsize;
+
+                       $res = wfQuery( $sql, DB_READ );
+                       while ( $row = wfFetchObject( $res ) ) {
+                               $this->mSet[] = $row;
                        }
-                       wfFreeResult( $res2 );
-                       if ( ! $first ) { wfQuery( $sql, DB_WRITE ); }
-               } else {
-                       $sql = "SELECT DISTINCT rl_f_title FROM rebuildlinks WHERE rl_to='{$to}'";
-                       $res2 = wfQuery( $sql, DB_WRITE );
-
-                       $sql = "INSERT INTO links (l_from,l_to) VALUES ";
-                       $first = true;
-                       while ( $row2 = wfFetchObject( $res2 ) )
-                       {
-                               if (! $first)
-                                       $sql .= ",";
-                               else
-                                       $first = false;
 
-                               $from = addslashes( $row2->rl_f_title );
-                               $sql .= "('{$from}',{$id})";
-                       }
-                       wfFreeResult( $res2 );
-                       if ( ! $first ) { wfQuery( $sql, DB_WRITE ); }
-               }
-               if ( ( ++$count % 1000 ) == 0 ) {
-                       print "$count of $total titles processed.\n";
+                       if( count( $this->mSet ) > 0 ){
+                               return $this->next();
+                       } 
                }
+               return false;
        }
-       wfFreeResult( $res );
-
-       $sql = "UNLOCK TABLES";
-       wfQuery( $sql, DB_WRITE );
-
-       $sql = "DROP TABLE rebuildlinks";
-       wfQuery( $sql, DB_WRITE );
 }
+
 ?>
index 9405a03..8423a6a 100644 (file)
@@ -25,8 +25,7 @@ set_time_limit(0);
 $wgDBuser                      = $wgDBadminuser;
 $wgDBpassword          = $wgDBadminpassword;
 
-rebuildLinkTablesPass1();
-rebuildLinkTablesPass2();
+rebuildLinkTables();
 
 print "Done.\n";
 exit();