From: Mr. E23 Date: Mon, 24 Nov 2003 23:29:35 +0000 (+0000) Subject: Some adjustments X-Git-Tag: 1.1.0~91 X-Git-Url: https://git.cyclocoop.org/admin/?a=commitdiff_plain;h=993f12ced92e9dc7114c3bbf502eade7deab2bf1;p=lhc%2Fweb%2Fwiklou.git Some adjustments --- diff --git a/maintenance/rebuildlinks.inc b/maintenance/rebuildlinks.inc index 8be43362ac..253643ebae 100644 --- a/maintenance/rebuildlinks.inc +++ b/maintenance/rebuildlinks.inc @@ -8,203 +8,253 @@ # Turn this on if you've got memory to burn $wgUseMemoryTables = false; -function rebuildLinkTablesPass1() +# Buffer this many rows before inserting them all in one sweep. More +# than about 1000 will probably not increase speed significantly on +# most setups. +/* private */ $rowbuf_size = 2000; // 2000 rows ~ 80 kB + +function rebuildLinkTables() { - global $wgLang, $wgUseMemoryTables; + global $wgLang, $wgUseMemoryTables, $rowbuf_size; + + print "This script may take many hours to complete. If you abort during that time,\n"; + print "your wiki will be in an inconsistent state and you may have problems running\n"; + print "this script again. If you are going to abort, this is the time to do it.\n\n"; + print "Press control-c to abort (will proceed automatically in 15 seconds)\n"; + sleep(15); + $count = 0; - print "Rebuilding link tables (pass 1).\n"; + print "Rebuilding link tables.\n"; - $sql = "DROP TABLE IF EXISTS rebuildlinks"; + print "Setting AUTOCOMMIT=1\n"; + wfQuery("SET SESSION AUTOCOMMIT=1", DB_WRITE); + + print "Deleting old data in links table.\n"; + $sql = "DELETE FROM links"; wfQuery( $sql, DB_WRITE ); - $sql = "CREATE TABLE rebuildlinks ( - rl_f_id int(8) unsigned NOT NULL default 0, - rl_f_title varchar(255) binary NOT NULL default '', - rl_to varchar(255) binary NOT NULL default '', - INDEX rl_to (rl_to) )"; - if( $wgUseMemoryTables ) $sql .= " TYPE=heap"; + print "Deleting old data in brokenlinks table.\n"; + $sql = "DELETE FROM brokenlinks"; + wfQuery( $sql, DB_WRITE ); + + print "Deleting old data in imagelinks table.\n"; + $sql = "DELETE FROM imagelinks"; wfQuery( $sql, DB_WRITE ); - $sql = "LOCK TABLES cur READ, rebuildlinks WRITE, interwiki READ, user_newtalk READ"; + print "\nAdding temporary unique index on links, brokenlinks and imagelinks.\n"; + print "->If build aborts now, you probably aborted a previous build. If that is\n"; + print " the case, you can clean up the remains with the following SQL commands,\n"; + print " and then try again.\n"; + print " ALTER TABLE links DROP INDEX tmp_unique;\n"; + print " ALTER TABLE brokenlinks DROP INDEX tmp_unique;\n"; + print " ALTER TABLE imagelinks DROP INDEX tmp_unique;\n\n"; + + $sql = "ALTER TABLE links ADD UNIQUE tmp_unique (l_from, l_to)"; + wfQuery( $sql, DB_WRITE ); + $sql = "ALTER TABLE brokenlinks ADD UNIQUE tmp_unique (bl_from, bl_to)"; + wfQuery( $sql, DB_WRITE ); + $sql = "ALTER TABLE imagelinks ADD UNIQUE tmp_unique (il_from, il_to(244))"; wfQuery( $sql, DB_WRITE ); + print "Temporary unique index added ok. Forget what I said.\n\n"; - $sql = "DELETE FROM rebuildlinks"; + print "Locking tables\n"; + $sql = "LOCK TABLES cur READ, interwiki READ, user_newtalk READ, " . + "links WRITE, brokenlinks WRITE, imagelinks WRITE"; wfQuery( $sql, DB_WRITE ); - $sql = "SELECT cur_id,cur_namespace,cur_title,cur_text FROM cur"; - $res = wfQuery( $sql, DB_WRITE ); - $total = wfNumRows( $res ); + print "Finding number of articles to process\n"; + $sql = "SELECT COUNT(*) as count FROM cur"; + $res = wfQuery( $sql, DB_READ ); + $obj = wfFetchObject( $res ); + $total = $obj->count; + + print "Finding highest article id\n"; + $sql = "SELECT MIN(cur_id) AS min, MAX(cur_id) AS max FROM cur"; + $res = wfQuery( $sql, DB_READ ); + $obj = wfFetchObject( $res ); + + $cur_pulser = new SelectPulser("SELECT cur_id,cur_namespace,cur_title,cur_text " . + "FROM cur WHERE cur_id ", + $obj->min, $obj->max, $rowbuf_size); + + $brokenlinks_inserter = new InsertBuffer( + "INSERT IGNORE INTO brokenlinks (bl_from,bl_to) VALUES " , $rowbuf_size); + + $links_inserter = new InsertBuffer( + "INSERT IGNORE INTO links (l_from,l_to) VALUES ", $rowbuf_size); + + $imagelinks_inserter = new InsertBuffer("INSERT IGNORE INTO imagelinks ". + "(il_from,il_to) VALUES ", $rowbuf_size); + + print "Starting processing\n"; + + $ins = $wgLang->getNsText( Namespace::getImage() ); + $inslen = strlen($ins)+1; $tc = Title::legalChars(); - while ( $row = wfFetchObject( $res ) ) { - $id = $row->cur_id; + + $start_time = time(); + while ( $row = $cur_pulser->next() ) { + $from_id = $row->cur_id; $ns = $wgLang->getNsText( $row->cur_namespace ); - if ( "" == $ns ) { - $title = addslashes( $row->cur_title ); - } else { - $title = addslashes( "$ns:{$row->cur_title}" ); + + $raw_title = $row->cur_title; + if ( "" != $ns ) { + $raw_title = "$ns:{$raw_title}"; } + $title = addslashes( $raw_title ); $text = $row->cur_text; + $numlinks = preg_match_all( "/\\[\\[([{$tc}]+)(]|\\|)/", $text, $m, PREG_PATTERN_ORDER ); - if ( 0 != $numlinks ) { - $first = true; - $sql = "INSERT INTO rebuildlinks (rl_f_id,rl_f_title,rl_to) VALUES "; - for ( $i = 0; $i < $numlinks; ++$i ) { - if( preg_match( '/^(http|https|ftp|mailto|news):/', $m[1][$i] ) ) { - # an URL link; not for us! - continue; - } - # FIXME: Handle subpage links - $nt = Title::newFromText( $m[1][$i] ); - if (! $nt) - { - $txt = $m[1][$i]; - print "error in '$ns:{$row->cur_title}' :\t'$txt'\n"; - continue; - } - if( $nt->getInterwiki() != "" ) { - # Interwiki links are not stored in the link tables - continue; - } - if( $nt->getNamespace() == Namespace::getSpecial() ) { - # Special links not stored in link tables - continue; - } - if( $nt->getNamespace() == Namespace::getMedia() ) { - # treat media: links as image: links - $nt = Title::makeTitle( Namespace::getImage(), $nt->getDBkey() ); - } - - if (!$first) - $sql .= ","; - else - $first = false; - - $dest = addslashes( $nt->getPrefixedDBkey() ); - $sql .= "({$id},'{$title}','{$dest}')"; + for ( $i = 0; $i < $numlinks; ++$i ) { + if( preg_match( '/^(http|https|ftp|mailto|news):/', $m[1][$i] ) ) { + # an URL link; not for us! + continue; + } + + # FIXME: Handle subpage links + $nt = Title::newFromText( $m[1][$i] ); + + if (! $nt) + { + $txt = $m[1][$i]; + print "error in '$ns:{$row->cur_title}' :\t'$txt'\n"; + continue; + } + if( $nt->getInterwiki() != "" ) { + # Interwiki links are not stored in the link tables + continue; } + if( $nt->getNamespace() == Namespace::getSpecial() ) { + # Special links not stored in link tables + continue; + } + if( $nt->getNamespace() == Namespace::getMedia() ) { + # treat media: links as image: links + $nt = Title::makeTitle( Namespace::getImage(), $nt->getDBkey() ); + } + + $dest = addslashes( $nt->getPrefixedDBkey() ); + $dest_id = $nt->getArticleID(); - if (! $first) { wfQuery( $sql, DB_WRITE ); } + if ( 0 == strncmp( "$ins:", $raw_title, $inslen ) ) { + $iname = addslashes( substr( $raw_title, $inslen ) ); + $imagelinks_inserter->insert( "('{$title}','{$iname}')" ); + } else if ( 0 == $dest_id ) { + $brokenlinks_inserter->insert( "({$from_id},'{$dest}')" ); + } else { + $links_inserter->insert( "('{$title}',{$dest_id})" ); + } } + + if ( ( $count % 10 ) == 0 ) + print "."; + if ( ( ++$count % 1000 ) == 0 ) { - print "$count of $total articles scanned.\n"; + $dt = time() - $start_time; + $start_time = time(); + $rps = ($dt == 0 ? "lots of" : intval(1000/$dt)); + print "\n$count of $total articles scanned ({$rps} articles per second)\n"; } + } - print "$total articles scanned.\n"; - mysql_free_result( $res ); - $sql = "UNLOCK TABLES"; - wfQuery( $sql, DB_WRITE ); -} + $imagelinks_inserter->flush(); + $links_inserter->flush(); + $brokenlinks_inserter->flush(); -function rebuildLinkTablesPass2() -{ - global $wgLang; - $count = 0; - print "Rebuilding link tables (pass 2).\n"; + print "$total articles scanned.\n"; - $sql = "LOCK TABLES cur READ, rebuildlinks READ, interwiki READ, " . - "links WRITE, brokenlinks WRITE, imagelinks WRITE"; + print "Removing temporary unique indexes from tables links, brokenlinks and imagelinks.\n"; + $sql = "ALTER TABLE links DROP INDEX tmp_unique"; wfQuery( $sql, DB_WRITE ); - - $sql = "DELETE FROM links"; + $sql = "ALTER TABLE brokenlinks DROP INDEX tmp_unique"; wfQuery( $sql, DB_WRITE ); - - $sql = "DELETE FROM brokenlinks"; + $sql = "ALTER TABLE imagelinks DROP INDEX tmp_unique"; wfQuery( $sql, DB_WRITE ); - $sql = "DELETE FROM imagelinks"; + $sql = "UNLOCK TABLES"; wfQuery( $sql, DB_WRITE ); + print "Done\n"; +} - $ins = $wgLang->getNsText( Namespace::getImage() ); - $inslen = strlen($ins)+1; - $sql = "SELECT rl_f_title,rl_to FROM rebuildlinks " . - "WHERE rl_to LIKE '$ins:%'"; - $res = wfQuery( $sql, DB_WRITE ); - - $sql = "INSERT INTO imagelinks (il_from,il_to) VALUES "; - $first = true; - while ( $row = wfFetchObject( $res ) ) - { - $iname = addslashes( substr( $row->rl_to, $inslen ) ); - $pname = addslashes( $row->rl_f_title ); - - if ( ! $first ) - $sql .= ","; - else - $first = false; - - $sql .= "('{$pname}','{$iname}')"; +# InsertBuffer increases performance slightly by inserting many rows +# at once. The gain is small (<5%) when running against a local, idle +# database, but may be significant in other circumstances. It also +# limits the number of inserted rows uppwards, which should avoid +# problems with huge articles and certain mysql settings that limits +# the size of queries. It's also convenient. + +class InsertBuffer { + /* private */ var $mBuf, $mSql, $mBufcount, $mMaxsize; + + function InsertBuffer( $sql, $bufsize ){ + $this->mSql = $sql; + $this->mBuf = array(); + $this->mBufcount = 0; + $this->mMaxsize = $bufsize; } - wfFreeResult( $res ); - if ( ! $first ) { wfQuery( $sql, DB_WRITE ); } - $sql = "SELECT DISTINCT rl_to FROM rebuildlinks ORDER BY rl_to"; - $res = wfQuery( $sql, DB_WRITE ); - $count = 0; - $total = wfNumRows( $res ); - - while ( $row = wfFetchObject( $res ) ) { - if ( 0 == strncmp( "$ins:", $row->rl_to, $inslen ) ) { continue; } - - $nt = Title::newFromDBkey( $row->rl_to ); - if (! $nt) - { - print "error pass2: '{$row->rl_to}'\n"; - continue; + function insert( $value ){ + // print $this->mSql . " -> " . $value . "\n"; + $this->mBuf[] = $value; + $this->mBufcount++; + if($this->mBufcount > $this->mMaxsize){ + $this->flush(); } - $id = $nt->getArticleID(); - $to = addslashes( $row->rl_to ); - - if ( 0 == $id ) { - $sql = "SELECT DISTINCT rl_f_id FROM rebuildlinks WHERE rl_to='{$to}'"; - $res2 = wfQuery( $sql, DB_WRITE ); + } - $sql = "INSERT INTO brokenlinks (bl_from,bl_to) VALUES "; - $first = true; - while ( $row2 = wfFetchObject( $res2 ) ) - { - if (! $first) - $sql .= ","; - else - $first = false; + function flush(){ + if( $this->mBufcount > 0 ){ + $sql = $this->mSql . implode(",", $this->mBuf); + wfQuery( $sql, DB_WRITE ); + $this->mBuf = array(); + $this->mBufcount = 0; + // print "Wrote query of size " . strlen( $sql ) . "\n"; + } + } + +} - $from = $row2->rl_f_id; - $sql .= "({$from},'{$to}')"; +# Select parts from a large table by using the "BETWEEN X AND Y" +# operator on the id column. Avoids buffering the whole thing in +# RAM. It's also convenient. +class SelectPulser { + /* private */ var $mSql, $mSetsize, $mPos, $mMax, $mSet; + + function SelectPulser( $sql, $min, $max, $setsize) { + $this->mSql = $sql; + $this->mSet = array(); + $this->mPos = $min; + $this->mMax = $max; + $this->mSetsize = $setsize; + } + + function next(){ + $result = current( $this->mSet ); + next( $this->mSet ); + if( false !== $result ){ + return $result; + } + while( $this->mPos <= $this->mMax ){ + $this->mSet = array(); + $sql = $this->mSql . " BETWEEN " . $this->mPos . + " AND " . ($this->mPos + $this->mSetsize - 1); + $this->mPos += $this->mSetsize; + + $res = wfQuery( $sql, DB_READ ); + while ( $row = wfFetchObject( $res ) ) { + $this->mSet[] = $row; } - wfFreeResult( $res2 ); - if ( ! $first ) { wfQuery( $sql, DB_WRITE ); } - } else { - $sql = "SELECT DISTINCT rl_f_title FROM rebuildlinks WHERE rl_to='{$to}'"; - $res2 = wfQuery( $sql, DB_WRITE ); - - $sql = "INSERT INTO links (l_from,l_to) VALUES "; - $first = true; - while ( $row2 = wfFetchObject( $res2 ) ) - { - if (! $first) - $sql .= ","; - else - $first = false; - $from = addslashes( $row2->rl_f_title ); - $sql .= "('{$from}',{$id})"; - } - wfFreeResult( $res2 ); - if ( ! $first ) { wfQuery( $sql, DB_WRITE ); } - } - if ( ( ++$count % 1000 ) == 0 ) { - print "$count of $total titles processed.\n"; + if( count( $this->mSet ) > 0 ){ + return $this->next(); + } } + return false; } - wfFreeResult( $res ); - - $sql = "UNLOCK TABLES"; - wfQuery( $sql, DB_WRITE ); - - $sql = "DROP TABLE rebuildlinks"; - wfQuery( $sql, DB_WRITE ); } + ?> diff --git a/maintenance/rebuildlinks.php b/maintenance/rebuildlinks.php index 9405a037c4..8423a6a001 100644 --- a/maintenance/rebuildlinks.php +++ b/maintenance/rebuildlinks.php @@ -25,8 +25,7 @@ set_time_limit(0); $wgDBuser = $wgDBadminuser; $wgDBpassword = $wgDBadminpassword; -rebuildLinkTablesPass1(); -rebuildLinkTablesPass2(); +rebuildLinkTables(); print "Done.\n"; exit();