count; print "$total\n"; print "Finding highest article id\n"; $sql = "SELECT MIN(cur_id) AS min, MAX(cur_id) AS max FROM cur"; $res = wfQuery( $sql, DB_READ ); $obj = wfFetchObject( $res ); $cur_pulser = new SelectPulser("SELECT cur_id,cur_namespace,cur_title,cur_text " . "FROM cur WHERE cur_id ", $obj->min, $obj->max, 100); $brokenlinks_inserter = new InsertBuffer( "INSERT IGNORE INTO brokenlinks (bl_from,bl_to) VALUES " , $rowbuf_size); $links_inserter = new InsertBuffer( "INSERT IGNORE INTO links (l_from,l_to) VALUES ", $rowbuf_size); $imagelinks_inserter = new InsertBuffer("INSERT IGNORE INTO imagelinks ". "(il_from,il_to) VALUES ", $rowbuf_size); print "Starting processing\n"; $ins = $wgLang->getNsText( Namespace::getImage() ); $inslen = strlen($ins)+1; $tc = Title::legalChars(); $titleCache = new MRUCache( 10000 ); $titlecount = 0; $start_time = time(); while ( $row = $cur_pulser->next() ) { $from_id = intval($row->cur_id); $ns = $wgLang->getNsText( $row->cur_namespace ); $from_full_title = $row->cur_title; if ( "" != $ns ) { $from_full_title = "$ns:{$from_full_title}"; } $from_full_title_with_slashes = addslashes( $from_full_title ); $text = $row->cur_text; $numlinks = preg_match_all( "/\\[\\[([{$tc}]+)(]|\\|)/", $text, $m, PREG_PATTERN_ORDER ); $seen_dbtitles = array(); // seen links (normalized and with ns, see below) $titles_ready_for_insertion = array(); $titles_needing_curdata = array(); $titles_needing_curdata_pos = array(); $links_corresponding_to_titles = array(); for ( $i = 0 ; $i < $numlinks; ++$i ) { $link = $m[1][$i]; if( preg_match( '/^(http|https|ftp|mailto|news):/', $m[1][$i] ) ) { # an URL link; not for us! continue; } # FIXME: Handle subpage links $nt = $titleCache->get( $link ); if( $nt != false ){ // Only process each unique link once per page $nt_key = $nt->getDBkey() . $nt->getNamespace(); if( isset( $seen_dbtitles[$nt_key] ) ) continue; $seen_dbtitles[$nt_key] = 1; $titles_ready_for_insertion[] = $nt; } else { $nt = Title::newFromText( $link ); if (! $nt) { // Invalid link, probably something like "[[ ]]" continue; } // Only process each unique link once per page $nt_key = $nt->getDBkey() . $nt->getNamespace(); if( isset( $seen_dbtitles[$nt_key] ) ) continue; $seen_dbtitles[$nt_key] = 1; if( $nt->getInterwiki() != "" ) { # Interwiki links are not stored in the link tables continue; } if( $nt->getNamespace() == Namespace::getSpecial() ) { # Special links not stored in link tables continue; } if( $nt->getNamespace() == Namespace::getMedia() ) { # treat media: links as image: links $nt = Title::makeTitle( Namespace::getImage(), $nt->getDBkey() ); } $nt->mArticleID = 0; // assume broken link until proven otherwise $pos = array_push($titles_needing_curdata, $nt) - 1; $titles_needing_curdata_pos[$nt->getDBkey() . $nt->getNamespace()] = $pos; $links_corresponding_to_titles[] = $link; unset( $link ); // useless outside this loop, but tempting } } if ( count( $titles_needing_curdata ) > 0 ){ $parts = array(); foreach ($titles_needing_curdata as $nt ) { $parts[] = " (cur_namespace = " . $nt->getNamespace() . " AND " . "cur_title='" . wfStrencode( $nt->getDBkey() ) . "')"; } $sql = "SELECT cur_namespace, cur_title, cur_id FROM cur_fast WHERE " . implode(" OR ", $parts); $res = wfQuery( $sql, DB_WRITE ); while($row = wfFetchObject( $res ) ){ $pos = $titles_needing_curdata_pos[$row->cur_title . $row->cur_namespace]; $titles_needing_curdata[$pos]->mArticleID = intval($row->cur_id); } for( $k = 0; $k < count( $titles_needing_curdata ) ; $k++) { $tmplink = $links_corresponding_to_titles[$k]; $titleCache->set( $tmplink, $titles_needing_curdata[$k] ); $titles_ready_for_insertion[] = $titles_needing_curdata[$k]; } } foreach ( $titles_ready_for_insertion as $nt ) { $dest_noslashes = $nt->getPrefixedDBkey(); $dest = addslashes( $dest_noslashes ); $dest_id = $nt->getArticleID(); $from = $from_full_title_with_slashes; # print "\nLINK '$from_full_title' ($from_id) -> '$dest' ($dest_id)\n"; if ( 0 == strncmp( "$ins:", $dest_noslashes, $inslen ) ) { $iname = addslashes( substr( $dest_noslashes, $inslen ) ); $imagelinks_inserter->insert( "('{$from}','{$iname}')" ); } else if ( 0 == $dest_id ) { $brokenlinks_inserter->insert( "({$from_id},'{$dest}')" ); } else { $links_inserter->insert( "('{$from}',{$dest_id})" ); } $titlecount++; } if ( ( $count % 20 ) == 0 ) print "."; if ( ( ++$count % 1000 ) == 0 ) { $dt = time() - $start_time; $start_time = time(); $rps = persec(1000, $dt); $tps = persec($titlecount, $dt); $titlecount = 0; print "\n$count of $total articles scanned ({$rps} articles ". "and {$tps} titles per second)\n"; print "Title cache hits: " . $titleCache->getPerformance() . "%\n"; } } print "\nFlushing insertion buffers..."; $imagelinks_inserter->flush(); $links_inserter->flush(); $brokenlinks_inserter->flush(); print "ok\n"; print "$count articles scanned.\n"; $sql = "UNLOCK TABLES"; wfQuery( $sql, DB_WRITE ); print "Done\n"; } /* private */ function persec($n, $t){ if($n == 0) return "zero"; if($t == 0) return "lots of"; return intval($n/$t); } # InsertBuffer increases performance slightly by inserting many rows # at once. The gain is small (<5%) when running against a local, idle # database, but may be significant in other circumstances. It also # limits the number of inserted rows uppwards, which should avoid # problems with huge articles and certain mysql settings that limits # the size of queries. It's also convenient. class InsertBuffer { /* private */ var $mBuf, $mSql, $mBufcount, $mMaxsize; function InsertBuffer( $sql, $bufsize ){ $this->mSql = $sql; $this->mBuf = array(); $this->mBufcount = 0; $this->mMaxsize = $bufsize; } function insert( $value ){ // print $this->mSql . " -> " . $value . "\n"; $this->mBuf[] = $value; $this->mBufcount++; if($this->mBufcount > $this->mMaxsize){ $this->flush(); } } function flush(){ if( $this->mBufcount > 0 ){ $sql = $this->mSql . implode(",", $this->mBuf); wfQuery( $sql, DB_WRITE ); $this->mBuf = array(); $this->mBufcount = 0; // print "Wrote query of size " . strlen( $sql ) . "\n"; } } } # Select parts from a large table by using the "BETWEEN X AND Y" # operator on the id column. Avoids buffering the whole thing in # RAM. It's also convenient. class SelectPulser { /* private */ var $mSql, $mSetsize, $mPos, $mMax, $mSet; function SelectPulser( $sql, $min, $max, $setsize) { $this->mSql = $sql; $this->mSet = array(); $this->mPos = $min; $this->mMax = $max; $this->mSetsize = $setsize; } function next(){ $result = current( $this->mSet ); next( $this->mSet ); if( false !== $result ){ return $result; } while( $this->mPos <= $this->mMax ){ $this->mSet = array(); $sql = $this->mSql . " BETWEEN " . $this->mPos . " AND " . ($this->mPos + $this->mSetsize - 1); $this->mPos += $this->mSetsize; $res = wfQuery( $sql, DB_READ ); while ( $row = wfFetchObject( $res ) ) { $this->mSet[] = $row; } wfFreeResult( $res ); if( count( $this->mSet ) > 0 ){ return $this->next(); } } return false; } } # A simple MRU for general cacheing. class MRUCache { /* private */ var $mMru, $mCache, $mSize, $mPurgefreq, $nexti; /* private */ var $hits, $misses; function MRUCache( $size, $purgefreq = -1 ) { // purgefreq is 1/10 of $size if not stated $purgefreq = ($purgefreq == -1 ? intval($size/10) : $purgefreq); $purgefreq = ($purgefreq <= 0 ? 1 : $purgefreq); $this->mSize = $size; $this->mMru = array(); $this->mCache = array(); $this->mPurgefreq = $purgefreq; $this->nexti = 1; print "purgefreq = " . $this->mPurgefreq . "\n"; } function get( $key ){ if ( ! array_key_exists( $key, $this->mCache) ){ $this->misses++; return false; } $this->hits++; $this->mMru[$key] = $this->nexti++; return $this->mCache[$key]; } function set( $key, $value ){ $this->mMru[$key] = $this->nexti++; $this->mCache[$key] = $value; if($this->nexti % $this->mPurgefreq == 0) $this->purge(); } function purge(){ $to_remove = count( $this->mMru ) - $this->mSize; if( $to_remove <= 0 ){ return; } asort( $this->mMru ); $removed = array_splice( $this->mMru, 0, $to_remove ); foreach( array_keys( $removed ) as $key ){ unset( $this->mCache[$key] ); } } function getPerformance(){ $tot = $this->hits + $this->misses; if($tot > 0) return intval(100.0 * $this->hits / $tot); else return 0; } } ?>