From d7b9e71465b59d9e03e1ca1cfd691631d7d8b42e Mon Sep 17 00:00:00 2001
From: "Merlijn S. van Deen"
Date: Tue, 6 Jan 2009 02:10:22 +0000
Subject: [PATCH] Updated deleteLinksFromNonexistent function:

- refreshLinks.inc:
  * New algorithm, conform Brions description in bug #16112; instead of
    one big delete, it is split up in blocks of (by default) 100
    incorrect page_ids to remove.
  * Added function parameters
- refreshLinks.php
  * New command-line parameter to set the number of page_ids to clean
    per batch.
  * Re-instated deleteLinksFromNonexistent run
---
 maintenance/refreshLinks.inc | 109 ++++++++++++++++++++++++++++-------
 maintenance/refreshLinks.php |  22 +++----
 2 files changed, 96 insertions(+), 35 deletions(-)

diff --git a/maintenance/refreshLinks.inc b/maintenance/refreshLinks.inc
index 036d4109c8..3408d1b351 100644
--- a/maintenance/refreshLinks.inc
+++ b/maintenance/refreshLinks.inc
@@ -136,13 +136,35 @@ function fixLinksFromArticle( $id ) {
 	$dbw->immediateCommit();
 }
 
-function deleteLinksFromNonexistent( $maxLag = 0 ) {
+/*
+ * Removes rows from the pagelinks, imagelinks, categorylinks,
+ * templatelinks and externallinks tables whose source page id
+ * (the *_from column) has no matching row in the page table.
+ *
+ * The orphaned ids are selected on a slave connection and deleted on
+ * the master in batches, rather than with one big multi-table DELETE.
+ *
+ * @param $maxLag Passed to wfWaitForSlaves() before any work is done
+ * @param $batchSize The number of page ids to delete per batch; values
+ *                   below 1 fall back to 100
+ *
+ * @author Merlijn van Deen
+ */
+function deleteLinksFromNonexistent( $maxLag = 0, $batchSize = 100 ) {
 	$fname = 'deleteLinksFromNonexistent';
-
 	wfWaitForSlaves( $maxLag );
-
+
 	$dbw = wfGetDB( DB_MASTER );
-
+	$dbr = wfGetDB( DB_SLAVE );
+	$dbr->bufferResults(false);
+
+	// A missing or non-positive batch size (e.g. an unset command-line
+	// option) would break the modulo below; fall back to the default.
+	$batchSize = intval( $batchSize );
+	if ( $batchSize < 1 ) {
+		$batchSize = 100;
+	}
+
 	$linksTables = array(
 		'pagelinks' => 'pl_from',
 		'imagelinks' => 'il_from',
@@ -150,27 +172,66 @@ function deleteLinksFromNonexistent( $maxLag = 0 ) {
 		'templatelinks' => 'tl_from',
 		'externallinks' => 'el_from',
 	);
-
-	$page = $dbw->tableName( 'page' );
-
-
+
+	$readPage = $dbr->tableName( 'page' );
 	foreach ( $linksTables as $table => $field ) {
-		if ( !$dbw->ping() ) {
-			print "DB disconnected, reconnecting...";
-			while ( !$dbw->ping() ) {
-				print ".";
-				sleep(10);
-			}
-			print "\n";
-		}
-
-		$pTable = $dbw->tableName( $table );
-		$sql = "DELETE $pTable FROM $pTable LEFT JOIN $page ON page_id=$field WHERE page_id IS NULL";
-
-		print "Deleting $table from non-existent articles...";
-		$dbw->query( $sql, $fname );
-		print " fixed " .$dbw->affectedRows() . " row(s)\n";
+		$readLinks = $dbr->tableName( $table );
+
+		$sql = "SELECT DISTINCT( $field ) FROM $readLinks LEFT JOIN $readPage ON $field=page_id WHERE page_id IS NULL;";
+		print "Retrieving illegal entries from $table: ";
+		$results = $dbr->query( $sql, $fname . ':' . $readLinks );
+
+		// $dbr is unbuffered, so numRows() cannot be relied on before the
+		// whole result has been fetched; count the rows while iterating.
+		$counter = 0;
+		$list = array();
+		print "0..";
+		foreach( $results as $row ) {
+			$counter++;
+			$list[] = $row->$field;
+			if ( ( $counter % $batchSize ) == 0 ) {
+				print $counter . "..";
+				deleteBatch($dbw, $table, $field, $list);
+				// Reset to an empty array (not a string) so that the
+				// final deleteBatch() call sees count() == 0.
+				$list = array();
+			}
+		}
+		// Flush the last, partial batch (a no-op when it is empty)
+		deleteBatch($dbw, $table, $field, $list);
+		print $counter . " illegal " . $field . "s removed\n";
 	}
 }
 
-?>
+/*
+ * Deletes a batch of items from a table.
+ * Runs the query: DELETE FROM <$table> WHERE <$field> IN (<$list>)
+ *
+ * @param $dbw Database Database object to run the DELETE query on
+ * @param $table table to work on; will be converted via $dbw->tableName.
+ * @param $field column to search in
+ * @param $list values to remove. Array with SQL-safe (!) values; an
+ *              empty or non-array value makes this function a no-op.
+ *
+ * @author Merlijn van Deen
+ */
+function deleteBatch($dbw, $table, $field, $list) {
+	// Nothing to delete; also avoids emitting an invalid "IN ()" clause
+	if ( !is_array( $list ) || count( $list ) == 0 ) return;
+
+	$masterLinks = $dbw->tableName( $table );
+	$fname = 'deleteBatch:' . $masterLinks;
+
+	// Poll every 10 seconds until ping() succeeds before deleting
+	if ( !$dbw->ping() ) {
+		print "\nDB disconnected, reconnecting...";
+		while ( !$dbw->ping() ) {
+			print ".";
+			sleep(10);
+		}
+		print "\n";
+	}
+
+	$sql = "DELETE FROM $masterLinks WHERE $field IN (" . join("," , $list) . ");";
+	$dbw->query($sql, $fname);
+}
diff --git a/maintenance/refreshLinks.php b/maintenance/refreshLinks.php
index 4893d58019..81baa0714b 100644
--- a/maintenance/refreshLinks.php
+++ b/maintenance/refreshLinks.php
@@ -18,14 +18,16 @@ Usage:
                          [--new-only] [--redirects-only]
     php refreshLinks.php [] [-e ] [-m ] --old-redirects-only
 
-    --help               : This help message
-    --dfn-only           : Delete links from nonexistent articles only
-    --new-only           : Only affect articles with just a single edit
-    --redirects-only     : Only fix redirects, not all links
-    --old-redirects-only : Only fix redirects with no redirect table entry
-    -m                   : Maximum replication lag
-                         : First page id to refresh
-    -e                   : Last page id to refresh
+    --help                : This help message
+    --dfn-only            : Delete links from nonexistent articles only
+    --batch-size          : The delete batch size when removing links from
+                            nonexistent articles (default 100)
+    --new-only            : Only affect articles with just a single edit
+    --redirects-only      : Only fix redirects, not all links
+    --old-redirects-only  : Only fix redirects with no redirect table entry
+    -m                    : Maximum replication lag
+                          : First page id to refresh
+    -e                    : Last page id to refresh
 
 TEXT;
 	exit(0);
@@ -44,10 +46,8 @@ if ( !$options['dfn-only'] ) {
 }
 // this bit's bad for replication: disabling temporarily
 // --brion 2005-07-16
-//deleteLinksFromNonexistent();
+deleteLinksFromNonexistent($options['m'], $options['batch-size']);
 
 if ( $options['globals'] ) {
 	print_r( $GLOBALS );
 }
-
-
-- 
2.20.1