From 05c146c191bfa58eb2b530a52c57962f94e864a7 Mon Sep 17 00:00:00 2001 From: "Merlijn S. van Deen" Date: Wed, 7 Jan 2009 19:51:36 +0000 Subject: [PATCH] Recommit of r45431 with these changes: * Removed backspace characters in output * Small code update * Removed 'disabled' comment from refreshLinks.php --- maintenance/refreshLinks.inc | 95 +++++++++++++++++++++++++++--------- maintenance/refreshLinks.php | 25 +++++----- 2 files changed, 84 insertions(+), 36 deletions(-) diff --git a/maintenance/refreshLinks.inc b/maintenance/refreshLinks.inc index 036d4109c8..f38426a765 100644 --- a/maintenance/refreshLinks.inc +++ b/maintenance/refreshLinks.inc @@ -136,41 +136,90 @@ function fixLinksFromArticle( $id ) { $dbw->immediateCommit(); } -function deleteLinksFromNonexistent( $maxLag = 0 ) { +/* + * Removes non-existing links from pages from pagelinks, imagelinks, + * categorylinks, templatelinks and externallinks tables. + * + * @param $maxLag + * @param $batchSize The size of deletion batches + * + * @author Merlijn van Deen + */ +function deleteLinksFromNonexistent( $maxLag = 0, $batchSize = 100 ) { $fname = 'deleteLinksFromNonexistent'; - wfWaitForSlaves( $maxLag ); - + $dbw = wfGetDB( DB_MASTER ); - - $linksTables = array( + $dbr = wfGetDB( DB_SLAVE ); + $dbr->bufferResults(false); + + $linksTables = array( // table name => page_id field 'pagelinks' => 'pl_from', 'imagelinks' => 'il_from', 'categorylinks' => 'cl_from', 'templatelinks' => 'tl_from', 'externallinks' => 'el_from', ); - - $page = $dbw->tableName( 'page' ); - - + + $readPage = $dbr->tableName( 'page' ); + foreach ( $linksTables as $table => $field ) { - if ( !$dbw->ping() ) { - print "DB disconnected, reconnecting..."; - while ( !$dbw->ping() ) { - print "."; - sleep(10); + $readLinks = $dbr->tableName( $table ); + + print "Retrieving illegal entries from $table... "; + + $sql = "SELECT DISTINCT( $field ) FROM $readLinks LEFT JOIN $readPage ON $field=page_id WHERE page_id IS NULL;"; + $results = $dbr->query( $sql, $fname . ':' . $readLinks ); + + print $results->numRows() . " illegal " . $field. "s. "; + + if ( $results->numRows() > 0 ) { + $counter = 0; + $list = array(); + print "Removing illegal links: 1.."; + + foreach( $results as $row ) { + $counter++; + $list[] = $row->$field; + if ( ( $counter % $batchSize ) == 0 ) { + print $counter . ".."; + deleteBatch($dbw, $table, $field, $list); + $list = array(); + } } - print "\n"; + print $counter; + deleteBatch($dbw, $table, $field, $list); } - - $pTable = $dbw->tableName( $table ); - $sql = "DELETE $pTable FROM $pTable LEFT JOIN $page ON page_id=$field WHERE page_id IS NULL"; - - print "Deleting $table from non-existent articles..."; - $dbw->query( $sql, $fname ); - print " fixed " .$dbw->affectedRows() . " row(s)\n"; + + print "\n"; } } -?> +/* Deletes a batch of items from a table. + * Runs the query: DELETE FROM <$table> WHERE <$field> IN (<$list>) + * + * @param $dbw Database Database object to run the DELETE query on + * @param $table table to work on; will be converted via $dbw->tableName. + * @param $field column to search in + * @param $list values to remove. Array with SQL-safe (!) values. + * + * @author Merlijn van Deen + */ +function deleteBatch($dbw, $table, $field, $list) { + if (count($list) == 0) return; + + $masterLinks = $dbw->tableName( $table ); + $fname = "deleteBatch:masterLinks"; + + if ( !$dbw->ping() ) { + print "\nDB disconnected, reconnecting..."; + while ( !$dbw->ping() ) { + print "."; + sleep(10); + } + print "\n"; + } + + $sql = "DELETE FROM $masterLinks WHERE $field IN (" . join("," , $list) . ");"; + $dbw->query($sql, $fname); +} diff --git a/maintenance/refreshLinks.php b/maintenance/refreshLinks.php index 4893d58019..aa91e1b510 100644 --- a/maintenance/refreshLinks.php +++ b/maintenance/refreshLinks.php @@ -18,14 +18,16 @@ Usage: [--new-only] [--redirects-only] php refreshLinks.php [] [-e ] [-m ] --old-redirects-only - --help : This help message - --dfn-only : Delete links from nonexistent articles only - --new-only : Only affect articles with just a single edit - --redirects-only : Only fix redirects, not all links - --old-redirects-only : Only fix redirects with no redirect table entry - -m : Maximum replication lag - : First page id to refresh - -e : Last page id to refresh + --help : This help message + --dfn-only : Delete links from nonexistent articles only + --batch-size : The delete batch size when removing links from + nonexistent articles (default 100) + --new-only : Only affect articles with just a single edit + --redirects-only : Only fix redirects, not all links + --old-redirects-only : Only fix redirects with no redirect table entry + -m : Maximum replication lag + : First page id to refresh + -e : Last page id to refresh TEXT; exit(0); @@ -42,12 +44,9 @@ if ( !$options['dfn-only'] ) { refreshLinks( $start, $options['new-only'], $options['m'], $options['e'], $options['redirects-only'], $options['old-redirects-only'] ); } -// this bit's bad for replication: disabling temporarily -// --brion 2005-07-16 -//deleteLinksFromNonexistent(); + +deleteLinksFromNonexistent($options['m'], $options['batch-size']); if ( $options['globals'] ) { print_r( $GLOBALS ); } - - -- 2.20.1