From 34912bff40a595a9c34fa68683be53d6669e146a Mon Sep 17 00:00:00 2001 From: Tim Starling Date: Fri, 27 Jan 2006 23:37:19 +0000 Subject: [PATCH] Maintenance script to find and remove links to a given domain (cleanupSpam.php) --- RELEASE-NOTES | 1 + includes/GlobalFunctions.php | 19 +++++++- includes/LinkFilter.php | 92 ++++++++++++++++++++++++++++++++++++ languages/Messages.php | 5 +- maintenance/cleanupSpam.php | 88 ++++++++++++++++++++++++++++++++++ 5 files changed, 203 insertions(+), 2 deletions(-) create mode 100644 includes/LinkFilter.php create mode 100644 maintenance/cleanupSpam.php diff --git a/RELEASE-NOTES b/RELEASE-NOTES index 5b50d8a19e..adf76cc9c7 100644 --- a/RELEASE-NOTES +++ b/RELEASE-NOTES @@ -136,6 +136,7 @@ Maintenance: * Maintenance script to delete non-current revisions * Maintenance script to wipe a page and all revisions from the database * Maintenance script to reassign edits from one user to another +* Maintenance script to find and remove links to a given domain (cleanupSpam.php) i18n / Languages: * Partial support for Basque language (from wikipedia and meta) diff --git a/includes/GlobalFunctions.php b/includes/GlobalFunctions.php index a3f5b4a2d7..607cd0dc96 100644 --- a/includes/GlobalFunctions.php +++ b/includes/GlobalFunctions.php @@ -1821,7 +1821,7 @@ function wfMakeUrlIndex( $url ) { // Reverse the labels in the hostname, convert to lower case $reversedHost = strtolower( implode( '.', array_reverse( explode( '.', $bits['host'] ) ) ) ); // Add an extra dot to the end - if ( substr( $reversedHost, -1 ) !== '.' ) { + if ( substr( $reversedHost, -1, 1 ) !== '.' 
) { $reversedHost .= '.'; } // Reconstruct the pseudo-URL @@ -1838,4 +1838,21 @@ function wfMakeUrlIndex( $url ) { return $index; } +/** + * Do any deferred updates and clear the list + * TODO: This could be in Wiki.php if that class made any sense at all + */ +function wfDoUpdates() +{ + global $wgPostCommitUpdateList, $wgDeferredUpdateList; + foreach ( $wgDeferredUpdateList as $update ) { + $update->doUpdate(); + } + foreach ( $wgPostCommitUpdateList as $update ) { + $update->doUpdate(); + } + $wgDeferredUpdateList = array(); + $wgPostCommitUpdateList = array(); +} + ?> diff --git a/includes/LinkFilter.php b/includes/LinkFilter.php new file mode 100644 index 0000000000..0008c6a1e2 --- /dev/null +++ b/includes/LinkFilter.php @@ -0,0 +1,92 @@ + diff --git a/languages/Messages.php b/languages/Messages.php index fd8dc06012..5733cfd9af 100644 --- a/languages/Messages.php +++ b/languages/Messages.php @@ -1438,6 +1438,9 @@ In the latter case you can also use a link, e.g. [[{{ns:Special}}:Export/{{Media 'categoryarticlecount1' => "There is $1 article in this category.", 'usenewcategorypage' => "1\n\nSet first character to \"0\" to disable the new category page layout.", 'listingcontinuesabbrev' => " cont.", +'spambot_username' => 'MediaWiki spam cleanup', +'spam_reverting' => 'Reverting to last version not containing links to $1', +'spam_blanking' => 'All revisions contained links to $1, blanking', # Info page 'infosubtitle' => 'Information for page', @@ -1927,4 +1930,4 @@ Please confirm that really want to recreate this article.', ); -?> \ No newline at end of file +?> diff --git a/maintenance/cleanupSpam.php b/maintenance/cleanupSpam.php new file mode 100644 index 0000000000..01b1f631cb --- /dev/null +++ b/maintenance/cleanupSpam.php @@ -0,0 +1,88 @@ +getPrefixedDBkey() . 
" ..."; + $rev = Revision::newFromTitle( $title ); + $reverted = false; + $revId = $rev->getId(); + $currentRevId = $revId; + $regex = LinkFilter::makeRegex( $domain ); + + while ( $rev && preg_match( $regex, $rev->getText() ) ) { + # Revision::getPrevious can't be used in this way before MW 1.6 (Revision.php 1.26) + #$rev = $rev->getPrevious(); + $revId = $title->getPreviousRevisionID( $revId ); + if ( $revId ) { + $rev = Revision::newFromTitle( $title, $revId ); + } else { + $rev = false; + } + } + if ( $revId == $currentRevId ) { + // The regex didn't match the current article text + // This happens e.g. when a link comes from a template rather than the page itself + print "False match\n"; + } else { + $dbw =& wfGetDB( DB_MASTER ); + $dbw->immediateBegin(); + if ( !$rev ) { + // Didn't find a non-spammy revision, blank the page + print "blanking\n"; + $article = new Article( $title ); + $article->updateArticle( '', wfMsg( 'spam_blanking', $domain ), + false, false ); + + } else { + // Revert to this revision + print "reverting\n"; + $article = new Article( $title ); + $article->updateArticle( $rev->getText(), wfMsg( 'spam_reverting', $domain ), false, false ); + } + $dbw->immediateCommit(); + wfDoUpdates(); + } +} +//------------------------------------------------------------------------------ + +$username = wfMsg( 'spambot_username' ); +$fname = $username; +$wgUser = User::newFromName( $username ); +// Create the user if necessary +if ( !$wgUser->getID() ) { + $wgUser->addToDatabase(); +} + +if ( !isset( $args[0] ) ) { + print "Usage: php cleanupSpam.php <hostname>\n"; + exit(1); +} +$spec = $args[0]; +$like = LinkFilter::makeLike( $spec ); +if ( !$like ) { + print "Not a valid hostname specification: $spec\n"; + exit(1); +} + +$dbr =& wfGetDB( DB_SLAVE ); + +$res = $dbr->select( 'externallinks', array( 'el_from' ), + array( 'el_index LIKE ' . 
$dbr->addQuotes( $like ) ), $fname ); +$count = $dbr->numRows( $res ); +print "Found $count articles containing $spec\n"; +while ( $row = $dbr->fetchObject( $res ) ) { + cleanupArticle( $row->el_from, $spec ); +} +if ( $count ) { + print "Done\n"; +} + +?> -- 2.20.1