* Maintenance script to delete non-current revisions
* Maintenance script to wipe a page and all revisions from the database
* Maintenance script to reassign edits from one user to another
+* Maintenance script to find and remove links to a given domain (cleanupSpam.php)
i18n / Languages:
* Partial support for Basque language (from wikipedia and meta)
// Reverse the labels in the hostname, convert to lower case
$reversedHost = strtolower( implode( '.', array_reverse( explode( '.', $bits['host'] ) ) ) );
// Add an extra dot to the end
- if ( substr( $reversedHost, -1 ) !== '.' ) {
+ if ( substr( $reversedHost, -1, 1 ) !== '.' ) {
$reversedHost .= '.';
}
// Reconstruct the pseudo-URL
return $index;
}
+/**
+ * Run every queued update, then empty both queues.
+ * The deferred update list is processed first, the post-commit list second.
+ * TODO: This could be in Wiki.php if that class made any sense at all
+ */
+function wfDoUpdates()
+{
+	global $wgDeferredUpdateList, $wgPostCommitUpdateList;
+
+	// Phase 1: deferred updates
+	foreach ( $wgDeferredUpdateList as $deferred ) {
+		$deferred->doUpdate();
+	}
+
+	// Phase 2: post-commit updates
+	foreach ( $wgPostCommitUpdateList as $postCommit ) {
+		$postCommit->doUpdate();
+	}
+
+	// Both queues are reset only after everything above has run
+	$wgPostCommitUpdateList = array();
+	$wgDeferredUpdateList = array();
+}
+
?>
--- /dev/null
+<?php
+
+/**
+ * Some functions to help implement an external link filter for spam control.
+ *
+ * TODO: implement the filter. Currently these are just some functions to help
+ * maintenance/cleanupSpam.php remove links to a single specified domain. The
+ * next thing is to implement functions for checking a given page against a big
+ * list of domains.
+ *
+ * Another cool thing to do would be a web interface for fast spam removal.
+ */
+class LinkFilter {
+	/**
+	 * Check whether a text contains an http:// link matching a filter entry.
+	 *
+	 * @param string $text Text to search
+	 * @param string $filterEntry Domain specification, see makeRegex()
+	 * @return mixed Result of preg_match(): 1 if a link matched, 0 if not,
+	 *   false if the regex failed
+	 * @static
+	 */
+	function matchEntry( $text, $filterEntry ) {
+		$regex = LinkFilter::makeRegex( $filterEntry );
+		return preg_match( $regex, $text );
+	}
+
+	/**
+	 * Build a PCRE pattern that finds http:// links to a filter entry.
+	 *
+	 * The pattern uses "!" as delimiter and the modifiers S (study) and
+	 * i (case-insensitive). A leading "*." in the entry is replaced by an
+	 * optional run of subdomain labels; the rest of the entry is matched
+	 * literally.
+	 *
+	 * @param string $filterEntry e.g. "domain.com" or "*.domain.com/path"
+	 * @return string Complete regex including delimiters and modifiers
+	 * @static
+	 */
+	function makeRegex( $filterEntry ) {
+		$regex = '!http://';
+		if ( substr( $filterEntry, 0, 2 ) == '*.' ) {
+			// Either nothing, or any number of labels ending in a dot
+			$regex .= '([A-Za-z0-9.-]+\.|)';
+			$filterEntry = substr( $filterEntry, 2 );
+		}
+		$regex .= preg_quote( $filterEntry, '!' ) . '!Si';
+		return $regex;
+	}
+
+	/**
+	 * Make a string to go after an SQL LIKE, which will match the specified
+	 * string. There are several kinds of filter entry:
+	 *     *.domain.com    -  Produces http://com.domain.%, matches domain.com
+	 *                        and www.domain.com
+	 *     domain.com      -  Produces http://com.domain./%, matches domain.com
+	 *                        or domain.com/ but not www.domain.com
+	 *     *.domain.com/x  -  Produces http://com.domain.%/x%, matches
+	 *                        www.domain.com/xy
+	 *     domain.com/x    -  Produces http://com.domain./x%, matches
+	 *                        domain.com/xy but not www.domain.com/xy
+	 *
+	 * Asterisks in any other location are considered invalid.
+	 *
+	 * The LIKE metacharacters "%" and "_" (and the escape character "\")
+	 * occurring in the entry itself are backslash-escaped, so that they only
+	 * match themselves. The returned value is still a plain string: it must be
+	 * quoted by the database layer before being placed into a query.
+	 *
+	 * @param string $filterEntry Domain specification as described above
+	 * @return mixed LIKE pattern string, or false if the entry is invalid
+	 * @static
+	 */
+	function makeLike( $filterEntry ) {
+		if ( substr( $filterEntry, 0, 2 ) == '*.' ) {
+			$subdomains = true;
+			$filterEntry = substr( $filterEntry, 2 );
+			if ( $filterEntry == '' ) {
+				// We don't want to make a clause that will match everything,
+				// that could be dangerous
+				return false;
+			}
+		} else {
+			$subdomains = false;
+		}
+		// No stray asterisks, that could cause confusion
+		// It's not simple or efficient to handle it properly so we don't
+		// handle it at all.
+		if ( strpos( $filterEntry, '*' ) !== false ) {
+			return false;
+		}
+		// Split into host and path; a missing path is treated as "/"
+		$slash = strpos( $filterEntry, '/' );
+		if ( $slash !== false ) {
+			$path = substr( $filterEntry, $slash );
+			$host = substr( $filterEntry, 0, $slash );
+		} else {
+			$path = '/';
+			$host = $filterEntry;
+		}
+		// Reverse the labels in the hostname, convert to lower case and make
+		// sure it ends in a dot
+		$host = strtolower( implode( '.', array_reverse( explode( '.', $host ) ) ) );
+		if ( substr( $host, -1, 1 ) !== '.' ) {
+			$host .= '.';
+		}
+		$like = 'http://' . LinkFilter::escapeLike( $host );
+
+		if ( $subdomains ) {
+			$like .= '%';
+		}
+		// With a subdomain wildcard and no explicit path, the trailing % added
+		// above already covers every path
+		if ( !$subdomains || $path !== '/' ) {
+			$like .= LinkFilter::escapeLike( $path ) . '%';
+		}
+		return $like;
+	}
+
+	/**
+	 * Backslash-escape the characters that are special inside a LIKE pattern
+	 * ("\", "%" and "_") so that they are matched literally.
+	 *
+	 * @param string $s Literal text
+	 * @return string Text safe to embed in a LIKE pattern
+	 * @static
+	 * @private
+	 */
+	function escapeLike( $s ) {
+		return strtr( $s, array( '\\' => '\\\\', '%' => '\\%', '_' => '\\_' ) );
+	}
+}
+?>
'categoryarticlecount1' => "There is $1 article in this category.",
'usenewcategorypage' => "1\n\nSet first character to \"0\" to disable the new category page layout.",
'listingcontinuesabbrev' => " cont.",
+'spambot_username' => 'MediaWiki spam cleanup',
+'spam_reverting' => 'Reverting to last version not containing links to $1',
+'spam_blanking' => 'All revisions contained links to $1, blanking',
# Info page
'infosubtitle' => 'Information for page',
);
-?>
\ No newline at end of file
+?>
--- /dev/null
+<?php
+
+require_once( 'commandLine.inc' );
+require_once( "$IP/includes/LinkFilter.php" );
+
+/**
+ * Remove links to a domain from one page by going back to the most recent
+ * revision whose text does not match the domain; if every revision matches,
+ * the page is saved with empty text instead. Progress is printed to stdout.
+ *
+ * @param int $id Page ID
+ * @param string $domain Domain specification understood by LinkFilter::makeRegex()
+ */
+function cleanupArticle( $id, $domain ) {
+	$title = Title::newFromID( $id );
+	if ( !$title ) {
+		print "Internal error: no page for ID $id\n";
+		return;
+	}
+
+	print $title->getPrefixedDBkey() . " ...";
+	$rev = Revision::newFromTitle( $title );
+	if ( !$rev ) {
+		// Without this guard the getId() call below would be a fatal error
+		// whenever the current revision cannot be loaded
+		print "Internal error: no current revision\n";
+		return;
+	}
+	$revId = $rev->getId();
+	$currentRevId = $revId;
+	$regex = LinkFilter::makeRegex( $domain );
+
+	// Walk backwards through the history until a clean revision is found or
+	// the history is exhausted ($rev becomes false)
+	while ( $rev && preg_match( $regex, $rev->getText() ) ) {
+		# Revision::getPrevious can't be used in this way before MW 1.6 (Revision.php 1.26)
+		#$rev = $rev->getPrevious();
+		$revId = $title->getPreviousRevisionID( $revId );
+		if ( $revId ) {
+			$rev = Revision::newFromTitle( $title, $revId );
+		} else {
+			$rev = false;
+		}
+	}
+
+	if ( $revId == $currentRevId ) {
+		// The regex didn't match the current article text
+		// This happens e.g. when a link comes from a template rather than the page itself
+		print "False match\n";
+		return;
+	}
+
+	$dbw =& wfGetDB( DB_MASTER );
+	$dbw->immediateBegin();
+	if ( !$rev ) {
+		// Didn't find a non-spammy revision, blank the page
+		print "blanking\n";
+		$text = '';
+		$summary = wfMsg( 'spam_blanking', $domain );
+	} else {
+		// Revert to this revision
+		print "reverting\n";
+		$text = $rev->getText();
+		$summary = wfMsg( 'spam_reverting', $domain );
+	}
+	$article = new Article( $title );
+	$article->updateArticle( $text, $summary, false, false );
+	$dbw->immediateCommit();
+	// Run the updates queued by the save now, since this is a command-line
+	// script processing many pages in one run
+	wfDoUpdates();
+}
+//------------------------------------------------------------------------------
+
+// Validate the command line first, so that a usage error has no side effects
+// (in particular, no user account is created)
+if ( !isset( $args[0] ) ) {
+	print "Usage: php cleanupSpam.php <hostname>\n";
+	exit(1);
+}
+$spec = $args[0];
+$like = LinkFilter::makeLike( $spec );
+if ( !$like ) {
+	print "Not a valid hostname specification: $spec\n";
+	exit(1);
+}
+
+// All edits are attributed to a dedicated user whose name comes from the
+// 'spambot_username' message
+$username = wfMsg( 'spambot_username' );
+$fname = $username;
+$wgUser = User::newFromName( $username );
+if ( !$wgUser ) {
+	// NOTE(review): assumes newFromName() yields a false-ish value when the
+	// (customisable) message is not a usable username -- confirm
+	print "Invalid username for spam cleanup: $username\n";
+	exit(1);
+}
+// Create the user if necessary
+if ( !$wgUser->getID() ) {
+	$wgUser->addToDatabase();
+}
+
+$dbr =& wfGetDB( DB_SLAVE );
+
+// DISTINCT: a page holding several matching links must be reported and
+// processed only once
+$res = $dbr->select( 'externallinks', array( 'DISTINCT el_from' ),
+	array( 'el_index LIKE ' . $dbr->addQuotes( $like ) ), $fname );
+$count = $dbr->numRows( $res );
+print "Found $count articles containing $spec\n";
+while ( $row = $dbr->fetchObject( $res ) ) {
+	cleanupArticle( $row->el_from, $spec );
+}
+if ( $count ) {
+	print "Done\n";
+}
+
+?>