Merge "New maintenance script to clean up rows with invalid DB keys"
authorjenkins-bot <jenkins-bot@gerrit.wikimedia.org>
Mon, 24 Apr 2017 15:19:29 +0000 (15:19 +0000)
committerGerrit Code Review <gerrit@wikimedia.org>
Mon, 24 Apr 2017 15:19:30 +0000 (15:19 +0000)
autoload.php
includes/libs/rdbms/database/Database.php
includes/libs/rdbms/database/DatabaseMssql.php
includes/libs/rdbms/database/DatabaseMysqlBase.php
includes/libs/rdbms/database/DatabaseSqlite.php
maintenance/cleanupInvalidDbKeys.php [new file with mode: 0644]
tests/phpunit/includes/db/DatabaseSQLTest.php

index 09e42f6..f609ffc 100644 (file)
@@ -259,6 +259,7 @@ $wgAutoloadLocalClasses = [
        'CleanupAncientTables' => __DIR__ . '/maintenance/cleanupAncientTables.php',
        'CleanupBlocks' => __DIR__ . '/maintenance/cleanupBlocks.php',
        'CleanupEmptyCategories' => __DIR__ . '/maintenance/cleanupEmptyCategories.php',
+       'CleanupInvalidDbKeys' => __DIR__ . '/maintenance/cleanupInvalidDbKeys.php',
        'CleanupPreferences' => __DIR__ . '/maintenance/cleanupPreferences.php',
        'CleanupRemovedModules' => __DIR__ . '/maintenance/cleanupRemovedModules.php',
        'CleanupSpam' => __DIR__ . '/maintenance/cleanupSpam.php',
index 0915b7d..88a883a 100644 (file)
@@ -2019,8 +2019,10 @@ abstract class Database implements IDatabase, IMaintainableDatabase, LoggerAware
         * @param string $s
         * @return string
         */
-       protected function escapeLikeInternal( $s ) {
-               return addcslashes( $s, '\%_' );
+       protected function escapeLikeInternal( $s, $escapeChar = '`' ) {
+               return str_replace( [ $escapeChar, '%', '_' ],
+                       [ "{$escapeChar}{$escapeChar}", "{$escapeChar}%", "{$escapeChar}_" ],
+                       $s );
        }
 
        public function buildLike() {
@@ -2032,15 +2034,21 @@ abstract class Database implements IDatabase, IMaintainableDatabase, LoggerAware
 
                $s = '';
 
+               // We use ` instead of \ as the default LIKE escape character, since addQuotes()
+               // may escape backslashes, creating problems of double escaping. The `
+               // character has good cross-DBMS compatibility, avoiding special operators
+               // in MS SQL like ^ and %
+               $escapeChar = '`';
+
                foreach ( $params as $value ) {
                        if ( $value instanceof LikeMatch ) {
                                $s .= $value->toString();
                        } else {
-                               $s .= $this->escapeLikeInternal( $value );
+                               $s .= $this->escapeLikeInternal( $value, $escapeChar );
                        }
                }
 
-               return " LIKE {$this->addQuotes( $s )} ";
+               return ' LIKE ' . $this->addQuotes( $s ) . ' ESCAPE ' . $this->addQuotes( $escapeChar ) . ' ';
        }
 
        public function anyChar() {
index c736cc7..782727a 100644 (file)
@@ -1110,27 +1110,11 @@ class DatabaseMssql extends Database {
         * @param string $s
         * @return string
         */
-       protected function escapeLikeInternal( $s ) {
-               return addcslashes( $s, '\%_[]^' );
-       }
-
-       /**
-        * MS SQL requires specifying the escape character used in a LIKE query
-        * or using Square brackets to surround characters that are to be escaped
-        * https://msdn.microsoft.com/en-us/library/ms179859.aspx
-        * Here we take the Specify-Escape-Character approach since it's less
-        * invasive, renders a query that is closer to other DB's and better at
-        * handling square bracket escaping
-        *
-        * @return string Fully built LIKE statement
-        */
-       public function buildLike() {
-               $params = func_get_args();
-               if ( count( $params ) > 0 && is_array( $params[0] ) ) {
-                       $params = $params[0];
-               }
-
-               return parent::buildLike( $params ) . " ESCAPE '\' ";
+       protected function escapeLikeInternal( $s, $escapeChar = '`' ) {
+               return str_replace( [ $escapeChar, '%', '_', '[', ']', '^' ],
+                       [ "{$escapeChar}{$escapeChar}", "{$escapeChar}%", "{$escapeChar}_",
+                               "{$escapeChar}[", "{$escapeChar}]", "{$escapeChar}^" ],
+                       $s );
        }
 
        /**
index 9824caf..50ead83 100644 (file)
@@ -532,9 +532,11 @@ abstract class DatabaseMysqlBase extends Database {
                        return true; // already known to exist and won't show in SHOW TABLES anyway
                }
 
-               $encLike = $this->buildLike( $table );
+               // We can't use buildLike() here, because it specifies an escape character
+               // other than the backslash, which is the only one supported by SHOW TABLES
+               $encLike = $this->escapeLikeInternal( $table, '\\' );
 
-               return $this->query( "SHOW TABLES $encLike", $fname )->numRows() > 0;
+               return $this->query( "SHOW TABLES LIKE '$encLike'", $fname )->numRows() > 0;
        }
 
        /**
index 779b4c9..60b6855 100644 (file)
@@ -812,18 +812,6 @@ class DatabaseSqlite extends Database {
                }
        }
 
-       /**
-        * @return string
-        */
-       function buildLike() {
-               $params = func_get_args();
-               if ( count( $params ) > 0 && is_array( $params[0] ) ) {
-                       $params = $params[0];
-               }
-
-               return parent::buildLike( $params ) . "ESCAPE '\' ";
-       }
-
        /**
         * @param string $field Field or column to cast
         * @return string
diff --git a/maintenance/cleanupInvalidDbKeys.php b/maintenance/cleanupInvalidDbKeys.php
new file mode 100644 (file)
index 0000000..423686e
--- /dev/null
@@ -0,0 +1,310 @@
+<?php
+/**
+ * Cleans up invalid titles in various tables.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup Maintenance
+ */
+
+require_once __DIR__ . '/Maintenance.php';
+
+/**
+ * Maintenance script that cleans up invalid titles in various tables.
+ *
+ * @since 1.29
+ * @ingroup Maintenance
+ */
+class CleanupInvalidDbKeys extends Maintenance {
+       /** @var array List of tables to clean up, and the field prefix for that table */
+       protected static $tables = [
+               // Data tables
+               [ 'page', 'page' ],
+               [ 'redirect', 'rd', 'idField' => 'rd_from' ],
+               [ 'archive', 'ar' ],
+               [ 'logging', 'log' ],
+               [ 'protected_titles', 'pt', 'idField' => 0 ],
+               [ 'category', 'cat', 'nsField' => 14 ],
+               [ 'recentchanges', 'rc' ],
+               [ 'watchlist', 'wl' ],
+               // The querycache tables' qc(c)_title and qcc_titletwo may contain titles,
+               // but also usernames or other things like that, so we leave them alone
+
+               // Links tables
+               [ 'pagelinks', 'pl', 'idField' => 'pl_from' ],
+               [ 'templatelinks', 'tl', 'idField' => 'tl_from' ],
+               [ 'categorylinks', 'cl', 'idField' => 'cl_from', 'nsField' => 14, 'titleField' => 'cl_to' ],
+       ];
+
+       public function __construct() {
+               parent::__construct();
+               $this->addDescription( <<<'TEXT'
+This script cleans up the title fields in various tables to remove entries that
+will be rejected by the constructor of TitleValue.  This constructor throws an
+exception when invalid data is encountered, which will not normally occur on
+regular page views, but can happen on query special pages.
+
+The script targets titles matching the regular expression /^_|[ \r\n\t]|_$/.
+Because any foreign key relationships involving these titles will already be
+broken, the titles are corrected to a valid version or the rows are deleted
+entirely, depending on the table.
+
+Key progress output is printed to STDERR, while a full log of all entries that
+are deleted is sent to STDOUT. You are strongly advised to capture STDOUT into
+a file.
+TEXT
+               );
+               $this->addOption( 'fix', 'Actually clean up invalid titles. If this parameter is ' .
+                       'not specified, the script will report invalid titles but not clean them up.',
+                       false, false );
+               $this->addOption( 'table', 'The table(s) to process. This option can be specified ' .
+                       'more than once (e.g. -t category -t watchlist). If not specified, all available ' .
+                       'tables will be processed. Available tables are: ' .
+                       implode( ', ', array_column( static::$tables, 0 ) ), false, true, 't', true );
+
+               $this->setBatchSize( 500 );
+       }
+
+       public function execute() {
+               $tablesToProcess = $this->getOption( 'table' );
+               foreach ( static::$tables as $tableParams ) {
+                       if ( !$tablesToProcess || in_array( $tableParams[0], $tablesToProcess ) ) {
+                               $this->cleanupTable( $tableParams );
+                       }
+               }
+
+               $this->output( 'Done! Cleaned up invalid DB keys on ' . wfWikiID() . "!\n" );
+       }
+
+       /**
+        * Prints text to STDOUT, and STDERR if STDOUT was redirected to a file.
+        * Used for progress reporting.
+        *
+        * @param string $str Text to write to both places
+        * @param string|null $channel Ignored
+        */
+       protected function output( $str, $channel = null ) {
+               // Make it easier to find progress lines in the STDOUT log
+               if ( trim( $str ) ) {
+                       fwrite( STDOUT, '*** ' );
+               }
+               fwrite( STDERR, $str );
+       }
+
+       /**
+        * Prints text to STDOUT. Used for logging output.
+        *
+        * @param string $str Text to write
+        */
+       protected function writeToReport( $str ) {
+               fwrite( STDOUT, $str );
+       }
+
+       /**
+        * Identifies, and optionally cleans up, invalid titles.
+        *
+        * @param array $tableParams A child array of self::$tables
+        */
+       protected function cleanupTable( $tableParams ) {
+               $table = $tableParams[0];
+               $prefix = $tableParams[1];
+               $idField = isset( $tableParams['idField'] ) ?
+                       $tableParams['idField'] :
+                       "{$prefix}_id";
+               $nsField = isset( $tableParams['nsField'] ) ?
+                       $tableParams['nsField'] :
+                       "{$prefix}_namespace";
+               $titleField = isset( $tableParams['titleField'] ) ?
+                       $tableParams['titleField'] :
+                       "{$prefix}_title";
+
+               $this->output( "Looking for invalid $titleField entries in $table...\n" );
+
+               // Do all the select queries on the replicas, as they are slow (they use
+               // unanchored LIKEs). Naturally this could cause problems if rows are
+               // modified after selecting and before deleting/updating, but working on
+               // the hypothesis that invalid rows will be old and in all likelihood
+               // unreferenced, we should be fine to do it like this.
+               $dbr = $this->getDB( DB_REPLICA, 'vslow' );
+
+               // Find all TitleValue-invalid titles.
+               $percent = $dbr->anyString(); // DBMS-agnostic equivalent of '%' LIKE wildcard
+               $res = $dbr->select(
+                       $table,
+                       [
+                               'id' => $idField,
+                               'ns' => $nsField,
+                               'title' => $titleField,
+                       ],
+                       // The REGEXP operator is not cross-DBMS, so we have to use lots of LIKEs
+                       [ $dbr->makeList( [
+                               $titleField . $dbr->buildLike( $percent, ' ', $percent ),
+                               $titleField . $dbr->buildLike( $percent, '\r', $percent ),
+                               $titleField . $dbr->buildLike( $percent, '\n', $percent ),
+                               $titleField . $dbr->buildLike( $percent, '\t', $percent ),
+                               $titleField . $dbr->buildLike( '_', $percent ),
+                               $titleField . $dbr->buildLike( $percent, '_' ),
+                       ], LIST_OR ) ],
+                       __METHOD__,
+                       [ 'LIMIT' => $this->mBatchSize ]
+               );
+
+               $this->output( "Number of invalid rows: " . $res->numRows() . "\n" );
+               if ( !$res->numRows() ) {
+                       $this->output( "\n" );
+                       return;
+               }
+
+               // Write a table of titles to the report file. Also keep a list of the found
+               // IDs, as we might need it later for DB updates
+               $this->writeToReport( sprintf( "%10s |  ns | dbkey\n", $idField ) );
+               $ids = [];
+               foreach ( $res as $row ) {
+                       $this->writeToReport( sprintf( "%10d | %3d | %s\n", $row->id, $row->ns, $row->title ) );
+                       $ids[] = $row->id;
+               }
+
+               // If we're doing a dry run, output the new titles we would use for the UPDATE
+               // queries (if relevant), and finish
+               if ( !$this->hasOption( 'fix' ) ) {
+                       if ( $table === 'logging' || $table === 'archive' ) {
+                               $this->writeToReport( "The following updates would be run with the --fix flag:\n" );
+                               foreach ( $res as $row ) {
+                                       $newTitle = self::makeValidTitle( $row->title );
+                                       $this->writeToReport(
+                                               "$idField={$row->id}: update '{$row->title}' to '$newTitle'\n" );
+                               }
+                       }
+
+                       if ( $table !== 'page' && $table !== 'redirect' ) {
+                               $this->output( "Run with --fix to clean up these rows\n" );
+                       }
+                       $this->output( "\n" );
+                       return;
+               }
+
+               // Fix the bad data, using different logic for the various tables
+               $dbw = $this->getDB( DB_MASTER );
+               switch ( $table ) {
+                       case 'page':
+                       case 'redirect':
+                               // This shouldn't happen on production wikis, and we already have a script
+                               // to handle 'page' rows anyway, so just notify the user and let them decide
+                               // what to do next.
+                               $this->output( <<<TEXT
+IMPORTANT: This script does not fix invalid entries in the $table table.
+Consider repairing these rows, and rows in related tables, by hand.
+You may like to run, or borrow logic from, the cleanupTitles.php script.
+
+TEXT
+                               );
+                               break;
+
+                       case 'archive':
+                       case 'logging':
+                               // Rename the title to a corrected equivalent. Any foreign key relationships
+                               // to the page_title field are already broken, so this will just make sure
+                               // users can still access the log entries/deleted revisions from the interface
+                               // using a valid page title.
+                               $this->output(
+                                       "Updating these rows, setting $titleField to the closest valid DB key...\n" );
+                               $affectedRowCount = 0;
+                               foreach ( $res as $row ) {
+                                       $newTitle = self::makeValidTitle( $row->title );
+                                       $this->writeToReport(
+                                               "$idField={$row->id}: updating '{$row->title}' to '$newTitle'\n" );
+
+                                       $dbw->update( $table,
+                                               [ $titleField => $newTitle ],
+                                               [ $idField => $row->id ],
+                                               __METHOD__ );
+                                       $affectedRowCount += $dbw->affectedRows();
+                               }
+                               wfWaitForSlaves();
+                               $this->output( "Updated $affectedRowCount rows on $table.\n" );
+
+                               break;
+
+                       case 'recentchanges':
+                       case 'watchlist':
+                       case 'category':
+                               // Since these broken titles can't exist, there's really nothing to watch,
+                               // nothing can be categorised in them, and they can't have been changed
+                               // recently, so we can just remove these rows.
+                               $this->output( "Deleting invalid $table rows...\n" );
+                               $dbw->delete( $table, [ $idField => $ids ], __METHOD__ );
+                               wfWaitForSlaves();
+                               $this->output( 'Deleted ' . $dbw->affectedRows() . " rows from $table.\n" );
+                               break;
+
+                       case 'protected_titles':
+                               // Since these broken titles can't exist, there's really nothing to protect,
+                               // so we can just remove these rows. Made more complicated by this table
+                               // not having an ID field
+                               $this->output( "Deleting invalid $table rows...\n" );
+                               $affectedRowCount = 0;
+                               foreach ( $res as $row ) {
+                                       $dbw->delete( $table,
+                                               [ $nsField => $row->ns, $titleField => $row->title ],
+                                               __METHOD__ );
+                                       $affectedRowCount += $dbw->affectedRows();
+                               }
+                               wfWaitForSlaves();
+                               $this->output( "Deleted $affectedRowCount rows from $table.\n" );
+                               break;
+
+                       case 'pagelinks':
+                       case 'templatelinks':
+                       case 'categorylinks':
+                               // Update links tables for each page where these bogus links are supposedly
+                               // located. If the invalid rows don't go away after these jobs go through,
+                               // they're probably being added by a buggy hook.
+                               $this->output( "Queueing link update jobs for the pages in $idField...\n" );
+                               foreach ( $res as $row ) {
+                                       $wp = WikiPage::newFromID( $row->id );
+                                       if ( $wp ) {
+                                               RefreshLinks::fixLinksFromArticle( $row->id );
+                                       } else {
+                                               // This link entry points to a nonexistent page, so just get rid of it
+                                               $dbw->delete( $table,
+                                                       [ $idField => $row->id, $nsField => $row->ns, $titleField => $row->title ],
+                                                       __METHOD__ );
+                                       }
+                               }
+                               wfWaitForSlaves();
+                               $this->output( "Link update jobs have been added to the job queue.\n" );
+                               break;
+               }
+
+               $this->output( "\n" );
+               return;
+       }
+
+       /**
+        * Fix possible validation issues in the given title (DB key).
+        *
+        * @param string $invalidTitle
+        * @return string
+        */
+       protected static function makeValidTitle( $invalidTitle ) {
+               return strtr( trim( $invalidTitle, '_' ),
+                       [ ' ' => '_', "\r" => '', "\n" => '', "\t" => '_' ] );
+       }
+}
+
+$maintClass = 'CleanupInvalidDbKeys';
+require_once RUN_MAINTENANCE_IF_MAIN;
index 57754bc..b6088ff 100644 (file)
@@ -660,19 +660,31 @@ class DatabaseSQLTest extends MediaWikiTestCase {
                return [
                        [
                                'text',
-                               "LIKE 'text'"
+                               "LIKE 'text' ESCAPE '`'"
                        ],
                        [
                                [ 'text', new LikeMatch( '%' ) ],
-                               "LIKE 'text%'"
+                               "LIKE 'text%' ESCAPE '`'"
                        ],
                        [
                                [ 'text', new LikeMatch( '%' ), 'text2' ],
-                               "LIKE 'text%text2'"
+                               "LIKE 'text%text2' ESCAPE '`'"
                        ],
                        [
                                [ 'text', new LikeMatch( '_' ) ],
-                               "LIKE 'text_'"
+                               "LIKE 'text_' ESCAPE '`'"
+                       ],
+                       [
+                               'more_text',
+                               "LIKE 'more`_text' ESCAPE '`'"
+                       ],
+                       [
+                               [ 'C:\\Windows\\', new LikeMatch( '%' ) ],
+                               "LIKE 'C:\\Windows\\%' ESCAPE '`'"
+                       ],
+                       [
+                               [ 'accent`_test`', new LikeMatch( '%' ) ],
+                               "LIKE 'accent```_test``%' ESCAPE '`'"
                        ],
                ];
        }