Only store currently-existing categories in the categories table
authorBrad Jorsch <bjorsch@wikimedia.org>
Wed, 13 Jul 2016 15:30:37 +0000 (11:30 -0400)
committerBrad Jorsch <bjorsch@wikimedia.org>
Mon, 18 Jul 2016 16:52:19 +0000 (12:52 -0400)
A "currently-existing category" is defined as a category that either
contains any pages or has a description page. Thus:
* Category::initialize() now schedules an update to insert a row if the
  title exits but the row is missing.
* Category::refreshCounts() now removes the row if the title doesn't
  exist and the category is empty.
* WikiPage::onArticleCreate() loads the Category object, to trigger
  bullet #1.
* WikiPage::updateCategoryCounts() refreshes the counts if it results in
  the row showing 0 pages, to trigger bullet #2.
* LinksDeletionUpdate refreshes the counts if the row shows 0 pages, to
  trigger bullet #2.

A maintenance script is provided to update the category table for this
new definition.

Bug: T28411
Bug: T50824
Change-Id: I0f0adf124c181ae5d3c7c95b3b5fb275a725794c

autoload.php
includes/Category.php
includes/deferred/LinksDeletionUpdate.php
includes/installer/DatabaseUpdater.php
includes/page/WikiPage.php
maintenance/cleanupEmptyCategories.php [new file with mode: 0644]
maintenance/mssql/tables.sql
maintenance/tables.sql

index d82d699..9dff36a 100644 (file)
@@ -245,6 +245,7 @@ $wgAutoloadLocalClasses = [
        'ClassCollector' => __DIR__ . '/includes/utils/AutoloadGenerator.php',
        'CleanupAncientTables' => __DIR__ . '/maintenance/cleanupAncientTables.php',
        'CleanupBlocks' => __DIR__ . '/maintenance/cleanupBlocks.php',
+       'CleanupEmptyCategories' => __DIR__ . '/maintenance/cleanupEmptyCategories.php',
        'CleanupPreferences' => __DIR__ . '/maintenance/cleanupPreferences.php',
        'CleanupRemovedModules' => __DIR__ . '/maintenance/cleanupRemovedModules.php',
        'CleanupSpam' => __DIR__ . '/maintenance/cleanupSpam.php',
index 28b566a..531e0be 100644 (file)
@@ -79,6 +79,11 @@ class Category {
                                $this->mSubcats = 0;
                                $this->mFiles = 0;
 
+                               # If the title exists, call refreshCounts to add a row for it.
+                               if ( $this->mTitle->exists() ) {
+                                       DeferredUpdates::addCallableUpdate( [ $this, 'refreshCounts' ] );
+                               }
+
                                return true;
                        } else {
                                return false; # Fail
@@ -331,21 +336,35 @@ class Category {
                        [ 'LOCK IN SHARE MODE' ]
                );
 
+               $shouldExist = $result->pages > 0 || $this->getTitle()->exists();
+
                if ( $this->mID ) {
-                       # The category row already exists, so do a plain UPDATE instead
-                       # of INSERT...ON DUPLICATE KEY UPDATE to avoid creating a gap
-                       # in the cat_id sequence. The row may or may not be "affected".
-                       $dbw->update(
-                               'category',
-                               [
-                                       'cat_pages' => $result->pages,
-                                       'cat_subcats' => $result->subcats,
-                                       'cat_files' => $result->files
-                               ],
-                               [ 'cat_title' => $this->mName ],
-                               __METHOD__
-                       );
-               } else {
+                       if ( $shouldExist ) {
+                               # The category row already exists, so do a plain UPDATE instead
+                               # of INSERT...ON DUPLICATE KEY UPDATE to avoid creating a gap
+                               # in the cat_id sequence. The row may or may not be "affected".
+                               $dbw->update(
+                                       'category',
+                                       [
+                                               'cat_pages' => $result->pages,
+                                               'cat_subcats' => $result->subcats,
+                                               'cat_files' => $result->files
+                                       ],
+                                       [ 'cat_title' => $this->mName ],
+                                       __METHOD__
+                               );
+                       } else {
+                               # The category is empty and has no description page, delete it
+                               $dbw->delete(
+                                       'category',
+                                       [ 'cat_title' => $this->mName ],
+                                       __METHOD__
+                               );
+                               $this->mID = false;
+                       }
+               } elseif ( $shouldExist ) {
+                       # The category row doesn't exist but should, so create it. Use
+                       # upsert in case of races.
                        $dbw->upsert(
                                'category',
                                [
@@ -362,6 +381,8 @@ class Category {
                                ],
                                __METHOD__
                        );
+                       // @todo: Should we update $this->mID here? Or not since Category
+                       // objects tend to be short lived enough to not matter?
                }
 
                $dbw->endAtomic( __METHOD__ );
index a7c39ca..b60ed8a 100644 (file)
@@ -61,6 +61,8 @@ class LinksDeletionUpdate extends SqlDataUpdate implements EnqueueableDataUpdate
                // This handles the case when updates have to batched into several COMMITs.
                $scopedLock = LinksUpdate::acquirePageLock( $this->mDb, $id );
 
+               $title = $this->page->getTitle();
+
                // Delete restrictions for it
                $this->mDb->delete( 'page_restrictions', [ 'pr_page' => $id ], __METHOD__ );
 
@@ -80,6 +82,20 @@ class LinksDeletionUpdate extends SqlDataUpdate implements EnqueueableDataUpdate
                        }
                }
 
+               // Refresh the category table entry if it seems to have no pages. Check
+               // master for the most up-to-date cat_pages count.
+               if ( $title->getNamespace() === NS_CATEGORY ) {
+                       $row = $this->mDb->selectRow(
+                               'category',
+                               [ 'cat_id', 'cat_title', 'cat_pages', 'cat_subcats', 'cat_files' ],
+                               [ 'cat_title' => $title->getDBkey(), 'cat_pages <= 0' ],
+                               __METHOD__
+                       );
+                       if ( $row ) {
+                               $cat = Category::newFromRow( $row, $title )->refreshCounts();
+                       }
+               }
+
                // If using cascading deletes, we can skip some explicit deletes
                if ( !$this->mDb->cascadingDeletes() ) {
                        // Delete outgoing links
@@ -132,7 +148,6 @@ class LinksDeletionUpdate extends SqlDataUpdate implements EnqueueableDataUpdate
 
                // If using cleanup triggers, we can skip some manual deletes
                if ( !$this->mDb->cleanupTriggers() ) {
-                       $title = $this->page->getTitle();
                        // Find recentchanges entries to clean up...
                        $rcIdsForTitle = $this->mDb->selectFieldValues(
                                'recentchanges',
index 6a20abc..0d8137c 100644 (file)
@@ -75,6 +75,7 @@ abstract class DatabaseUpdater {
                PopulateFilearchiveSha1::class,
                PopulateBacklinkNamespace::class,
                FixDefaultJsonContentPages::class,
+               CleanupEmptyCategories::class,
        ];
 
        /**
index b06b519..b64604e 100644 (file)
@@ -3279,6 +3279,14 @@ class WikiPage implements Page, IDBAccessObject {
                $title->touchLinks();
                $title->purgeSquid();
                $title->deleteTitleProtection();
+
+               if ( $title->getNamespace() == NS_CATEGORY ) {
+                       // Load the Category object, which will schedule a job to create
+                       // the category table row if necessary. Checking a slave is ok
+                       // here, in the worst case it'll run an unnecessary recount job on
+                       // a category that probably doesn't have many members.
+                       Category::newFromTitle( $title )->getID();
+               }
        }
 
        /**
@@ -3525,6 +3533,22 @@ class WikiPage implements Page, IDBAccessObject {
                                        $cat = Category::newFromName( $catName );
                                        Hooks::run( 'CategoryAfterPageRemoved', [ $cat, $this, $id ] );
                                }
+
+                               // Refresh counts on categories that should be empty now, to
+                               // trigger possible deletion. Check master for the most
+                               // up-to-date cat_pages.
+                               if ( count( $deleted ) ) {
+                                       $rows = $dbw->select(
+                                               'category',
+                                               [ 'cat_id', 'cat_title', 'cat_pages', 'cat_subcats', 'cat_files' ],
+                                               [ 'cat_title' => $deleted, 'cat_pages <= 0' ],
+                                               $method
+                                       );
+                                       foreach ( $rows as $row ) {
+                                               $cat = Category::newFromRow( $row );
+                                               $cat->refreshCounts();
+                                       }
+                               }
                        }
                );
        }
diff --git a/maintenance/cleanupEmptyCategories.php b/maintenance/cleanupEmptyCategories.php
new file mode 100644 (file)
index 0000000..b8a246e
--- /dev/null
@@ -0,0 +1,204 @@
+<?php
+/**
+ * Clean up empty categories in the category table.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup Maintenance
+ */
+
+require_once __DIR__ . '/Maintenance.php';
+
+/**
+ * Maintenance script to clean up empty categories in the category table.
+ *
+ * @ingroup Maintenance
+ * @since 1.28
+ */
+class CleanupEmptyCategories extends LoggedUpdateMaintenance {
+
+       public function __construct() {
+               parent::__construct();
+               $this->addDescription(
+                       <<<TEXT
+This script will clean up the category table by removing entries for empty
+categories without a description page and adding entries for empty categories
+with a description page. It will print out progress indicators every batch. The
+script is perfectly safe to run on large, live wikis, and running it multiple
+times is harmless. You may want to use the throttling options if it's causing
+too much load; they will not affect correctness.
+
+If the script is stopped and later resumed, you can use the --mode and --begin
+options with the last printed progress indicator to pick up where you left off.
+
+When the script has finished, it will make a note of this in the database, and
+will not run again without the --force option.
+TEXT
+               );
+
+               $this->addOption(
+                       'mode',
+                       '"add" empty categories with description pages, "remove" empty categories '
+                       . 'without description pages, or "both"',
+                       false,
+                       true
+               );
+               $this->addOption(
+                       'begin',
+                       'Only do categories whose names are alphabetically after the provided name',
+                       false,
+                       true
+               );
+               $this->addOption(
+                       'throttle',
+                       'Wait this many milliseconds after each batch. Default: 0',
+                       false,
+                       true
+               );
+       }
+
+       protected function getUpdateKey() {
+               return 'cleanup empty categories';
+       }
+
+       protected function doDBUpdates() {
+               $mode = $this->getOption( 'mode', 'both' );
+               $begin = $this->getOption( 'begin', '' );
+               $throttle = $this->getOption( 'throttle', 0 );
+
+               if ( !in_array( $mode, [ 'add', 'remove', 'both' ] ) ) {
+                       $this->output( "--mode must be 'add', 'remove', or 'both'.\n" );
+                       return false;
+               }
+
+               $dbw = $this->getDB( DB_MASTER );
+
+               $throttle = intval( $throttle );
+
+               if ( $mode === 'add' || $mode === 'both' ) {
+                       if ( $begin !== '' ) {
+                               $where = [ 'page_title > ' . $dbw->addQuotes( $begin ) ];
+                       } else {
+                               $where = [];
+                       }
+
+                       $this->output( "Adding empty categories with description pages...\n" );
+                       while ( true ) {
+                               # Find which category to update
+                               $rows = $dbw->select(
+                                       [ 'page', 'category' ],
+                                       'page_title',
+                                       array_merge( $where, [
+                                               'page_namespace' => NS_CATEGORY,
+                                               'cat_title' => null,
+                                       ] ),
+                                       __METHOD__,
+                                       [
+                                               'ORDER BY' => 'page_title',
+                                               'LIMIT' => $this->mBatchSize,
+                                       ],
+                                       [
+                                               'category' => [ 'LEFT JOIN', 'page_title = cat_title' ],
+                                       ]
+                               );
+                               if ( !$rows || $rows->numRows() <= 0 ) {
+                                       # Done, hopefully.
+                                       break;
+                               }
+
+                               foreach ( $rows as $row ) {
+                                       $name = $row->page_title;
+                                       $where = [ 'page_title > ' . $dbw->addQuotes( $name ) ];
+
+                                       # Use the row to update the category count
+                                       $cat = Category::newFromName( $name );
+                                       if ( !is_object( $cat ) ) {
+                                               $this->output( "The category named $name is not valid?!\n" );
+                                       } else {
+                                               $cat->refreshCounts();
+                                       }
+                               }
+                               $this->output( "--mode=$mode --begin=$name\n" );
+
+                               wfWaitForSlaves();
+                               usleep( $throttle * 1000 );
+                       }
+
+                       $begin = '';
+               }
+
+               if ( $mode === 'remove' || $mode === 'both' ) {
+                       if ( $begin !== '' ) {
+                               $where = [ 'cat_title > ' . $dbw->addQuotes( $begin ) ];
+                       } else {
+                               $where = [];
+                       }
+                       $i = 0;
+
+                       $this->output( "Removing empty categories without description pages...\n" );
+                       while ( true ) {
+                               # Find which category to update
+                               $rows = $dbw->select(
+                                       [ 'category', 'page' ],
+                                       'cat_title',
+                                       array_merge( $where, [
+                                               'page_title' => null,
+                                               'cat_pages' => 0,
+                                       ] ),
+                                       __METHOD__,
+                                       [
+                                               'ORDER BY' => 'cat_title',
+                                               'LIMIT' => $this->mBatchSize,
+                                       ],
+                                       [
+                                               'page' => [ 'LEFT JOIN', [
+                                                       'page_namespace' => NS_CATEGORY, 'page_title = cat_title'
+                                               ] ],
+                                       ]
+                               );
+                               if ( !$rows || $rows->numRows() <= 0 ) {
+                                       # Done, hopefully.
+                                       break;
+                               }
+                               foreach ( $rows as $row ) {
+                                       $name = $row->cat_title;
+                                       $where = [ 'cat_title > ' . $dbw->addQuotes( $name ) ];
+
+                                       # Use the row to update the category count
+                                       $cat = Category::newFromName( $name );
+                                       if ( !is_object( $cat ) ) {
+                                               $this->output( "The category named $name is not valid?!\n" );
+                                       } else {
+                                               $cat->refreshCounts();
+                                       }
+                               }
+
+                               $this->output( "--mode=remove --begin=$name\n" );
+
+                               wfWaitForSlaves();
+                               usleep( $throttle * 1000 );
+                       }
+               }
+
+               $this->output( "Category cleanup complete.\n" );
+
+               return true;
+       }
+}
+
+$maintClass = 'CleanupEmptyCategories';
+require_once RUN_MAINTENANCE_IF_MAIN;
index 12cfed8..48b2250 100644 (file)
@@ -336,9 +336,9 @@ CREATE INDEX /*i*/cl_timestamp ON /*_*/categorylinks (cl_to,cl_timestamp);
 CREATE INDEX /*i*/cl_collation_ext ON /*_*/categorylinks (cl_collation, cl_to, cl_type, cl_from);
 
 --
--- Track all existing categories.  Something is a category if 1) it has an en-
--- try somewhere in categorylinks, or 2) it once did.  Categories might not
--- have corresponding pages, so they need to be tracked separately.
+-- Track all existing categories. Something is a category if 1) it has an entry
+-- somewhere in categorylinks, or 2) it has a description page. Categories
+-- might not have corresponding pages, so they need to be tracked separately.
 --
 CREATE TABLE /*_*/category (
   -- Primary key
index 89aeb9c..9c9bdfb 100644 (file)
@@ -623,9 +623,9 @@ CREATE INDEX /*i*/cl_timestamp ON /*_*/categorylinks (cl_to,cl_timestamp);
 CREATE INDEX /*i*/cl_collation_ext ON /*_*/categorylinks (cl_collation, cl_to, cl_type, cl_from);
 
 --
--- Track all existing categories.  Something is a category if 1) it has an en-
--- try somewhere in categorylinks, or 2) it once did.  Categories might not
--- have corresponding pages, so they need to be tracked separately.
+-- Track all existing categories. Something is a category if 1) it has an entry
+-- somewhere in categorylinks, or 2) it has a description page. Categories
+-- might not have corresponding pages, so they need to be tracked separately.
 --
 CREATE TABLE /*_*/category (
   -- Primary key