From 13054a4c70c6ee6cb25b32d460fe85a06dbaf69b Mon Sep 17 00:00:00 2001 From: Max Semenik Date: Tue, 29 Nov 2016 15:04:07 -0800 Subject: [PATCH] refreshLinks.php: allow refreshing by categories, tracking or not Needed for selective updates of pages using a particular feature. Intended to be run in production, so needs to scale. Bug: T149723 Change-Id: If20fb1f91de8d4227def5b07d6d52b91161ed3fd --- RELEASE-NOTES-1.29 | 2 + autoload.php | 1 + includes/TrackingCategories.php | 130 ++++++++++++++++++ includes/parser/ParserOutput.php | 2 + .../specials/SpecialTrackingCategories.php | 103 +------------- maintenance/refreshLinks.php | 105 +++++++++++++- 6 files changed, 239 insertions(+), 104 deletions(-) create mode 100644 includes/TrackingCategories.php diff --git a/RELEASE-NOTES-1.29 b/RELEASE-NOTES-1.29 index d0738e276b..3bf50ac623 100644 --- a/RELEASE-NOTES-1.29 +++ b/RELEASE-NOTES-1.29 @@ -206,6 +206,8 @@ changes to languages because of Phabricator reports. * Article::doEditContent() was marked as deprecated, to be removed in 1.30 or later. * ContentHandler::runLegacyHooks() was removed. +* refreshLinks.php now can be limited to a particular category with --category=... + or a tracking category with --tracking-category=... == Compatibility == diff --git a/autoload.php b/autoload.php index 7ed08dfaee..e7c97ad049 100644 --- a/autoload.php +++ b/autoload.php @@ -1459,6 +1459,7 @@ $wgAutoloadLocalClasses = [ 'TitlePrefixSearch' => __DIR__ . '/includes/PrefixSearch.php', 'TitleValue' => __DIR__ . '/includes/title/TitleValue.php', 'TrackBlobs' => __DIR__ . '/maintenance/storage/trackBlobs.php', + 'TrackingCategories' => __DIR__ . '/includes/TrackingCategories.php', 'TraditionalImageGallery' => __DIR__ . '/includes/gallery/TraditionalImageGallery.php', 'TransactionProfiler' => __DIR__ . '/includes/libs/rdbms/TransactionProfiler.php', 'TransformParameterError' => __DIR__ . 
'/includes/media/MediaTransformOutput.php', diff --git a/includes/TrackingCategories.php b/includes/TrackingCategories.php new file mode 100644 index 0000000000..825860a5a9 --- /dev/null +++ b/includes/TrackingCategories.php @@ -0,0 +1,130 @@ +config = $config; + } + + /** + * Read the global and extract title objects from the corresponding messages + * @return array Array( 'msg' => Title, 'cats' => Title[] ) + */ + public function getTrackingCategories() { + $categories = array_merge( + self::$coreTrackingCategories, + ExtensionRegistry::getInstance()->getAttribute( 'TrackingCategories' ), + $this->config->get( 'TrackingCategories' ) // deprecated + ); + + // Only show magic link tracking categories if they are enabled + $enableMagicLinks = $this->config->get( 'EnableMagicLinks' ); + if ( $enableMagicLinks['ISBN'] ) { + $categories[] = 'magiclink-tracking-isbn'; + } + if ( $enableMagicLinks['RFC'] ) { + $categories[] = 'magiclink-tracking-rfc'; + } + if ( $enableMagicLinks['PMID'] ) { + $categories[] = 'magiclink-tracking-pmid'; + } + + $trackingCategories = []; + foreach ( $categories as $catMsg ) { + /* + * Check if the tracking category varies by namespace + * Otherwise only pages in the current namespace will be displayed + * If it does vary, show pages considering all namespaces + */ + $msgObj = wfMessage( $catMsg )->inContentLanguage(); + $allCats = []; + $catMsgTitle = Title::makeTitleSafe( NS_MEDIAWIKI, $catMsg ); + if ( !$catMsgTitle ) { + continue; + } + + // Match things like {{NAMESPACE}} and {{NAMESPACENUMBER}}. 
+ // False positives are ok, this is just an efficiency shortcut + if ( strpos( $msgObj->plain(), '{{' ) !== false ) { + $ns = MWNamespace::getValidNamespaces(); + foreach ( $ns as $namesp ) { + $tempTitle = Title::makeTitleSafe( $namesp, $catMsg ); + if ( !$tempTitle ) { + continue; + } + $catName = $msgObj->title( $tempTitle )->text(); + # Allow tracking categories to be disabled by setting them to "-" + if ( $catName !== '-' ) { + $catTitle = Title::makeTitleSafe( NS_CATEGORY, $catName ); + if ( $catTitle ) { + $allCats[] = $catTitle; + } + } + } + } else { + $catName = $msgObj->text(); + # Allow tracking categories to be disabled by setting them to "-" + if ( $catName !== '-' ) { + $catTitle = Title::makeTitleSafe( NS_CATEGORY, $catName ); + if ( $catTitle ) { + $allCats[] = $catTitle; + } + } + } + $trackingCategories[$catMsg] = [ + 'cats' => $allCats, + 'msg' => $catMsgTitle, + ]; + } + + return $trackingCategories; + } +} diff --git a/includes/parser/ParserOutput.php b/includes/parser/ParserOutput.php index 7bf848fb90..0c162b4b26 100644 --- a/includes/parser/ParserOutput.php +++ b/includes/parser/ParserOutput.php @@ -696,6 +696,8 @@ class ParserOutput extends CacheTime { * to SpecialTrackingCategories::$coreTrackingCategories, and extensions * should add to "TrackingCategories" in their extension.json. 
* + * @todo Migrate some code to TrackingCategories + * * @param string $msg Message key * @param Title $title title of the page which is being tracked * @return bool Whether the addition was successful diff --git a/includes/specials/SpecialTrackingCategories.php b/includes/specials/SpecialTrackingCategories.php index 8ff052785e..e503d92b41 100644 --- a/includes/specials/SpecialTrackingCategories.php +++ b/includes/specials/SpecialTrackingCategories.php @@ -36,26 +36,6 @@ class SpecialTrackingCategories extends SpecialPage { parent::__construct( 'TrackingCategories' ); } - /** - * Tracking categories that exist in core - * - * @var array - */ - private static $coreTrackingCategories = [ - 'index-category', - 'noindex-category', - 'duplicate-args-category', - 'expensive-parserfunction-category', - 'post-expand-template-argument-category', - 'post-expand-template-inclusion-category', - 'hidden-category-category', - 'broken-file-category', - 'node-count-exceeded-category', - 'expansion-depth-exceeded-category', - 'restricted-displaytitle-ignored', - 'deprecated-self-close-category', - ]; - function execute( $par ) { $this->setHeaders(); $this->outputHeader(); @@ -76,10 +56,11 @@ class SpecialTrackingCategories extends SpecialPage { " ); - $trackingCategories = $this->prepareTrackingCategoriesData(); + $trackingCategories = new TrackingCategories( $this->getConfig() ); + $categoryList = $trackingCategories->getTrackingCategories(); $batch = new LinkBatch(); - foreach ( $trackingCategories as $catMsg => $data ) { + foreach ( $categoryList as $catMsg => $data ) { $batch->addObj( $data['msg'] ); foreach ( $data['cats'] as $catTitle ) { $batch->addObj( $catTitle ); @@ -87,11 +68,11 @@ class SpecialTrackingCategories extends SpecialPage { } $batch->execute(); - Hooks::run( 'SpecialTrackingCategories::preprocess', [ $this, $trackingCategories ] ); + Hooks::run( 'SpecialTrackingCategories::preprocess', [ $this, $categoryList ] ); $linkRenderer = $this->getLinkRenderer(); - 
foreach ( $trackingCategories as $catMsg => $data ) { + foreach ( $categoryList as $catMsg => $data ) { $allMsgs = []; $catDesc = $catMsg . '-desc'; @@ -143,80 +124,6 @@ class SpecialTrackingCategories extends SpecialPage { $this->getOutput()->addHTML( Html::closeElement( 'table' ) ); } - /** - * Read the global and extract title objects from the corresponding messages - * @return array Array( 'msg' => Title, 'cats' => Title[] ) - */ - private function prepareTrackingCategoriesData() { - $categories = array_merge( - self::$coreTrackingCategories, - ExtensionRegistry::getInstance()->getAttribute( 'TrackingCategories' ), - $this->getConfig()->get( 'TrackingCategories' ) // deprecated - ); - - // Only show magic link tracking categories if they are enabled - $enableMagicLinks = $this->getConfig()->get( 'EnableMagicLinks' ); - if ( $enableMagicLinks['ISBN'] ) { - $categories[] = 'magiclink-tracking-isbn'; - } - if ( $enableMagicLinks['RFC'] ) { - $categories[] = 'magiclink-tracking-rfc'; - } - if ( $enableMagicLinks['PMID'] ) { - $categories[] = 'magiclink-tracking-pmid'; - } - - $trackingCategories = []; - foreach ( $categories as $catMsg ) { - /* - * Check if the tracking category varies by namespace - * Otherwise only pages in the current namespace will be displayed - * If it does vary, show pages considering all namespaces - */ - $msgObj = $this->msg( $catMsg )->inContentLanguage(); - $allCats = []; - $catMsgTitle = Title::makeTitleSafe( NS_MEDIAWIKI, $catMsg ); - if ( !$catMsgTitle ) { - continue; - } - - // Match things like {{NAMESPACE}} and {{NAMESPACENUMBER}}. 
- // False positives are ok, this is just an efficiency shortcut - if ( strpos( $msgObj->plain(), '{{' ) !== false ) { - $ns = MWNamespace::getValidNamespaces(); - foreach ( $ns as $namesp ) { - $tempTitle = Title::makeTitleSafe( $namesp, $catMsg ); - if ( !$tempTitle ) { - continue; - } - $catName = $msgObj->title( $tempTitle )->text(); - # Allow tracking categories to be disabled by setting them to "-" - if ( $catName !== '-' ) { - $catTitle = Title::makeTitleSafe( NS_CATEGORY, $catName ); - if ( $catTitle ) { - $allCats[] = $catTitle; - } - } - } - } else { - $catName = $msgObj->text(); - # Allow tracking categories to be disabled by setting them to "-" - if ( $catName !== '-' ) { - $catTitle = Title::makeTitleSafe( NS_CATEGORY, $catName ); - if ( $catTitle ) { - $allCats[] = $catTitle; - } - } - } - $trackingCategories[$catMsg] = [ - 'cats' => $allCats, - 'msg' => $catMsgTitle, - ]; - } - - return $trackingCategories; - } - protected function getGroupName() { return 'pages'; } diff --git a/maintenance/refreshLinks.php b/maintenance/refreshLinks.php index e7a4d06902..fb24a1d0e2 100644 --- a/maintenance/refreshLinks.php +++ b/maintenance/refreshLinks.php @@ -29,6 +29,8 @@ require_once __DIR__ . '/Maintenance.php'; * @ingroup Maintenance */ class RefreshLinks extends Maintenance { + const REPORTING_INTERVAL = 100; + /** @var int|bool */ protected $namespace = false; @@ -43,6 +45,8 @@ class RefreshLinks extends Maintenance { $this->addOption( 'dfn-chunk-size', 'Maximum number of existent IDs to check per ' . 
'query, default 100000', false, true ); $this->addOption( 'namespace', 'Only fix pages in this namespace', false, true ); + $this->addOption( 'category', 'Only fix pages in this category', false, true ); + $this->addOption( 'tracking-category', 'Only fix pages in this tracking category', false, true ); $this->addArg( 'start', 'Page_id to start from, default 1', false ); $this->setBatchSize( 100 ); } @@ -61,7 +65,15 @@ class RefreshLinks extends Maintenance { } else { $this->namespace = (int)$ns; } - if ( !$this->hasOption( 'dfn-only' ) ) { + if ( ( $category = $this->getOption( 'category', false ) ) !== false ) { + $title = Title::makeTitleSafe( NS_CATEGORY, $category ); + if ( !$title ) { + $this->error( "'$category' is an invalid category name!\n", true ); + } + $this->refreshCategory( $title ); + } elseif ( ( $category = $this->getOption( 'tracking-category', false ) ) !== false ) { + $this->refreshTrackingCategory( $category ); + } elseif ( !$this->hasOption( 'dfn-only' ) ) { $new = $this->getOption( 'new-only', false ); $redir = $this->getOption( 'redirects-only', false ); $oldRedir = $this->getOption( 'old-redirects-only', false ); @@ -89,7 +101,6 @@ class RefreshLinks extends Maintenance { private function doRefreshLinks( $start, $newOnly = false, $end = null, $redirectsOnly = false, $oldRedirectsOnly = false ) { - $reportingInterval = 100; $dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] ); if ( $start === null ) { @@ -124,7 +135,7 @@ class RefreshLinks extends Maintenance { $i = 0; foreach ( $res as $row ) { - if ( !( ++$i % $reportingInterval ) ) { + if ( !( ++$i % self::REPORTING_INTERVAL ) ) { $this->output( "$i\n" ); wfWaitForSlaves(); } @@ -145,7 +156,7 @@ class RefreshLinks extends Maintenance { $i = 0; foreach ( $res as $row ) { - if ( !( ++$i % $reportingInterval ) ) { + if ( !( ++$i % self::REPORTING_INTERVAL ) ) { $this->output( "$i\n" ); wfWaitForSlaves(); } @@ -166,7 +177,7 @@ class RefreshLinks extends Maintenance { for ( $id = $start; $id <= 
$end; $id++ ) { - if ( !( $id % $reportingInterval ) ) { + if ( !( $id % self::REPORTING_INTERVAL ) ) { $this->output( "$id\n" ); wfWaitForSlaves(); } @@ -179,7 +190,7 @@ class RefreshLinks extends Maintenance { for ( $id = $start; $id <= $end; $id++ ) { - if ( !( $id % $reportingInterval ) ) { + if ( !( $id % self::REPORTING_INTERVAL ) ) { $this->output( "$id\n" ); wfWaitForSlaves(); } @@ -379,6 +390,7 @@ class RefreshLinks extends Maintenance { * @param string $var Field name * @param mixed $start First value to include or null * @param mixed $end Last value to include or null + * @return string */ private static function intervalCond( IDatabase $db, $var, $start, $end ) { if ( $start === null && $end === null ) { @@ -391,6 +403,87 @@ class RefreshLinks extends Maintenance { return "$var BETWEEN {$db->addQuotes( $start )} AND {$db->addQuotes( $end )}"; } } + + /** + * Refreshes links for pages in a tracking category + * + * @param string $category Category key + */ + private function refreshTrackingCategory( $category ) { + $cats = $this->getPossibleCategories( $category ); + + if ( !$cats ) { + $this->error( "Tracking category '$category' is disabled\n" ); + // Output to stderr but don't bail out. + } + + foreach ( $cats as $cat ) { + $this->refreshCategory( $cat ); + } + } + + /** + * Refreshes links for pages in a category + * + * @param Title $category + */ + private function refreshCategory( Title $category ) { + $this->output( "Refreshing pages in category '{$category->getText()}'...\n" ); + + $dbr = $this->getDB( DB_REPLICA ); + $conds = [ + 'page_id=cl_from', + 'cl_to' => $category->getDBkey(), + ]; + if ( $this->namespace !== false ) { + $conds['page_namespace'] = $this->namespace; + } + + $i = 0; + $timestamp = ''; + $lastId = 0; + do { + $finalConds = $conds; + $timestamp = $dbr->addQuotes( $timestamp ); + $finalConds []= + "(cl_timestamp > $timestamp OR (cl_timestamp = $timestamp AND cl_from > $lastId))"; + $res = $dbr->select( [ 'page', 'categorylinks' ], + [ 
'page_id', 'cl_timestamp' ], + $finalConds, + __METHOD__, + [ + 'ORDER BY' => [ 'cl_timestamp', 'cl_from' ], + 'LIMIT' => $this->mBatchSize, + ] + ); + + foreach ( $res as $row ) { + if ( !( ++$i % self::REPORTING_INTERVAL ) ) { + $this->output( "$i\n" ); + wfWaitForSlaves(); + } + $lastId = $row->page_id; + $timestamp = $row->cl_timestamp; + self::fixLinksFromArticle( $row->page_id ); + } + + } while ( $res->numRows() == $this->mBatchSize ); + } + + /** + * Returns a list of possible categories for a given tracking category key + * + * @param string $categoryKey + * @return Title[] + */ + private function getPossibleCategories( $categoryKey ) { + $trackingCategories = new TrackingCategories( $this->getConfig() ); + $cats = $trackingCategories->getTrackingCategories(); + if ( isset( $cats[$categoryKey] ) ) { + return $cats[$categoryKey]['cats']; + } + $this->error( "Unknown tracking category {$categoryKey}\n", true ); + } } $maintClass = 'RefreshLinks'; -- 2.20.1