From c22a2c9f752d56165490d5df786b46db76cc13ba Mon Sep 17 00:00:00 2001 From: aude Date: Thu, 11 Jan 2018 19:56:20 -0500 Subject: [PATCH] [MCR] populateContentTables maintenance script This introduces a maintenance script for populating the tables introduced by the MCR storage schema, namely: slots slot_roles content content_models By default, both the revision and archive tables are processed. This script is part of the MCR schema migration: after instructing RevisionStore to write both the old and the new schema by setting $wgMultiContentRevisionSchemaMigrationStage = MIGRATION_WRITE_BOTH, this script can be used to back-fill the new schema for existing revisions. Doing this is a precondition to later setting $wgMultiContentRevisionSchemaMigrationStage = MIGRATION_NEW to complete the schema migration. Bug: T182682 Change-Id: Iecc67c1b8c082be1a1039eeb52e76ad16b965226 --- autoload.php | 1 + includes/DefaultSettings.php | 11 + maintenance/populateContentTables.php | 330 ++++++++++++++++++++++++++ 3 files changed, 342 insertions(+) create mode 100644 maintenance/populateContentTables.php diff --git a/autoload.php b/autoload.php index 77144dfd86..46a264c19b 100644 --- a/autoload.php +++ b/autoload.php @@ -1100,6 +1100,7 @@ $wgAutoloadLocalClasses = [ 'PopulateBacklinkNamespace' => __DIR__ . '/maintenance/populateBacklinkNamespace.php', 'PopulateCategory' => __DIR__ . '/maintenance/populateCategory.php', 'PopulateContentModel' => __DIR__ . '/maintenance/populateContentModel.php', + 'PopulateContentTables' => __DIR__ . '/maintenance/populateContentTables.php', 'PopulateExternallinksIndex60' => __DIR__ . '/maintenance/populateExternallinksIndex60.php', 'PopulateFilearchiveSha1' => __DIR__ . '/maintenance/populateFilearchiveSha1.php', 'PopulateImageSha1' => __DIR__ . 
'/maintenance/populateImageSha1.php', diff --git a/includes/DefaultSettings.php b/includes/DefaultSettings.php index 562d887e17..23a021c486 100644 --- a/includes/DefaultSettings.php +++ b/includes/DefaultSettings.php @@ -8875,6 +8875,17 @@ $wgInterwikiPrefixDisplayTypes = []; */ $wgCommentTableSchemaMigrationStage = MIGRATION_OLD; +/** + * RevisionStore table schema migration stage (content, slots, content_models & slot_roles tables) + * + * @see Task: https://phabricator.wikimedia.org/T174028 + * @see Commit: https://gerrit.wikimedia.org/r/#/c/378724/ + * + * @since 1.32 + * @var int One of the MIGRATION_* constants + */ +$wgMultiContentRevisionSchemaMigrationStage = MIGRATION_OLD; + /** * Actor table schema migration stage. * @since 1.31 diff --git a/maintenance/populateContentTables.php b/maintenance/populateContentTables.php new file mode 100644 index 0000000000..eee534ff67 --- /dev/null +++ b/maintenance/populateContentTables.php @@ -0,0 +1,330 @@ +addDescription( 'Populate content and slot tables' ); + $this->addOption( 'table', 'revision or archive table, or `all` to populate both', false, + true ); + $this->addOption( 'reuse-content', + 'Reuse content table rows when the address and model are the same. ' + . 
'This will increase the script\'s time and memory usage, perhaps significantly.', + false, false ); + $this->setBatchSize( 500 ); + } + + private function initServices() { + $this->dbw = $this->getDB( DB_MASTER ); + $this->contentModelStore = MediaWikiServices::getInstance()->getContentModelStore(); + $this->mainRoleId = MediaWikiServices::getInstance()->getSlotRoleStore()->acquireId( 'main' ); + } + + public function execute() { + global $wgMultiContentRevisionSchemaMigrationStage; + + $t0 = microtime( true ); + + if ( $wgMultiContentRevisionSchemaMigrationStage < MIGRATION_WRITE_BOTH ) { + $this->writeln( + "...cannot update while \$wgMultiContentRevisionSchemaMigrationStage < MIGRATION_WRITE_BOTH" + ); + return false; + } + + $this->initServices(); + + if ( $this->getOption( 'reuse-content', false ) ) { + $this->loadContentMap(); + } + + foreach ( $this->getTables() as $table ) { + $this->populateTable( $table ); + } + + $elapsed = microtime( true ) - $t0; + $this->writeln( "Done. Processed $this->totalCount rows in $elapsed seconds" ); + } + + /** + * @return string[] + */ + private function getTables() { + $table = $this->getOption( 'table', 'all' ); + $validTableOptions = [ 'all', 'revision', 'archive' ]; + + if ( !in_array( $table, $validTableOptions ) ) { + $this->fatalError( 'Invalid table. Must be either `revision` or `archive` or `all`' ); + } + + if ( $table === 'all' ) { + $tables = [ 'revision', 'archive' ]; + } else { + $tables = [ $table ]; + } + + return $tables; + } + + private function loadContentMap() { + $t0 = microtime( true ); + $this->writeln( "Loading existing content table rows..." ); + $this->contentRowMap = []; + $dbr = $this->getDB( DB_REPLICA ); + $from = false; + while ( true ) { + $res = $dbr->select( + 'content', + [ 'content_id', 'content_address', 'content_model' ], + $from ? 
"content_id > $from" : '', + __METHOD__, + [ 'ORDER BY' => 'content_id', 'LIMIT' => $this->getBatchSize() ] + ); + if ( !$res || !$res->numRows() ) { + break; + } + foreach ( $res as $row ) { + $from = $row->content_id; + $this->contentRowMap["{$row->content_model}:{$row->content_address}"] = $row->content_id; + } + } + $elapsed = microtime( true ) - $t0; + $this->writeln( "Loaded " . count( $this->contentRowMap ) . " rows in $elapsed seconds" ); + } + + /** + * @param string $table + */ + private function populateTable( $table ) { + $t0 = microtime( true ); + $this->count = 0; + $this->writeln( "Populating $table..." ); + + if ( $table === 'revision' ) { + $idField = 'rev_id'; + $tables = [ 'revision', 'slots', 'page' ]; + $fields = [ + 'rev_id', + 'len' => 'rev_len', + 'sha1' => 'rev_sha1', + 'text_id' => 'rev_text_id', + 'content_model' => 'rev_content_model', + 'namespace' => 'page_namespace', + 'title' => 'page_title', + ]; + $joins = [ + 'slots' => [ 'LEFT JOIN', 'rev_id=slot_revision_id' ], + 'page' => [ 'LEFT JOIN', 'rev_page=page_id' ], + ]; + } else { + $idField = 'ar_rev_id'; + $tables = [ 'archive', 'slots' ]; + $fields = [ + 'rev_id' => 'ar_rev_id', + 'len' => 'ar_len', + 'sha1' => 'ar_sha1', + 'text_id' => 'ar_text_id', + 'content_model' => 'ar_content_model', + 'namespace' => 'ar_namespace', + 'title' => 'ar_title', + ]; + $joins = [ + 'slots' => [ 'LEFT JOIN', 'ar_rev_id=slot_revision_id' ], + ]; + } + + $minmax = $this->dbw->selectRow( + $table, + [ 'min' => "MIN( $idField )", 'max' => "MAX( $idField )" ], + '', + __METHOD__ + ); + $batchSize = $this->getBatchSize(); + + for ( $startId = $minmax->min; $startId <= $minmax->max; $startId += $batchSize ) { + $endId = min( $startId + $batchSize - 1, $minmax->max ); + $rows = $this->dbw->select( + $tables, + $fields, + [ + "$idField >= $startId", + "$idField <= $endId", + 'slot_revision_id IS NULL', + ], + __METHOD__, + [ 'ORDER BY' => 'rev_id' ], + $joins + ); + if ( $rows->numRows() !== 0 ) { + 
$this->populateContentTablesForRowBatch( $rows, $startId, $table ); + } + + $elapsed = microtime( true ) - $t0; + $this->writeln( + "... $table processed up to revision id $endId of {$minmax->max}" + . " ($this->count rows in $elapsed seconds)" + ); + } + + $elapsed = microtime( true ) - $t0; + $this->writeln( "Done populating $table table. Processed $this->count rows in $elapsed seconds" ); + } + + /** + * @param ResultWrapper $rows + * @param int $startId + * @param string $table + * @return int|null + */ + private function populateContentTablesForRowBatch( ResultWrapper $rows, $startId, $table ) { + $this->beginTransaction( $this->dbw, __METHOD__ ); + + if ( $this->contentRowMap === null ) { + $map = []; + } else { + $map = &$this->contentRowMap; + } + $contentKeys = []; + + try { + // Step 1: Figure out content rows needing insertion. + $contentRows = []; + foreach ( $rows as $row ) { + $revisionId = $row->rev_id; + + Assert::invariant( $revisionId !== null, 'rev_id must not be null' ); + + $modelId = $this->contentModelStore->acquireId( $this->getContentModel( $row ) ); + $address = SqlBlobStore::makeAddressFromTextId( $row->text_id ); + + $key = "{$modelId}:{$address}"; + $contentKeys[$revisionId] = $key; + + if ( !isset( $map[$key] ) ) { + $map[$key] = false; + $contentRows[] = [ + 'content_size' => (int)$row->len, + 'content_sha1' => $row->sha1, + 'content_model' => $modelId, + 'content_address' => $address, + ]; + } + } + + // Step 2: Insert them, then read them back in for use in the next step. + if ( $contentRows ) { + $id = $this->dbw->selectField( 'content', 'MAX(content_id)', '', __METHOD__ ); + $this->dbw->insert( 'content', $contentRows, __METHOD__ ); + $res = $this->dbw->select( + 'content', + [ 'content_id', 'content_model', 'content_address' ], + 'content_id > ' . (int)$id, + __METHOD__ + ); + foreach ( $res as $row ) { + $key = $row->content_model . ':' . 
$row->content_address; + $map[$key] = $row->content_id; + } + } + + // Step 3: Insert the slot rows. + $slotRows = []; + foreach ( $rows as $row ) { + $revisionId = $row->rev_id; + $contentId = $map[$contentKeys[$revisionId]] ?? false; + if ( $contentId === false ) { + throw new \RuntimeException( "Content row for $revisionId not found after content insert" ); + } + $slotRows[] = [ + 'slot_revision_id' => $revisionId, + 'slot_role_id' => $this->mainRoleId, + 'slot_content_id' => $contentId, + // There's no way to really know the previous revision, so assume no inheriting. + // rev_parent_id can get changed on undeletions, and deletions can screw up + // rev_timestamp ordering. + 'slot_origin' => $revisionId, + ]; + } + $this->dbw->insert( 'slots', $slotRows, __METHOD__ ); + $this->count += count( $slotRows ); + $this->totalCount += count( $slotRows ); + } catch ( \Exception $e ) { + $this->rollbackTransaction( $this->dbw, __METHOD__ ); + $this->fatalError( "Failed to populate content table $table row batch starting at $startId " + . "due to exception: " . $e->__toString() ); + } + + $this->commitTransaction( $this->dbw, __METHOD__ ); + } + + /** + * @param \stdClass $row + * @return string + */ + private function getContentModel( $row ) { + if ( isset( $row->content_model ) ) { + return $row->content_model; + } + + $title = Title::makeTitle( $row->namespace, $row->title ); + + return ContentHandler::getDefaultModelFor( $title ); + } + + /** + * @param string $msg + */ + private function writeln( $msg ) { + $this->output( "$msg\n" ); + } +} + +$maintClass = 'PopulateContentTables'; +require_once RUN_MAINTENANCE_IF_MAIN; -- 2.20.1