Remove Revision::getRevisionText and gated pre-MCR schema access
authorPetr Pchelko <ppchelko@wikimedia.org>
Fri, 20 Sep 2019 18:11:35 +0000 (11:11 -0700)
committerPetr Pchelko <ppchelko@wikimedia.org>
Fri, 27 Sep 2019 16:40:02 +0000 (09:40 -0700)
- fixT22757.php appears to have been written for T22757 which has been
resolved, I assume it can be removed.

- compressOld.php - rev_text_id access is gated on MCRSchemaMigrationStage
READ_OLD. IT's safe to use the old_text and old_flags directly as we
select for them. Also the query ensures external blobs are not selected,
so we can use decompressData instead of Revision::getRevisionText.
Tested locally.

- recompressTracked.php uses a temporary table for tracking the blobs.
Table is populated by trackBlobs which explicitly queries for old_id,
so using it directly in recompressTracked is ok.
Tested locally.

- checkStorage.php the affected code attempts to fix broken revisions
from a dump, however, it looks like it doesn't work - the code expects
revision IDs as input, however it receives an array of text table old_id
values. Untested.

Bug: T198343
Change-Id: I753019565c15d270c831c995c07c7f1aad887cb6

autoload.php
maintenance/storage/checkStorage.php
maintenance/storage/compressOld.php
maintenance/storage/fixT22757.php [deleted file]
maintenance/storage/recompressTracked.php

index 55e5a7f..dc57ff6 100644 (file)
@@ -532,7 +532,6 @@ $wgAutoloadLocalClasses = [
        'FixDefaultJsonContentPages' => __DIR__ . '/maintenance/fixDefaultJsonContentPages.php',
        'FixDoubleRedirects' => __DIR__ . '/maintenance/fixDoubleRedirects.php',
        'FixExtLinksProtocolRelative' => __DIR__ . '/maintenance/fixExtLinksProtocolRelative.php',
-       'FixT22757' => __DIR__ . '/maintenance/storage/fixT22757.php',
        'FixTimestamps' => __DIR__ . '/maintenance/fixTimestamps.php',
        'FixUserRegistration' => __DIR__ . '/maintenance/fixUserRegistration.php',
        'ForeignAPIFile' => __DIR__ . '/includes/filerepo/file/ForeignAPIFile.php',
index 37625cf..3741959 100644 (file)
@@ -556,10 +556,27 @@ class CheckStorage {
 
                // Find text row again
                $dbr = wfGetDB( DB_REPLICA );
-               $oldId = $dbr->selectField( 'revision', 'rev_text_id', [ 'rev_id' => $id ], __METHOD__ );
+               global $wgMultiContentRevisionSchemaMigrationStage;
+               if ( $wgMultiContentRevisionSchemaMigrationStage & SCHEMA_COMPAT_READ_OLD ) {
+                       $oldId = $dbr->selectField( 'revision', 'rev_text_id', [ 'rev_id' => $id ], __METHOD__ );
+               } else {
+                       $res = $dbr->selectRow(
+                               [ 'slots', 'content' ],
+                               [ 'content_address' ],
+                               [ 'slot_revision_id' => $id ],
+                               __METHOD__,
+                               [],
+                               [ 'content' => [ 'INNER JOIN', [ 'content_id = slot_content_id' ] ] ]
+                       );
+                       // @phan-suppress-next-line PhanAccessMethodInternal
+                       $blobStore = MediaWikiServices::getInstance()
+                               ->getBlobStoreFactory()
+                               ->newSqlBlobStore();
+                       $oldId = $blobStore->getTextIdFromAddress( $res->content_address );
+               }
+
                if ( !$oldId ) {
                        echo "Missing revision row for rev_id $id\n";
-
                        return;
                }
 
index b6aa626..c4779b9 100644 (file)
@@ -239,6 +239,10 @@ class CompressOld extends Maintenance {
                        /** @var ExternalStoreDB $storeObj */
                        $storeObj = $esFactory->getStore( 'DB' );
                }
+               // @phan-suppress-next-line PhanAccessMethodInternal
+               $blobStore = MediaWikiServices::getInstance()
+                       ->getBlobStoreFactory()
+                       ->newSqlBlobStore();
 
                # Get all articles by page_id
                if ( !$maxPageId ) {
@@ -370,8 +374,12 @@ class CompressOld extends Maintenance {
                                for ( $j = 0; $j < $thisChunkSize && $chunk->isHappy(); $j++ ) {
                                        $oldid = $revs[$i + $j]->old_id;
 
-                                       # Get text
-                                       $text = Revision::getRevisionText( $revs[$i + $j] );
+                                       # Get text. We do not need the full `extractBlob` since the query is built
+                                       # to fetch non-externalstore blobs.
+                                       $text = $blobStore->decompressData(
+                                               $revs[$i + $j]->old_text,
+                                               explode( ',', $revs[$i + $j]->old_flags )
+                                       );
 
                                        if ( $text === false ) {
                                                $this->error( "\nError, unable to get text in old_id $oldid" );
diff --git a/maintenance/storage/fixT22757.php b/maintenance/storage/fixT22757.php
deleted file mode 100644 (file)
index 61f1177..0000000
+++ /dev/null
@@ -1,335 +0,0 @@
-<?php
-/**
- * Script to fix T22757.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- * http://www.gnu.org/copyleft/gpl.html
- *
- * @file
- * @ingroup Maintenance ExternalStorage
- */
-
-require_once __DIR__ . '/../Maintenance.php';
-
-/**
- * Maintenance script to fix T22757.
- *
- * @ingroup Maintenance ExternalStorage
- */
-class FixT22757 extends Maintenance {
-       public $batchSize = 10000;
-       public $mapCache = [];
-       public $mapCacheSize = 0;
-       public $maxMapCacheSize = 1000000;
-
-       function __construct() {
-               parent::__construct();
-               $this->addDescription( 'Script to fix T22757 assuming that blob_tracking is intact' );
-               $this->addOption( 'dry-run', 'Report only' );
-               $this->addOption( 'start', 'old_id to start at', false, true );
-       }
-
-       function execute() {
-               $dbr = $this->getDB( DB_REPLICA );
-               $dbw = $this->getDB( DB_MASTER );
-
-               $dryRun = $this->getOption( 'dry-run' );
-               if ( $dryRun ) {
-                       print "Dry run only.\n";
-               }
-
-               $startId = $this->getOption( 'start', 0 );
-               $numGood = 0;
-               $numFixed = 0;
-               $numBad = 0;
-
-               $totalRevs = $dbr->selectField( 'text', 'MAX(old_id)', '', __METHOD__ );
-
-               // In MySQL 4.1+, the binary field old_text has a non-working LOWER() function
-               $lowerLeft = 'LOWER(CONVERT(LEFT(old_text,22) USING latin1))';
-
-               while ( true ) {
-                       print "ID: $startId / $totalRevs\r";
-
-                       $res = $dbr->select(
-                               'text',
-                               [ 'old_id', 'old_flags', 'old_text' ],
-                               [
-                                       'old_id > ' . intval( $startId ),
-                                       'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\'',
-                                       "$lowerLeft = 'o:15:\"historyblobstub\"'",
-                               ],
-                               __METHOD__,
-                               [
-                                       'ORDER BY' => 'old_id',
-                                       'LIMIT' => $this->batchSize,
-                               ]
-                       );
-
-                       if ( !$res->numRows() ) {
-                               break;
-                       }
-
-                       $secondaryIds = [];
-                       $stubs = [];
-
-                       foreach ( $res as $row ) {
-                               $startId = $row->old_id;
-
-                               // Basic sanity checks
-                               $obj = unserialize( $row->old_text );
-                               if ( $obj === false ) {
-                                       print "{$row->old_id}: unrecoverable: cannot unserialize\n";
-                                       ++$numBad;
-                                       continue;
-                               }
-
-                               if ( !is_object( $obj ) ) {
-                                       print "{$row->old_id}: unrecoverable: unserialized to type " .
-                                               gettype( $obj ) . ", possible double-serialization\n";
-                                       ++$numBad;
-                                       continue;
-                               }
-
-                               if ( strtolower( get_class( $obj ) ) !== 'historyblobstub' ) {
-                                       print "{$row->old_id}: unrecoverable: unexpected object class " .
-                                               get_class( $obj ) . "\n";
-                                       ++$numBad;
-                                       continue;
-                               }
-
-                               // Process flags
-                               $flags = explode( ',', $row->old_flags );
-                               if ( in_array( 'utf-8', $flags ) || in_array( 'utf8', $flags ) ) {
-                                       $legacyEncoding = false;
-                               } else {
-                                       $legacyEncoding = true;
-                               }
-
-                               // Queue the stub for future batch processing
-                               $id = intval( $obj->mOldId );
-                               $secondaryIds[] = $id;
-                               $stubs[$row->old_id] = [
-                                       'legacyEncoding' => $legacyEncoding,
-                                       'secondaryId' => $id,
-                                       'hash' => $obj->mHash,
-                               ];
-                       }
-
-                       $secondaryIds = array_unique( $secondaryIds );
-
-                       if ( !count( $secondaryIds ) ) {
-                               continue;
-                       }
-
-                       // Run the batch query on blob_tracking
-                       $res = $dbr->select(
-                               'blob_tracking',
-                               '*',
-                               [
-                                       'bt_text_id' => $secondaryIds,
-                               ],
-                               __METHOD__
-                       );
-                       $trackedBlobs = [];
-                       foreach ( $res as $row ) {
-                               $trackedBlobs[$row->bt_text_id] = $row;
-                       }
-
-                       // Process the stubs
-                       foreach ( $stubs as $primaryId => $stub ) {
-                               $secondaryId = $stub['secondaryId'];
-                               if ( !isset( $trackedBlobs[$secondaryId] ) ) {
-                                       // No tracked blob. Work out what went wrong
-                                       $secondaryRow = $dbr->selectRow(
-                                               'text',
-                                               [ 'old_flags', 'old_text' ],
-                                               [ 'old_id' => $secondaryId ],
-                                               __METHOD__
-                                       );
-                                       if ( !$secondaryRow ) {
-                                               print "$primaryId: unrecoverable: secondary row is missing\n";
-                                               ++$numBad;
-                                       } elseif ( $this->isUnbrokenStub( $stub, $secondaryRow ) ) {
-                                               // Not broken yet, and not in the tracked clusters so it won't get
-                                               // broken by the current RCT run.
-                                               ++$numGood;
-                                       } elseif ( strpos( $secondaryRow->old_flags, 'external' ) !== false ) {
-                                               print "$primaryId: unrecoverable: secondary gone to {$secondaryRow->old_text}\n";
-                                               ++$numBad;
-                                       } else {
-                                               print "$primaryId: unrecoverable: miscellaneous corruption of secondary row\n";
-                                               ++$numBad;
-                                       }
-                                       unset( $stubs[$primaryId] );
-                                       continue;
-                               }
-                               $trackRow = $trackedBlobs[$secondaryId];
-
-                               // Check that the specified text really is available in the tracked source row
-                               $url = "DB://{$trackRow->bt_cluster}/{$trackRow->bt_blob_id}/{$stub['hash']}";
-                               $text = ExternalStore::fetchFromURL( $url );
-                               if ( $text === false ) {
-                                       print "$primaryId: unrecoverable: source text missing\n";
-                                       ++$numBad;
-                                       unset( $stubs[$primaryId] );
-                                       continue;
-                               }
-                               if ( md5( $text ) !== $stub['hash'] ) {
-                                       print "$primaryId: unrecoverable: content hashes do not match\n";
-                                       ++$numBad;
-                                       unset( $stubs[$primaryId] );
-                                       continue;
-                               }
-
-                               // Find the page_id and rev_id
-                               // The page is probably the same as the page of the secondary row
-                               $pageId = intval( $trackRow->bt_page );
-                               if ( !$pageId ) {
-                                       $revId = $pageId = 0;
-                               } else {
-                                       $revId = $this->findTextIdInPage( $pageId, $primaryId );
-                                       if ( !$revId ) {
-                                               // Actually an orphan
-                                               $pageId = $revId = 0;
-                                       }
-                               }
-
-                               $newFlags = $stub['legacyEncoding'] ? 'external' : 'external,utf-8';
-
-                               if ( !$dryRun ) {
-                                       // Reset the text row to point to the original copy
-                                       $this->beginTransaction( $dbw, __METHOD__ );
-                                       $dbw->update(
-                                               'text',
-                                               // SET
-                                               [
-                                                       'old_flags' => $newFlags,
-                                                       'old_text' => $url
-                                               ],
-                                               // WHERE
-                                               [ 'old_id' => $primaryId ],
-                                               __METHOD__
-                                       );
-
-                                       // Add a blob_tracking row so that the new reference can be recompressed
-                                       // without needing to run trackBlobs.php again
-                                       $dbw->insert( 'blob_tracking',
-                                               [
-                                                       'bt_page' => $pageId,
-                                                       'bt_rev_id' => $revId,
-                                                       'bt_text_id' => $primaryId,
-                                                       'bt_cluster' => $trackRow->bt_cluster,
-                                                       'bt_blob_id' => $trackRow->bt_blob_id,
-                                                       'bt_cgz_hash' => $stub['hash'],
-                                                       'bt_new_url' => null,
-                                                       'bt_moved' => 0,
-                                               ],
-                                               __METHOD__
-                                       );
-                                       $this->commitTransaction( $dbw, __METHOD__ );
-                               }
-
-                               print "$primaryId: resolved to $url\n";
-                               ++$numFixed;
-                       }
-               }
-
-               print "\n";
-               print "Fixed: $numFixed\n";
-               print "Unrecoverable: $numBad\n";
-               print "Good stubs: $numGood\n";
-       }
-
-       function findTextIdInPage( $pageId, $textId ) {
-               $ids = $this->getRevTextMap( $pageId );
-               return $ids[$textId] ?? null;
-       }
-
-       function getRevTextMap( $pageId ) {
-               if ( !isset( $this->mapCache[$pageId] ) ) {
-                       // Limit cache size
-                       while ( $this->mapCacheSize > $this->maxMapCacheSize ) {
-                               $key = key( $this->mapCache );
-                               $this->mapCacheSize -= count( $this->mapCache[$key] );
-                               unset( $this->mapCache[$key] );
-                       }
-
-                       $dbr = $this->getDB( DB_REPLICA );
-                       $map = [];
-                       $res = $dbr->select( 'revision',
-                               [ 'rev_id', 'rev_text_id' ],
-                               [ 'rev_page' => $pageId ],
-                               __METHOD__
-                       );
-                       foreach ( $res as $row ) {
-                               $map[$row->rev_text_id] = $row->rev_id;
-                       }
-                       $this->mapCache[$pageId] = $map;
-                       $this->mapCacheSize += count( $map );
-               }
-
-               return $this->mapCache[$pageId];
-       }
-
-       /**
-        * This is based on part of HistoryBlobStub::getText().
-        * Determine if the text can be retrieved from the row in the normal way.
-        * @param array $stub
-        * @param stdClass $secondaryRow
-        * @return bool
-        */
-       function isUnbrokenStub( $stub, $secondaryRow ) {
-               $flags = explode( ',', $secondaryRow->old_flags );
-               $text = $secondaryRow->old_text;
-               if ( in_array( 'external', $flags ) ) {
-                       $url = $text;
-                       Wikimedia\suppressWarnings();
-                       list( /* $proto */, $path ) = explode( '://', $url, 2 );
-                       Wikimedia\restoreWarnings();
-
-                       if ( $path == "" ) {
-                               return false;
-                       }
-                       $text = ExternalStore::fetchFromURL( $url );
-               }
-               if ( !in_array( 'object', $flags ) ) {
-                       return false;
-               }
-
-               if ( in_array( 'gzip', $flags ) ) {
-                       $obj = unserialize( gzinflate( $text ) );
-               } else {
-                       $obj = unserialize( $text );
-               }
-
-               if ( !is_object( $obj ) ) {
-                       // Correct for old double-serialization bug.
-                       $obj = unserialize( $obj );
-               }
-
-               if ( !is_object( $obj ) ) {
-                       return false;
-               }
-
-               $obj->uncompress();
-               $text = $obj->getItem( $stub['hash'] );
-
-               return $text !== false;
-       }
-}
-
-$maintClass = FixT22757::class;
-require_once RUN_MAINTENANCE_IF_MAIN;
index 9f20e67..5cd6ef3 100644 (file)
@@ -22,6 +22,7 @@
  * @ingroup Maintenance ExternalStorage
  */
 
+use MediaWiki\Storage\SqlBlobStore;
 use Wikimedia\Rdbms\IMaintainableDatabase;
 use MediaWiki\Logger\LegacyLogger;
 use MediaWiki\MediaWikiServices;
@@ -71,6 +72,8 @@ class RecompressTracked {
        public $debugLog, $infoLog, $criticalLog;
        /** @var ExternalStoreDB */
        public $store;
+       /** @var SqlBlobStore */
+       private $blobStore;
 
        private static $optionsWithArgs = [
                'procs',
@@ -120,6 +123,10 @@ class RecompressTracked {
                $this->pageBlobClass = function_exists( 'xdiff_string_bdiff' ) ?
                        DiffHistoryBlob::class : ConcatenatedGzipHistoryBlob::class;
                $this->orphanBlobClass = ConcatenatedGzipHistoryBlob::class;
+               // @phan-suppress-next-line PhanAccessMethodInternal
+               $this->blobStore = MediaWikiServices::getInstance()
+                       ->getBlobStoreFactory()
+                       ->newSqlBlobStore();
        }
 
        function debug( $msg ) {
@@ -531,7 +538,7 @@ class RecompressTracked {
                                }
                                $lastTextId = $row->bt_text_id;
                                // Load the text
-                               $text = Revision::getRevisionText( $row );
+                               $text = $this->blobStore->expandBlob( $row->old_text, $row->old_flags );
                                if ( $text === false ) {
                                        $this->critical( "Error loading {$row->bt_rev_id}/{$row->bt_text_id}" );
                                        continue;
@@ -685,7 +692,7 @@ class RecompressTracked {
                );
 
                foreach ( $res as $row ) {
-                       $text = Revision::getRevisionText( $row );
+                       $text = $this->blobStore->expandBlob( $row->old_text, $row->old_flags );
                        if ( $text === false ) {
                                $this->critical( "Error: cannot load revision text for old_id={$row->old_id}" );
                                continue;