From 30bb36f210e562af9ce0ecb2c8c0e39f69195510 Mon Sep 17 00:00:00 2001 From: daniel Date: Tue, 23 Jul 2019 13:23:41 +0200 Subject: [PATCH] Make XmlDumpwriter resilient to blob store corruption. In the WMF databases, we have several revisions for which we cannot load the content. They typically (but not necessarily) have content_address = "tt:0" and content_sha1 = "" and rev_sha1 = "" and content_size = 0 and rev_len = 0. This patch makes sure we can still generate dumps in the presence of such revisions. Bug: T228720 Change-Id: Iaadad44eb5b5fe5a4f2e60da406ffc11f39c735b --- includes/export/XmlDumpWriter.php | 114 +++++++++++++----- tests/phpunit/maintenance/backup_PageTest.php | 97 ++++++++++++++- 2 files changed, 173 insertions(+), 38 deletions(-) diff --git a/includes/export/XmlDumpWriter.php b/includes/export/XmlDumpWriter.php index 85355642bd..f34b3bd9d2 100644 --- a/includes/export/XmlDumpWriter.php +++ b/includes/export/XmlDumpWriter.php @@ -291,6 +291,34 @@ class XmlDumpWriter { return MediaWikiServices::getInstance()->getBlobStore(); } + /** + * Invokes the given method on the given object, catching and logging any storage related + * exceptions. + * + * @param object $obj + * @param string $method + * @param array $args + * @param string $warning The warning to output in case of a storage related exception. + * + * @return mixed Returns the method's return value, + * or null in case of a storage related exception. + * @throws Exception + */ + private function invokeLenient( $obj, $method, $args = [], $warning ) { + try { + return call_user_func_array( [ $obj, $method ], $args ); + } catch ( SuppressedDataException $ex ) { + return null; + } catch ( Exception $ex ) { + if ( $ex instanceof MWException || $ex instanceof RuntimeException ) { + MWDebug::warning( $warning . ': ' . $ex->getMessage() ); + return null; + } else { + throw $ex; + } + } + } + /** * Dumps a "" section on the output stream, with * data filled in from the given database row. @@ -354,14 +382,28 @@ class XmlDumpWriter { if ( $rev->isDeleted( RevisionRecord::DELETED_TEXT ) ) { $out .= " \n"; } else { - $out .= " " . Xml::element( 'sha1', null, strval( $rev->getSha1() ) ) . "\n"; + $sha1 = $this->invokeLenient( + $rev, + 'getSha1', + [], + 'failed to determine sha1 for revision ' . $rev->getId() + ); + $out .= " " . Xml::element( 'sha1', null, strval( $sha1 ) ) . "\n"; } // Avoid PHP 7.1 warning from passing $this by reference $writer = $this; $text = ''; if ( $contentMode === self::WRITE_CONTENT ) { - $text = $rev->getContent( SlotRecord::MAIN, RevisionRecord::RAW ); + /** @var Content $content */ + $content = $this->invokeLenient( + $rev, + 'getContent', + [ SlotRecord::MAIN, RevisionRecord::RAW ], + 'Failed to load main slot content of revision ' . $rev->getId() + ); + + $text = $content ? $content->serialize() : ''; } Hooks::run( 'XmlDumpWriterWriteRevision', [ &$writer, &$out, $row, $text, $rev ] ); @@ -410,37 +452,38 @@ class XmlDumpWriter { $textAttributes = [ 'xml:space' => 'preserve', - 'bytes' => $slot->getSize(), + 'bytes' => $this->invokeLenient( + $slot, + 'getSize', + [], + 'failed to determine size for slot ' . $slot->getRole() . ' of revision ' + . $slot->getRevision() + ) ?: '0' ]; if ( $isV11 ) { - $textAttributes['sha1'] = $slot->getSha1(); + $textAttributes['sha1'] = $this->invokeLenient( + $slot, + 'getSha1', + [], + 'failed to determine sha1 for slot ' . $slot->getRole() . ' of revision ' + . $slot->getRevision() + ) ?: ''; } if ( $contentMode === self::WRITE_CONTENT ) { - try { - // write tag - $out .= $this->writeText( $slot->getContent(), $textAttributes, $indent ); - } catch ( SuppressedDataException $ex ) { - // NOTE: this shouldn't happen, since the caller is supposed to have checked - // for suppressed content! - // write placeholder tag - $textAttributes['deleted'] = 'deleted'; + $content = $this->invokeLenient( + $slot, + 'getContent', + [], + 'failed to load content for slot ' . $slot->getRole() . ' of revision ' + . $slot->getRevision() + ); + + if ( $content === null ) { $out .= $indent . Xml::element( 'text', $textAttributes ) . "\n"; - } - catch ( Exception $ex ) { - if ( $ex instanceof MWException || $ex instanceof RuntimeException ) { - // there's no provision in the schema for an attribute that will let - // the user know this element was unavailable due to error; an empty - // tag is the best we can do - $out .= $indent . Xml::element( 'text' ) . "\n"; - wfLogWarning( - 'failed to load content slot ' . $slot->getRole() . ' for revision ' - . $slot->getRevision() . "\n" - ); - } else { - throw $ex; - } + } else { + $out .= $this->writeText( $content, $textAttributes, $indent ); } } elseif ( $contentMode === self::WRITE_STUB_DELETED ) { // write placeholder tag @@ -455,14 +498,21 @@ class XmlDumpWriter { // Output the numerical text ID if possible, for backwards compatibility. // Note that this is currently the ONLY reason we have a BlobStore here at all. // When removing this line, check whether the BlobStore has become unused. - $textId = $this->getBlobStore()->getTextIdFromAddress( $slot->getAddress() ); + try { + // NOTE: this will only work for addresses of the form "tt:12345". + // If we want to support other kinds of addresses in the future, + // we will have to silently ignore failures here. + // For now, this fails for "tt:0", which is present in the WMF production + // database of of Juli 2019, due to data corruption. + $textId = $this->getBlobStore()->getTextIdFromAddress( $slot->getAddress() ); + } catch ( InvalidArgumentException $ex ) { + MWDebug::warning( 'Bad content address for slot ' . $slot->getRole() + . ' of revision ' . $slot->getRevision() . ': ' . $ex->getMessage() ); + $textId = 0; + } + if ( $textId ) { $textAttributes['id'] = $textId; - } elseif ( !$isV11 ) { - throw new InvalidArgumentException( - 'Cannot produce stubs for non-text-table content blobs with schema version ' - . $this->schemaVersion - ); } $out .= $indent . Xml::element( 'text', $textAttributes ) . "\n"; diff --git a/tests/phpunit/maintenance/backup_PageTest.php b/tests/phpunit/maintenance/backup_PageTest.php index 7a78e524a5..54a362e705 100644 --- a/tests/phpunit/maintenance/backup_PageTest.php +++ b/tests/phpunit/maintenance/backup_PageTest.php @@ -6,6 +6,7 @@ use DumpBackup; use Exception; use MediaWiki\MediaWikiServices; use MediaWiki\Revision\RevisionRecord; +use MediaWiki\Revision\SlotRecord; use MediaWikiTestCase; use MWException; use RequestContext; @@ -28,13 +29,14 @@ class BackupDumperPageTest extends DumpTestCase { // We'll add several pages, revision and texts. The following variables hold the // corresponding ids. - private $pageId1, $pageId2, $pageId3, $pageId4; - private $pageTitle1, $pageTitle2, $pageTitle3, $pageTitle4; + private $pageId1, $pageId2, $pageId3, $pageId4, $pageId5; + private $pageTitle1, $pageTitle2, $pageTitle3, $pageTitle4, $pageTitle5; private $revId1_1, $textId1_1; private $revId2_1, $textId2_1, $revId2_2, $textId2_2; private $revId2_3, $textId2_3, $revId2_4, $textId2_4; private $revId3_1, $textId3_1, $revId3_2, $textId3_2; private $revId4_1, $textId4_1; + private $revId5_1, $textId5_1; private $namespace, $talk_namespace; /** @@ -106,6 +108,15 @@ class BackupDumperPageTest extends DumpTestCase { "Talk about BackupDumperTestP1 Text1", "Talk BackupDumperTestP1 Summary1" ); $this->pageId4 = $page->getId(); + + $this->pageTitle5 = Title::newFromText( 'BackupDumperTestP5' ); + $page = WikiPage::factory( $this->pageTitle5 ); + list( $this->revId5_1, $this->textId5_1 ) = $this->addRevision( $page, + "BackupDumperTestP5 Text1", + "BackupDumperTestP5 Summary1" ); + $this->pageId5 = $page->getId(); + + $this->corruptRevisionData( $page->getRevision()->getRevisionRecord() ); } catch ( Exception $e ) { // We'd love to pass $e directly. However, ... see // documentation of exceptionFromAddDBData in @@ -114,6 +125,39 @@ class BackupDumperPageTest extends DumpTestCase { } } + /** + * Corrupt the information about the given revision in the database. + * + * @param RevisionRecord $revision + */ + private function corruptRevisionData( RevisionRecord $revision ) { + global $wgMultiContentRevisionSchemaMigrationStage; + + if ( ( $wgMultiContentRevisionSchemaMigrationStage & SCHEMA_COMPAT_WRITE_OLD ) ) { + $this->db->update( + 'revision', + [ + 'rev_text_id' => 0, + 'rev_sha1' => '', + 'rev_len' => '0', + ], + [ 'rev_id' => $revision->getId() ] + ); + } + + if ( ( $wgMultiContentRevisionSchemaMigrationStage & SCHEMA_COMPAT_WRITE_NEW ) ) { + $this->db->update( + 'content', + [ + 'content_address' => 'tt:0', + 'content_sha1' => '', + 'content_size' => '0', + ], + [ 'content_id' => $revision->getSlot( SlotRecord::MAIN )->getContentId() ] + ); + } + } + protected function setUp() { parent::setUp(); @@ -201,11 +245,14 @@ class BackupDumperPageTest extends DumpTestCase { $dumper = $this->newDumpBackup( [ '--full', '--quiet', '--output', 'file:' . $fname, '--schema-version', $schemaVersion ], $this->pageId1, - $this->pageId4 + 1 + $this->pageId5 + 1 ); - // Performing the dump + // Performing the dump. Suppress warnings, since we want to test + // accessing broken revision data (page 5). + $this->setMwGlobals( 'wgDevelopmentWarnings', false ); $dumper->execute(); + $this->setMwGlobals( 'wgDevelopmentWarnings', true ); // Checking the dumped data $this->assertDumpSchema( $fname, $this->getXmlSchemaPath( $schemaVersion ) ); @@ -295,6 +342,26 @@ class BackupDumperPageTest extends DumpTestCase { ); $asserter->assertPageEnd(); + // Page 5 (broken revision data) + $asserter->assertPageStart( + $this->pageId5, + $this->namespace, + $this->pageTitle5->getPrefixedText() + ); + $asserter->assertRevision( + $this->revId5_1, + "BackupDumperTestP5 Summary1", + null, + 0, + "", + false, + false, + CONTENT_MODEL_WIKITEXT, + CONTENT_FORMAT_WIKITEXT, + $schemaVersion + ); + $asserter->assertPageEnd(); + $asserter->assertDumpEnd(); // FIXME: add multi-slot test case! @@ -317,11 +384,14 @@ class BackupDumperPageTest extends DumpTestCase { '--schema-version', $schemaVersion, ], $this->pageId1, - $this->pageId4 + 1 + $this->pageId5 + 1 ); - // Performing the dump + // Performing the dump. Suppress warnings, since we want to test + // accessing broken revision data (page 5). + $this->setMwGlobals( 'wgDevelopmentWarnings', false ); $dumper->execute(); + $this->setMwGlobals( 'wgDevelopmentWarnings', true ); // Checking the dumped data $this->assertDumpSchema( $fname, $this->getXmlSchemaPath( $schemaVersion ) ); @@ -404,6 +474,21 @@ class BackupDumperPageTest extends DumpTestCase { ); $asserter->assertPageEnd(); + // Page 5 (broken revision data) + $asserter->assertPageStart( + $this->pageId5, + $this->namespace, + $this->pageTitle5->getPrefixedText() + ); + $asserter->assertRevision( + $this->revId5_1, + "BackupDumperTestP5 Summary1", + null, + 0, + "" + ); + $asserter->assertPageEnd(); + $asserter->assertDumpEnd(); } -- 2.20.1