From: daniel Date: Fri, 5 Oct 2018 08:36:37 +0000 (+0200) Subject: Add support for xml dump schema 0.11 X-Git-Tag: 1.34.0-rc.0~1259^2 X-Git-Url: https://git.cyclocoop.org/%20%27.%28%24debut%20%20%20%24par_page%29.%27?a=commitdiff_plain;h=fdc3e9f9524d91a492bdc212486d4518991c0fe2;p=lhc%2Fweb%2Fwiklou.git Add support for xml dump schema 0.11 Bug: T174031 Change-Id: I2717019ea7efe36694bd2b2fba4dc2952a987cfc --- diff --git a/docs/export-0.11.xsd b/docs/export-0.11.xsd new file mode 100644 index 0000000000..6dbc63b789 --- /dev/null +++ b/docs/export-0.11.xsd @@ -0,0 +1,335 @@ + + + + + + + MediaWiki's page export format + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/hooks.txt b/docs/hooks.txt index 99a3d1abc7..6f8c4ca1fa 100644 --- a/docs/hooks.txt +++ b/docs/hooks.txt @@ -3985,8 +3985,9 @@ $title: The title of the page. add extra metadata. &$obj: The XmlDumpWriter object. &$out: The text being output. -$row: The database row for the revision. -$text: The revision text. +$row: The database row for the revision being dumped. DEPRECATED, use $rev instead. +$text: The revision text to be dumped. DEPRECATED, use $rev instead. +$rev: The RevisionRecord that is being dumped to XML More hooks might be available but undocumented, you can execute "php maintenance/findHooks.php" to find hidden ones. 
diff --git a/includes/Defines.php b/includes/Defines.php index e5cd5ed64d..648e493b91 100644 --- a/includes/Defines.php +++ b/includes/Defines.php @@ -322,4 +322,5 @@ define( 'MIGRATION_NEW', 0x30000000 | SCHEMA_COMPAT_NEW ); * were already unsupported at the time these constants were introduced. */ define( 'XML_DUMP_SCHEMA_VERSION_10', '0.10' ); +define( 'XML_DUMP_SCHEMA_VERSION_11', '0.11' ); /**@}*/ diff --git a/includes/export/WikiExporter.php b/includes/export/WikiExporter.php index 0b0c8014b9..f834fb1e5e 100644 --- a/includes/export/WikiExporter.php +++ b/includes/export/WikiExporter.php @@ -53,8 +53,8 @@ class WikiExporter { const LOGS = 8; const RANGE = 16; - const TEXT = 0; - const STUB = 1; + const TEXT = XmlDumpWriter::WRITE_CONTENT; + const STUB = XmlDumpWriter::WRITE_STUB; const BATCH_SIZE = 50000; diff --git a/includes/export/XmlDumpWriter.php b/includes/export/XmlDumpWriter.php index d1b993d99e..bedfe133c7 100644 --- a/includes/export/XmlDumpWriter.php +++ b/includes/export/XmlDumpWriter.php @@ -23,21 +23,46 @@ * @file */ use MediaWiki\MediaWikiServices; +use MediaWiki\Revision\RevisionRecord; use MediaWiki\Revision\RevisionStore; +use MediaWiki\Revision\SlotRecord; +use MediaWiki\Revision\SuppressedDataException; use MediaWiki\Storage\SqlBlobStore; +use Wikimedia\Assert\Assert; /** * @ingroup Dump */ class XmlDumpWriter { + + /** Output serialized revision content. */ + const WRITE_CONTENT = 0; + + /** Only output stubs for revision content. */ + const WRITE_STUB = 1; + + /** + * Only output stubs for revision content, indicating that the content has been + * deleted/suppressed. For internal use only. + */ + const WRITE_STUB_DELETED = 2; + /** * @var string[] the schema versions supported for output * @final */ public static $supportedSchemas = [ XML_DUMP_SCHEMA_VERSION_10, + XML_DUMP_SCHEMA_VERSION_11 ]; + /** + * @var string which schema version the generated XML should comply to. 
+ * One of the values from self::$supportedSchemas, using the XML_DUMP_SCHEMA_VERSION_XX + * constants. + */ + private $schemaVersion; + /** * Title of the currently processed page * @@ -45,6 +70,40 @@ class XmlDumpWriter { */ private $currentTitle = null; + /** + * @var int Whether to output revision content or just stubs. WRITE_CONTENT or WRITE_STUB. + */ + private $contentMode; + + /** + * XmlDumpWriter constructor. + * + * @param int $contentMode WRITE_CONTENT or WRITE_STUB. + * @param string $schemaVersion which schema version the generated XML should comply to. + * One of the values from self::$supportedSchemas, using the XML_DUMP_SCHEMA_VERSION_XX + * constants. + */ + public function __construct( + $contentMode = self::WRITE_CONTENT, + $schemaVersion = XML_DUMP_SCHEMA_VERSION_11 + ) { + Assert::parameter( + in_array( $contentMode, [ self::WRITE_CONTENT, self::WRITE_STUB ] ), + '$contentMode', + 'must be one of the following constants: WRITE_CONTENT or WRITE_STUB.' + ); + + Assert::parameter( + in_array( $schemaVersion, self::$supportedSchemas ), + '$schemaVersion', + 'must be one of the following schema versions: ' + . implode( ',', self::$supportedSchemas ) + ); + + $this->contentMode = $contentMode; + $this->schemaVersion = $schemaVersion; + } + /** * Opens the XML output stream's root "<mediawiki>" element. * This does not include an xml directive, so is safe to include @@ -56,7 +115,7 @@ class XmlDumpWriter { * @return string */ function openStream() { - $ver = WikiExporter::schemaVersion(); + $ver = $this->schemaVersion; return Xml::element( 'mediawiki', [ 'xmlns' => "http://www.mediawiki.org/xml/export-$ver/", 'xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance", @@ -253,137 +312,188 @@ class XmlDumpWriter { ); $out = " \n"; - $out .= " " . Xml::element( 'id', null, strval( $row->rev_id ) ) . "\n"; - if ( isset( $row->rev_parent_id ) && $row->rev_parent_id ) { - $out .= " " . Xml::element( 'parentid', null, strval( $row->rev_parent_id ) ) . "\n"; + $out .= " " . 
Xml::element( 'id', null, strval( $rev->getId() ) ) . "\n"; + + if ( $rev->getParentId() ) { + $out .= " " . Xml::element( 'parentid', null, strval( $rev->getParentId() ) ) . "\n"; } - $out .= $this->writeTimestamp( $row->rev_timestamp ); + $out .= $this->writeTimestamp( $rev->getTimestamp() ); - if ( isset( $row->rev_deleted ) && ( $row->rev_deleted & Revision::DELETED_USER ) ) { + if ( $rev->isDeleted( Revision::DELETED_USER ) ) { $out .= " " . Xml::element( 'contributor', [ 'deleted' => 'deleted' ] ) . "\n"; } else { // empty values get written out as uid 0, see T224221 - $out .= $this->writeContributor( $row->rev_user ?: 0, $row->rev_user_text ); + $user = $rev->getUser(); + $out .= $this->writeContributor( + $user ? $user->getId() : 0, + $user ? $user->getName() : '' + ); } - if ( isset( $row->rev_minor_edit ) && $row->rev_minor_edit ) { + if ( $rev->isMinor() ) { $out .= " \n"; } - if ( isset( $row->rev_deleted ) && ( $row->rev_deleted & Revision::DELETED_COMMENT ) ) { + if ( $rev->isDeleted( Revision::DELETED_COMMENT ) ) { $out .= " " . Xml::element( 'comment', [ 'deleted' => 'deleted' ] ) . "\n"; } else { - $comment = CommentStore::getStore()->getComment( 'rev_comment', $row )->text; - if ( $comment != '' ) { - $out .= " " . Xml::elementClean( 'comment', [], strval( $comment ) ) . "\n"; - } + $out .= " " + . Xml::elementClean( 'comment', [], strval( $rev->getComment()->text ) ) + . "\n"; + } + + $contentMode = $rev->isDeleted( Revision::DELETED_TEXT ) ? 
self::WRITE_STUB_DELETED + : $this->contentMode; + + foreach ( $rev->getSlots()->getSlots() as $slot ) { + $out .= $this->writeSlot( $slot, $contentMode ); } - // TODO: rev_content_model no longer exists with MCR, see T174031 - if ( isset( $row->rev_content_model ) && !is_null( $row->rev_content_model ) ) { - $content_model = strval( $row->rev_content_model ); + if ( $rev->isDeleted( Revision::DELETED_TEXT ) ) { + $out .= " \n"; } else { - // probably using $wgContentHandlerUseDB = false; - $content_model = ContentHandler::getDefaultModelFor( $this->currentTitle ); + $out .= " " . Xml::element( 'sha1', null, strval( $rev->getSha1() ) ) . "\n"; } - $content_handler = ContentHandler::getForModelID( $content_model ); + // Avoid PHP 7.1 warning from passing $this by reference + $writer = $this; + $text = $rev->getContent( SlotRecord::MAIN, RevisionRecord::RAW ); + Hooks::run( 'XmlDumpWriterWriteRevision', [ &$writer, &$out, $row, $text, $rev ] ); - // TODO: rev_content_format no longer exists with MCR, see T174031 - if ( isset( $row->rev_content_format ) && !is_null( $row->rev_content_format ) ) { - $content_format = strval( $row->rev_content_format ); - } else { - // probably using $wgContentHandlerUseDB = false; - $content_format = $content_handler->getDefaultFormat(); + $out .= " \n"; + + return $out; + } + + /** + * @param SlotRecord $slot + * @param int $contentMode see the WRITE_XXX constants + * + * @return string + */ + private function writeSlot( SlotRecord $slot, $contentMode ) { + $isMain = $slot->getRole() === SlotRecord::MAIN; + $isV11 = $this->schemaVersion >= XML_DUMP_SCHEMA_VERSION_11; + + if ( !$isV11 && !$isMain ) { + // ignore extra slots + return ''; } - $out .= " " . Xml::element( 'model', null, strval( $content_model ) ) . "\n"; - $out .= " " . Xml::element( 'format', null, strval( $content_format ) ) . 
"\n"; + $out = ''; + $indent = ' '; - $text = ''; - if ( isset( $row->rev_deleted ) && ( $row->rev_deleted & Revision::DELETED_TEXT ) ) { - $out .= " " . Xml::element( 'text', [ 'deleted' => 'deleted' ] ) . "\n"; - } elseif ( isset( $row->old_text ) ) { - // Raw text from the database may have invalid chars - $text = strval( Revision::getRevisionText( $row ) ); - try { - $text = $content_handler->exportTransform( $text, $content_format ); - } - catch ( Exception $ex ) { - if ( $ex instanceof MWException || $ex instanceof RuntimeException ) { - // leave text as is; that's the way it goes - wfLogWarning( 'exportTransform failed on text for revid ' . $row->rev_id . "\n" ); - } else { - throw $ex; - } - } - $out .= " " . Xml::elementClean( 'text', - [ 'xml:space' => 'preserve', 'bytes' => intval( $row->rev_len ) ], - strval( $text ) ) . "\n"; - } elseif ( isset( $row->_load_content ) ) { - // TODO: make this fully MCR aware, see T174031 - $slot = $rev->getSlot( 'main' ); - try { - $content = $slot->getContent(); + if ( !$isMain ) { + // non-main slots are wrapped into an additional element. + $out .= ' ' . Xml::openElement( 'content' ) . "\n"; + $indent .= ' '; + $out .= $indent . Xml::element( 'role', null, strval( $slot->getRole() ) ) . "\n"; + } - if ( $content instanceof TextContent ) { - // HACK: For text based models, bypass the serialization step. - // This allows extensions (like Flow)that use incompatible combinations - // of serialization format and content model. - $text = $content->getNativeData(); - } else { - $text = $content->serialize( $content_format ); - } - $text = $content_handler->exportTransform( $text, $content_format ); - $out .= " " . Xml::elementClean( 'text', - [ 'xml:space' => 'preserve', 'bytes' => intval( $slot->getSize() ) ], - strval( $text ) ) . "\n"; + if ( $isV11 ) { + $out .= $indent . Xml::element( 'origin', null, strval( $slot->getOrigin() ) ) . 
"\n"; + } + + $contentModel = $slot->getModel(); + $contentHandler = ContentHandler::getForModelID( $contentModel ); + $contentFormat = $contentHandler->getDefaultFormat(); + + // XXX: The content format is only relevant when actually outputting serialized content. + // It should probably be an attribute on the text tag. + $out .= $indent . Xml::element( 'model', null, strval( $contentModel ) ) . "\n"; + $out .= $indent . Xml::element( 'format', null, strval( $contentFormat ) ) . "\n"; + + $textAttributes = [ + 'xml:space' => 'preserve', + 'bytes' => $slot->getSize(), + ]; + + if ( $isV11 ) { + $textAttributes['sha1'] = $slot->getSha1(); + } + + if ( $contentMode === self::WRITE_CONTENT ) { + try { + // write tag + $out .= $this->writeText( $slot->getContent(), $textAttributes, $indent ); + } catch ( SuppressedDataException $ex ) { + // NOTE: this shouldn't happen, since the caller is supposed to have checked + // for suppressed content! + // write placeholder tag + $textAttributes['deleted'] = 'deleted'; + $out .= $indent . Xml::element( 'text', $textAttributes ) . "\n"; } catch ( Exception $ex ) { if ( $ex instanceof MWException || $ex instanceof RuntimeException ) { - // there's no provsion in the schema for an attribute that will let + // there's no provision in the schema for an attribute that will let // the user know this element was unavailable due to error; an empty // tag is the best we can do - $out .= " " . Xml::element( 'text' ) . "\n"; - wfLogWarning( 'failed to load content for revid ' . $row->rev_id . "\n" ); + $out .= $indent . Xml::element( 'text' ) . "\n"; + wfLogWarning( + 'failed to load content slot ' . $slot->getRole() . ' for revision ' + . $slot->getRevision() . "\n" + ); } else { throw $ex; } } - } elseif ( isset( $row->rev_text_id ) ) { - // Stub output for pre-MCR schema - // TODO: MCR: rev_text_id only exists in the pre-MCR schema. Remove this when - // we drop support for the old schema. - $out .= " " . 
Xml::element( 'text', - [ 'id' => $row->rev_text_id, 'bytes' => intval( $row->rev_len ) ], - "" ) . "\n"; + } elseif ( $contentMode === self::WRITE_STUB_DELETED ) { + // write <text> placeholder tag + $textAttributes['deleted'] = 'deleted'; + $out .= $indent . Xml::element( 'text', $textAttributes ) . "\n"; } else { - // Backwards-compatible stub output for MCR aware schema - // TODO: MCR: emit content addresses instead of text ids, see T174031, T199121 - $slot = $rev->getSlot( 'main' ); + // write <text> stub tag + if ( $isV11 ) { + $textAttributes['location'] = $slot->getAddress(); + } + // Output the numerical text ID if possible, for backwards compatibility. // Note that this is currently the ONLY reason we have a BlobStore here at all. // When removing this line, check whether the BlobStore has become unused. $textId = $this->getBlobStore()->getTextIdFromAddress( $slot->getAddress() ); - $out .= " " . Xml::element( 'text', - [ 'id' => $textId, 'bytes' => intval( $slot->getSize() ) ], - "" ) . "\n"; + if ( $textId ) { + $textAttributes['id'] = $textId; + } elseif ( !$isV11 ) { + throw new InvalidArgumentException( + 'Cannot produce stubs for non-text-table content blobs with schema version ' + . $this->schemaVersion + ); + } + + $out .= $indent . Xml::element( 'text', $textAttributes ) . "\n"; } - if ( isset( $row->rev_sha1 ) - && $row->rev_sha1 - && !( $row->rev_deleted & Revision::DELETED_TEXT ) - ) { - $out .= " " . Xml::element( 'sha1', null, strval( $row->rev_sha1 ) ) . "\n"; - } else { - $out .= " \n"; + if ( !$isMain ) { + $out .= ' ' . Xml::closeElement( 'content' ) . 
"\n"; } - // Avoid PHP 7.1 warning from passing $this by reference - $writer = $this; - Hooks::run( 'XmlDumpWriterWriteRevision', [ &$writer, &$out, $row, $text ] ); + return $out; + } - $out .= " \n"; + /** + * @param Content $content + * @param string[] $textAttributes + * @param string $indent + * + * @return string + */ + private function writeText( Content $content, $textAttributes, $indent ) { + $out = ''; + + $contentHandler = $content->getContentHandler(); + $contentFormat = $contentHandler->getDefaultFormat(); + + if ( $content instanceof TextContent ) { + // HACK: For text based models, bypass the serialization step. This allows extensions (like Flow) + // that use incompatible combinations of serialization format and content model. + $data = $content->getNativeData(); + } else { + $data = $content->serialize( $contentFormat ); + } + + $data = $contentHandler->exportTransform( $data, $contentFormat ); + $textAttributes['bytes'] = $size = strlen( $data ); // make sure to use the actual size + $out .= $indent . Xml::elementClean( 'text', $textAttributes, strval( $data ) ) . 
"\n"; return $out; } diff --git a/maintenance/includes/TextPassDumper.php b/maintenance/includes/TextPassDumper.php index eaed7ed2fa..b37fec188e 100644 --- a/maintenance/includes/TextPassDumper.php +++ b/maintenance/includes/TextPassDumper.php @@ -281,7 +281,7 @@ TEXT $this->finalOptionCheck(); // we only want this so we know how to close a stream :-P - $this->xmlwriterobj = new XmlDumpWriter(); + $this->xmlwriterobj = new XmlDumpWriter( XmlDumpWriter::WRITE_CONTENT, $this->schemaVersion ); $input = fopen( $this->input, "rt" ); $this->readDump( $input ); diff --git a/tests/phpunit/maintenance/DumpAsserter.php b/tests/phpunit/maintenance/DumpAsserter.php index ad33f6e154..e8c1cd6090 100644 --- a/tests/phpunit/maintenance/DumpAsserter.php +++ b/tests/phpunit/maintenance/DumpAsserter.php @@ -137,6 +137,34 @@ class DumpAsserter { } } + /** + * Asserts that the xml reader is at an element of given name, and that element + * is an empty tag. + * + * @param string $name The name of the element to check for + * (e.g.: "text" for ) + * @param bool $skip (optional) if true, skip past the found element + * @param bool $skip_ws (optional) if true, also skip past white spaces that trail the + * closing element. + */ + public function assertEmptyNode( $name, $skip = true, $skip_ws = true ) { + $this->assertNodeStart( $name, false ); + Assert::assertFalse( $this->xml->hasValue, "$name tag has content" ); + + if ( $skip ) { + Assert::assertTrue( $this->xml->read(), "Skipping $name tag" ); + if ( ( $this->xml->nodeType == XMLReader::END_ELEMENT ) + && ( $this->xml->name == $name ) + ) { + $this->xml->read(); + } + + if ( $skip_ws ) { + $this->skipWhitespace(); + } + } + } + /** * Asserts that the xml reader is at an closing element of given name, and optionally * skips past it. 
@@ -246,6 +274,11 @@ class DumpAsserter { $this->assertTextNode( "comment", $summary ); $this->skipWhitespace(); + if ( $this->schemaVersion >= XML_DUMP_SCHEMA_VERSION_11 ) { + $this->assertTextNode( "origin", false ); + $this->skipWhitespace(); + } + $this->assertTextNode( "model", $model ); $this->skipWhitespace(); @@ -258,9 +291,16 @@ class DumpAsserter { $this->assertText( $id, $text_id, $text_bytes, $text ); } else { $text_found = false; + if ( $this->schemaVersion >= XML_DUMP_SCHEMA_VERSION_11 ) { + Assert::fail( 'Missing text node' ); + } } - $this->assertTextNode( "sha1", $text_sha1 ); + if ( $text_sha1 ) { + $this->assertTextNode( "sha1", $text_sha1 ); + } else { + $this->assertEmptyNode( "sha1" ); + } if ( !$text_found ) { $this->assertText( $id, $text_id, $text_bytes, $text ); @@ -278,17 +318,9 @@ class DumpAsserter { } if ( $text === false ) { - // Testing for a stub Assert::assertEquals( $this->xml->getAttribute( "id" ), $text_id, "Text id of revision " . $id ); - Assert::assertFalse( $this->xml->hasValue, "Revision has text" ); - Assert::assertTrue( $this->xml->read(), "Skipping text start tag" ); - if ( ( $this->xml->nodeType == XMLReader::END_ELEMENT ) - && ( $this->xml->name == "text" ) - ) { - $this->xml->read(); - } - $this->skipWhitespace(); + $this->assertEmptyNode( "text" ); } else { // Testing for a real dump Assert::assertTrue( $this->xml->read(), "Skipping text start tag" ); diff --git a/tests/phpunit/maintenance/backup_PageTest.php b/tests/phpunit/maintenance/backup_PageTest.php index 17c8757b3c..7a78e524a5 100644 --- a/tests/phpunit/maintenance/backup_PageTest.php +++ b/tests/phpunit/maintenance/backup_PageTest.php @@ -5,8 +5,11 @@ namespace MediaWiki\Tests\Maintenance; use DumpBackup; use Exception; use MediaWiki\MediaWikiServices; +use MediaWiki\Revision\RevisionRecord; use MediaWikiTestCase; use MWException; +use RequestContext; +use RevisionDeleter; use Title; use WikiExporter; use Wikimedia\Rdbms\IDatabase; @@ -77,6 +80,17 @@ class 
BackupDumperPageTest extends DumpTestCase { "BackupDumperTestP2Summary4 extra " ); $this->pageId2 = $page->getId(); + $revDel = RevisionDeleter::createList( + 'revision', + RequestContext::getMain(), + $this->pageTitle2, + [ $this->revId2_2 ] + ); + $revDel->setVisibility( [ + 'value' => [ RevisionRecord::DELETED_TEXT => 1 ], + 'comment' => 'testing!' + ] ); + $this->pageTitle3 = Title::newFromText( 'BackupDumperTestP3', $this->namespace ); $page = WikiPage::factory( $this->pageTitle3 ); list( $this->revId3_1, $this->textId3_1 ) = $this->addRevision( $page, @@ -232,10 +246,10 @@ class BackupDumperPageTest extends DumpTestCase { $asserter->assertRevision( $this->revId2_2, "BackupDumperTestP2Summary2", - $this->textId2_2, - 23, - "b7vj5ks32po5m1z1t1br4o7scdwwy95", - "BackupDumperTestP2Text2", + null, // deleted! + false, // deleted! + null, // deleted! + false, // deleted! $this->revId2_1 ); $asserter->assertRevision( @@ -346,10 +360,10 @@ class BackupDumperPageTest extends DumpTestCase { $asserter->assertRevision( $this->revId2_2, "BackupDumperTestP2Summary2", - $this->textId2_2, - 23, - "b7vj5ks32po5m1z1t1br4o7scdwwy95", - false, + null, // deleted! + false, // deleted! + null, // deleted! + false, // deleted! $this->revId2_1 ); $asserter->assertRevision( @@ -622,10 +636,10 @@ class BackupDumperPageTest extends DumpTestCase { $asserter->assertRevision( $this->revId2_2, "BackupDumperTestP2Summary2", - $this->textId2_2, - 23, - "b7vj5ks32po5m1z1t1br4o7scdwwy95", - false, + null, // deleted! + false, // deleted! + null, // deleted! + false, // deleted! $this->revId2_1 ); $asserter->assertRevision(