<?php
/**
 * Service for storing and loading data blobs representing revision content.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * Attribution notice: when this file was created, much of its content was taken
 * from the Revision.php file as present in release 1.30. Refer to the history
 * of that file for original authorship.
 */
27 namespace MediaWiki\Storage
;
use AppendIterator;
use DBAccessObjectUtils;
use ExternalStoreAccess;
use IDBAccessObject;
use IExpiringStore;
use InvalidArgumentException;
use MWException;
use StatusValue;
use WANObjectCache;
use Wikimedia\Assert\Assert;
use Wikimedia\AtEase\AtEase;
use Wikimedia\Rdbms\IDatabase;
use Wikimedia\Rdbms\ILoadBalancer;
/**
 * Service for storing and loading Content objects.
 *
 * @note This was written to act as a drop-in replacement for the corresponding
 *       static methods in Revision.
 */
51 class SqlBlobStore
implements IDBAccessObject
, BlobStore
{
// Note: the name has been taken unchanged from the Revision class.
const TEXT_CACHE_GROUP = 'revisiontext:10';

/**
 * @var ILoadBalancer Load balancer used to acquire database connections
 */
private $dbLoadBalancer;

/**
 * @var ExternalStoreAccess
 */
private $extStoreAccess;

/**
 * @var WANObjectCache Cache manager used for caching blobs
 */
private $cache;

/**
 * @var string|bool DB domain ID of a wiki or false for the local one
 */
private $dbDomain;

/**
 * @var int Time for which blobs can be cached, in seconds
 */
private $cacheExpiry = 604800; // 7 days

/**
 * @var bool Whether blobs should be compressed for storage
 */
private $compressBlobs = false;

/**
 * @var bool|string Legacy encoding to assume for blobs not marked as utf8, or false
 */
private $legacyEncoding = false;

/**
 * @var bool Whether to use the ExternalStore mechanism for storing blobs
 */
private $useExternalStore = false;
/**
 * @param ILoadBalancer $dbLoadBalancer A load balancer for acquiring database connections
 * @param ExternalStoreAccess $extStoreAccess Access layer for external storage
 * @param WANObjectCache $cache A cache manager for caching blobs. This can be the local
 *        wiki's default instance even if $dbDomain refers to a different wiki, since
 *        makeGlobalKey() is used to construct a key that allows cached blobs from the
 *        same database to be re-used between wikis. For example, wiki A and wiki B will
 *        use the same cache keys for blobs fetched from wiki C, regardless of the
 *        wiki-specific default key space.
 * @param bool|string $dbDomain The ID of the target wiki database. Use false for the local wiki.
 */
public function __construct(
	ILoadBalancer $dbLoadBalancer,
	ExternalStoreAccess $extStoreAccess,
	WANObjectCache $cache,
	$dbDomain = false
) {
	$this->dbLoadBalancer = $dbLoadBalancer;
	$this->extStoreAccess = $extStoreAccess;
	$this->cache = $cache;
	$this->dbDomain = $dbDomain;
}
/**
 * @return int time for which blobs can be cached, in seconds
 */
public function getCacheExpiry() {
	return $this->cacheExpiry;
}
/**
 * @param int $cacheExpiry time for which blobs can be cached, in seconds
 */
public function setCacheExpiry( $cacheExpiry ) {
	// Reject non-integer values up front.
	Assert::parameterType( 'integer', $cacheExpiry, '$cacheExpiry' );

	$this->cacheExpiry = $cacheExpiry;
}
/**
 * @return bool whether blobs should be compressed for storage
 */
public function getCompressBlobs() {
	return $this->compressBlobs;
}
/**
 * @param bool $compressBlobs whether blobs should be compressed for storage
 */
public function setCompressBlobs( $compressBlobs ) {
	$this->compressBlobs = $compressBlobs;
}
/**
 * @return false|string The legacy encoding to assume for blobs that are not marked as utf8.
 *         False means handling of legacy encoding is disabled, and utf8 assumed.
 */
public function getLegacyEncoding() {
	return $this->legacyEncoding;
}
/**
 * Emits a deprecation notice and yields no value.
 *
 * @deprecated since 1.34 No longer needed
 * @return null
 */
public function getLegacyEncodingConversionLang() {
	wfDeprecated( __METHOD__ );
	return null;
}
/**
 * Set the legacy encoding to assume for blobs that do not have the utf-8 flag set.
 *
 * @note The second parameter, Language $language, was removed in 1.34.
 *
 * @param string $legacyEncoding The legacy encoding to assume for blobs that are
 *        not marked as utf8.
 */
public function setLegacyEncoding( $legacyEncoding ) {
	Assert::parameterType( 'string', $legacyEncoding, '$legacyEncoding' );

	$this->legacyEncoding = $legacyEncoding;
}
/**
 * @return bool Whether to use the ExternalStore mechanism for storing blobs.
 */
public function getUseExternalStore() {
	return $this->useExternalStore;
}
/**
 * @param bool $useExternalStore Whether to use the ExternalStore mechanism for storing blobs.
 */
public function setUseExternalStore( $useExternalStore ) {
	Assert::parameterType( 'boolean', $useExternalStore, '$useExternalStore' );

	$this->useExternalStore = $useExternalStore;
}
/**
 * @return ILoadBalancer
 */
private function getDBLoadBalancer() {
	return $this->dbLoadBalancer;
}
/**
 * Obtain a connection handle for the configured DB domain.
 *
 * @param int $index A database index, like DB_MASTER or DB_REPLICA
 *
 * @return IDatabase
 */
private function getDBConnection( $index ) {
	$lb = $this->getDBLoadBalancer();
	return $lb->getConnectionRef( $index, [], $this->dbDomain );
}
/**
 * Stores an arbitrary blob of data and returns an address that can be used with
 * getBlob() to retrieve the same blob of data.
 *
 * @param string $data
 * @param array $hints An array of hints.
 *
 * @throws BlobAccessException
 * @return string an address that can be used with getBlob() to retrieve the data.
 */
public function storeBlob( $data, $hints = [] ) {
	try {
		// compressData() may modify $data in place; it returns the flags string.
		$flags = $this->compressData( $data );

		# Write to external storage if required
		if ( $this->useExternalStore ) {
			// Store and get the URL
			$data = $this->extStoreAccess->insert( $data, [ 'domain' => $this->dbDomain ] );
			if ( !$data ) {
				throw new BlobAccessException( "Failed to store text to external storage" );
			}
			// Flags are a comma separated list; see expandBlob().
			if ( $flags ) {
				$flags .= ',';
			}
			$flags .= 'external';

			// TODO: we could also return an address for the external store directly here.
			// That would mean bypassing the text table entirely when the external store is
			// used. We'll need to assess expected fallout before doing that.
		}

		$dbw = $this->getDBConnection( DB_MASTER );

		$old_id = $dbw->nextSequenceValue( 'text_old_id_seq' );
		$dbw->insert(
			'text',
			[
				'old_id' => $old_id,
				'old_text' => $data,
				'old_flags' => $flags,
			],
			__METHOD__
		);

		$textId = $dbw->insertId();

		return self::makeAddressFromTextId( $textId );
	} catch ( MWException $e ) {
		// Wrap lower-level failures, keeping the original exception as the cause.
		throw new BlobAccessException( $e->getMessage(), 0, $e );
	}
}
/**
 * Retrieve a blob, given an address.
 * Currently hardcoded to the 'text' table storage engine.
 *
 * MCR migration note: this replaces Revision::loadText
 *
 * @param string $blobAddress
 * @param int $queryFlags
 *
 * @throws BlobAccessException
 * @return string
 */
public function getBlob( $blobAddress, $queryFlags = 0 ) {
	Assert::parameterType( 'string', $blobAddress, '$blobAddress' );

	// Filled in by the callback when the fetch fails.
	$error = null;
	$blob = $this->cache->getWithSetCallback(
		$this->getCacheKey( $blobAddress ),
		$this->getCacheTTL(),
		function ( $unused, &$ttl, &$setOpts ) use ( $blobAddress, $queryFlags, &$error ) {
			// Ignore $setOpts; blobs are immutable and negatives are not cached
			list( $result, $errors ) = $this->fetchBlobs( [ $blobAddress ], $queryFlags );
			// No negative caching; negative hits on text rows may be due to corrupted replica DBs
			$error = $errors[$blobAddress] ?? null;
			return $result[$blobAddress];
		},
		[ 'pcGroup' => self::TEXT_CACHE_GROUP, 'pcTTL' => IExpiringStore::TTL_PROC_LONG ]
	);

	if ( $error ) {
		throw new BlobAccessException( $error );
	}

	Assert::postcondition( is_string( $blob ), 'Blob must not be null' );
	return $blob;
}
/**
 * A batched version of BlobStore::getBlob.
 *
 * @param string[] $blobAddresses An array of blob addresses.
 * @param int $queryFlags See IDBAccessObject.
 * @throws BlobAccessException
 * @return StatusValue A status with a map of blobAddress => binary blob data or null
 *         if fetching the blob has failed. Fetch failure errors are the
 *         warnings in the status object.
 */
public function getBlobBatch( $blobAddresses, $queryFlags = 0 ) {
	// Stays null if every address is served from cache and the callback never runs.
	$errors = null;
	$addressByCacheKey = $this->cache->makeMultiKeys(
		$blobAddresses,
		function ( $blobAddress ) {
			return $this->getCacheKey( $blobAddress );
		}
	);
	$blobsByCacheKey = $this->cache->getMultiWithUnionSetCallback(
		$addressByCacheKey,
		$this->getCacheTTL(),
		function ( array $blobAddresses, array &$ttls, array &$setOpts ) use ( $queryFlags, &$errors ) {
			// Ignore $setOpts; blobs are immutable and negatives are not cached
			list( $result, $errors ) = $this->fetchBlobs( $blobAddresses, $queryFlags );
			return $result;
		},
		[ 'pcGroup' => self::TEXT_CACHE_GROUP, 'pcTTL' => IExpiringStore::TTL_PROC_LONG ]
	);

	// Remap back to incoming blob addresses. The return value of the
	// WANObjectCache::getMultiWithUnionSetCallback is keyed on the internal
	// keys from WANObjectCache::makeMultiKeys, so we need to remap them
	// before returning to the client.
	$blobsByAddress = [];
	foreach ( $blobsByCacheKey as $cacheKey => $blob ) {
		$blobsByAddress[ $addressByCacheKey[ $cacheKey ] ] = $blob !== false ? $blob : null;
	}

	$result = StatusValue::newGood( $blobsByAddress );
	if ( $errors ) {
		foreach ( $errors as $error ) {
			$result->warning( 'internalerror', $error );
		}
	}
	return $result;
}
/**
 * Fetch and expand the blobs for a list of addresses from the text table.
 *
 * MCR migration note: this corresponds to Revision::fetchText
 *
 * @param string[] $blobAddresses
 * @param int $queryFlags
 *
 * @throws BlobAccessException
 * @return array [ $result, $errors ] A map of blob addresses to successfully fetched blobs
 *         or false if fetch failed, plus an array of errors
 */
private function fetchBlobs( $blobAddresses, $queryFlags ) {
	$textIdToBlobAddress = [];
	$result = [];
	$errors = [];
	foreach ( $blobAddresses as $blobAddress ) {
		list( $schema, $id ) = self::splitBlobAddress( $blobAddress );
		//TODO: MCR: also support 'ex' schema with ExternalStore URLs, plus flags encoded in the URL!
		if ( $schema !== 'tt' ) {
			$errors[$blobAddress] = "Unknown blob address schema: $schema";
			$result[$blobAddress] = false;
			continue;
		}

		// Validate before registering the ID for lookup, so that a malformed address
		// (e.g. "tt:007" or "tt:0") is neither queried nor able to overwrite its own
		// failure entry with a fetched row.
		$textId = intval( $id );
		if ( !$textId || $id !== (string)$textId ) {
			$errors[$blobAddress] = "Bad blob address: $blobAddress";
			$result[$blobAddress] = false;
			continue;
		}

		$textIdToBlobAddress[$textId] = $blobAddress;
	}

	$textIds = array_keys( $textIdToBlobAddress );
	if ( !$textIds ) {
		return [ $result, $errors ];
	}
	// Callers doing updates will pass in READ_LATEST as usual. Since the text/blob tables
	// do not normally get rows changed around, set READ_LATEST_IMMUTABLE in those cases.
	$queryFlags |= DBAccessObjectUtils::hasFlags( $queryFlags, self::READ_LATEST )
		? self::READ_LATEST_IMMUTABLE
		: 0;
	list( $index, $options, $fallbackIndex, $fallbackOptions ) =
		DBAccessObjectUtils::getDBOptions( $queryFlags );
	// Text data is immutable; check replica DBs first.
	$dbConnection = $this->getDBConnection( $index );
	$rows = $dbConnection->select(
		'text',
		[ 'old_id', 'old_text', 'old_flags' ],
		[ 'old_id' => $textIds ],
		__METHOD__,
		$options
	);

	// Fallback to DB_MASTER in some cases if not all the rows were found, using the appropriate
	// options, such as FOR UPDATE to avoid missing rows due to REPEATABLE-READ.
	if ( $dbConnection->numRows( $rows ) !== count( $textIds ) && $fallbackIndex !== null ) {
		$fetchedTextIds = [];
		foreach ( $rows as $row ) {
			$fetchedTextIds[] = $row->old_id;
		}
		$missingTextIds = array_diff( $textIds, $fetchedTextIds );
		$dbConnection = $this->getDBConnection( $fallbackIndex );
		$rowsFromFallback = $dbConnection->select(
			'text',
			[ 'old_id', 'old_text', 'old_flags' ],
			[ 'old_id' => $missingTextIds ],
			__METHOD__,
			$fallbackOptions
		);
		// Iterate over both result sets as one sequence.
		$appendIterator = new AppendIterator();
		$appendIterator->append( $rows );
		$appendIterator->append( $rowsFromFallback );
		$rows = $appendIterator;
	}

	foreach ( $rows as $row ) {
		$blobAddress = $textIdToBlobAddress[$row->old_id];
		$blob = $this->expandBlob( $row->old_text, $row->old_flags, $blobAddress );
		if ( $blob === false ) {
			$errors[$blobAddress] = "Bad data in text row {$row->old_id}.";
		}
		$result[$blobAddress] = $blob;
	}

	// If we're still missing some of the rows, set errors for missing blobs.
	if ( count( $result ) !== count( $blobAddresses ) ) {
		foreach ( $blobAddresses as $blobAddress ) {
			if ( !isset( $result[$blobAddress ] ) ) {
				$errors[$blobAddress] = "Unable to fetch blob at $blobAddress";
				$result[$blobAddress] = false;
			}
		}
	}
	return [ $result, $errors ];
}
/**
 * Get a cache key for a given Blob address.
 *
 * The cache key is constructed in a way that allows cached blobs from the same database
 * to be re-used between wikis. For example, wiki A and wiki B will use the same cache keys
 * for blobs fetched from wiki C.
 *
 * @param string $blobAddress
 * @return string
 */
private function getCacheKey( $blobAddress ) {
	return $this->cache->makeGlobalKey(
		// NOTE(review): the leading key component was lost from this copy of the file;
		// 'SqlBlobStore-blob' is the presumed value. Confirm before deploying, since a
		// different value changes every cache key.
		'SqlBlobStore-blob',
		$this->dbLoadBalancer->resolveDomainID( $this->dbDomain ),
		$blobAddress
	);
}
/**
 * Expand a raw data blob according to the flags given.
 *
 * MCR migration note: this replaces Revision::getRevisionText
 *
 * @note direct use is deprecated, use getBlob() or SlotRecord::getContent() instead.
 * @todo make this private, there should be no need to use this method outside this class.
 *
 * @param string $raw The raw blob data, to be processed according to $flags.
 *        May be the blob itself, or the blob compressed, or just the address
 *        of the actual blob, depending on $flags.
 * @param string|string[] $flags Blob flags, such as 'external' or 'gzip'.
 *        Note that not including 'utf-8' in $flags will cause the data to be decoded
 *        according to the legacy encoding specified via setLegacyEncoding.
 * @param string|null $cacheKey A blob address for use in the cache key. If not given,
 *        caching is disabled.
 *
 * @return false|string The expanded blob or false on failure
 */
public function expandBlob( $raw, $flags, $cacheKey = null ) {
	if ( is_string( $flags ) ) {
		$flags = explode( ',', $flags );
	}

	// Use external methods for external objects, text in table is URL-only then
	if ( in_array( 'external', $flags ) ) {
		$url = $raw;
		$parts = explode( '://', $url, 2 );
		// A usable URL needs both a scheme and a non-empty remainder.
		if ( count( $parts ) == 1 || $parts[1] == '' ) {
			return false;
		}

		if ( $cacheKey ) {
			// The cached value should be decompressed, so handle that and return here.
			return $this->cache->getWithSetCallback(
				$this->getCacheKey( $cacheKey ),
				$this->getCacheTTL(),
				function () use ( $url, $flags ) {
					// Ignore $setOpts; blobs are immutable and negatives are not cached
					$blob = $this->extStoreAccess
						->fetchFromURL( $url, [ 'domain' => $this->dbDomain ] );

					return $blob === false ? false : $this->decompressData( $blob, $flags );
				},
				[ 'pcGroup' => self::TEXT_CACHE_GROUP, 'pcTTL' => WANObjectCache::TTL_PROC_LONG ]
			);
		} else {
			$blob = $this->extStoreAccess->fetchFromURL( $url, [ 'domain' => $this->dbDomain ] );
			return $blob === false ? false : $this->decompressData( $blob, $flags );
		}
	} else {
		return $this->decompressData( $raw, $flags );
	}
}
/**
 * If $wgCompressRevisions is enabled, we will compress data.
 * The input string is modified in place.
 * Return value is the flags field: contains 'gzip' if the
 * data is compressed, and 'utf-8' if we're saving in UTF-8
 * mode.
 *
 * MCR migration note: this replaces Revision::compressRevisionText
 *
 * @note direct use is deprecated!
 * @todo make this private, there should be no need to use this method outside this class.
 *
 * @param mixed &$blob Reference to a text
 *
 * @return string Comma separated list of flags
 */
public function compressData( &$blob ) {
	$blobFlags = [];

	// Revisions not marked as UTF-8 will have legacy decoding applied by decompressData().
	// XXX: if $this->legacyEncoding is not set, we could skip this. That would however be
	// risky, since $this->legacyEncoding being set in the future would lead to data corruption.
	$blobFlags[] = 'utf-8';

	if ( $this->compressBlobs ) {
		if ( function_exists( 'gzdeflate' ) ) {
			$deflated = gzdeflate( $blob );

			if ( $deflated === false ) {
				// Leave $blob uncompressed and do not set the 'gzip' flag.
				wfLogWarning( __METHOD__ . ': gzdeflate() failed' );
			} else {
				$blob = $deflated;
				$blobFlags[] = 'gzip';
			}
		} else {
			wfDebug( __METHOD__ . " -- no zlib support, not compressing\n" );
		}
	}
	return implode( ',', $blobFlags );
}
/**
 * Re-converts revision text according to its flags.
 *
 * MCR migration note: this replaces Revision::decompressRevisionText
 *
 * @note direct use is deprecated, use getBlob() or SlotRecord::getContent() instead.
 * @todo make this private, there should be no need to use this method outside this class.
 *
 * @param string $blob Blob in compressed/encoded form.
 * @param array $blobFlags Compression flags, such as 'gzip'.
 *        Note that not including 'utf-8' in $blobFlags will cause the data to be decoded
 *        according to the legacy encoding specified via setLegacyEncoding.
 *
 * @return string|bool Decompressed text, or false on failure
 */
public function decompressData( $blob, array $blobFlags ) {
	// Revision::decompressRevisionText accepted false here, so defend against that
	Assert::parameterType( 'string', $blob, '$blob' );

	if ( in_array( 'error', $blobFlags ) ) {
		// Error row, return false
		return false;
	}

	if ( in_array( 'gzip', $blobFlags ) ) {
		# Deal with optional compression of archived pages.
		# This can be done periodically via maintenance/compressOld.php, and
		# as pages are saved if $wgCompressRevisions is set.
		$blob = gzinflate( $blob );

		if ( $blob === false ) {
			wfWarn( __METHOD__ . ': gzinflate() failed' );
			return false;
		}
	}

	if ( in_array( 'object', $blobFlags ) ) {
		# Generic compressed storage
		// NOTE(review): unserialize() is only safe on trusted data; this assumes
		// the blob comes from the wiki's own storage - confirm for all callers.
		$obj = unserialize( $blob );
		if ( !is_object( $obj ) ) {
			// Invalid object
			return false;
		}
		$blob = $obj->getText();
	}

	// Needed to support old revisions left over from the 1.4 / 1.5 migration.
	if ( $blob !== false && $this->legacyEncoding
		&& !in_array( 'utf-8', $blobFlags ) && !in_array( 'utf8', $blobFlags )
	) {
		# Old revisions kept around in a legacy encoding?
		# Upconvert on demand.
		# ("utf8" checked for compatibility with some broken
		#  conversion scripts 2008-12-30)
		# Even with //IGNORE iconv can whine about illegal characters in
		# *input* string. We just ignore those too.
		# REF: https://bugs.php.net/bug.php?id=37166
		# REF: https://phabricator.wikimedia.org/T18885
		AtEase::suppressWarnings();
		$blob = iconv( $this->legacyEncoding, 'UTF-8//IGNORE', $blob );
		AtEase::restoreWarnings();
	}

	return $blob;
}
/**
 * Get the text cache TTL
 *
 * MCR migration note: this replaces Revision::getCacheTTL
 *
 * @return int
 */
private function getCacheTTL() {
	if ( $this->cache->getQoS( WANObjectCache::ATTR_EMULATION )
		<= WANObjectCache::QOS_EMULATION_SQL
	) {
		// Do not cache RDBMs blobs in...the RDBMs store
		$ttl = WANObjectCache::TTL_UNCACHEABLE;
	} else {
		// A falsy expiry (e.g. 0) also means "do not cache".
		$ttl = $this->cacheExpiry ?: WANObjectCache::TTL_UNCACHEABLE;
	}

	return $ttl;
}
/**
 * Returns an ID corresponding to the old_id field in the text table, corresponding
 * to the given $address.
 *
 * Currently, $address must start with 'tt:' followed by a decimal integer representing
 * the old_id; if $address does not start with 'tt:', null is returned. However,
 * the implementation may change to insert rows into the text table on the fly.
 * This implies that this method cannot be static.
 *
 * @note This method exists for use with the text table based storage schema.
 * It should not be assumed that it will function with all future kinds of content addresses.
 *
 * @deprecated since 1.31, so don't assume that all blob addresses refer to a row in the text
 * table. This method should become private once the relevant refactoring in WikiPage is
 * complete.
 *
 * @param string $address
 *
 * @throws InvalidArgumentException If the address or its ID part is malformed
 * @return int|null
 */
public function getTextIdFromAddress( $address ) {
	list( $schema, $id ) = self::splitBlobAddress( $address );

	if ( $schema !== 'tt' ) {
		return null;
	}

	$textId = intval( $id );

	// Reject zero and anything that does not round-trip, e.g. leading zeros.
	if ( !$textId || $id !== (string)$textId ) {
		throw new InvalidArgumentException( "Malformed text_id: $id" );
	}

	return $textId;
}
/**
 * Returns an address referring to content stored in the text table row with the given ID.
 * The address schema for blobs stored in the text table is "tt:" followed by an integer
 * that corresponds to a value of the old_id field.
 *
 * @deprecated since 1.31. This method should become private once the relevant refactoring
 * in WikiPage is complete.
 *
 * @param int $id
 *
 * @return string
 */
public static function makeAddressFromTextId( $id ) {
	return 'tt:' . $id;
}
/**
 * Splits a blob address into three parts: the schema, the ID, and parameters/flags.
 *
 * @param string $address
 *
 * @throws InvalidArgumentException
 * @return array [ $schema, $id, $parameters ], with $parameters being an assoc array.
 */
public static function splitBlobAddress( $address ) {
	if ( !preg_match( '/^(\w+):(\w+)(\?(.*))?$/', $address, $m ) ) {
		throw new InvalidArgumentException( "Bad blob address: $address" );
	}

	// The schema is matched case-insensitively; the ID is kept verbatim.
	$schema = strtolower( $m[1] );
	$id = $m[2];
	$parameters = isset( $m[4] ) ? wfCgiToArray( $m[4] ) : [];

	return [ $schema, $id, $parameters ];
}
/**
 * Whether this store currently cannot accept writes.
 *
 * @return bool True if external storage is in use and read-only, or if the
 *         load balancer reports a read-only reason.
 */
public function isReadOnly() {
	if ( $this->useExternalStore && $this->extStoreAccess->isReadOnly() ) {
		return true;
	}

	return ( $this->getDBLoadBalancer()->getReadOnlyReason() !== false );
}