From: Umherirrender Date: Mon, 25 Mar 2019 19:29:47 +0000 (+0100) Subject: Move HistoryBlob classes to own files X-Git-Tag: 1.34.0-rc.0~2288^2 X-Git-Url: http://git.cyclocoop.org/%7B%7B%20url_for%28%27admin_vote_del%27%2C%20idvote=vote.voteid%29%20%7D%7D?a=commitdiff_plain;h=7ede3c619a96a409af12a9cc4dd542e252813917;p=lhc%2Fweb%2Fwiklou.git Move HistoryBlob classes to own files Move all into own subfolder Change-Id: Iccf1bb9a2c8927c1b4dc0952d59d745109a71f76 --- diff --git a/.phpcs.xml b/.phpcs.xml index d1e54a706c..ce0eac42a3 100644 --- a/.phpcs.xml +++ b/.phpcs.xml @@ -215,7 +215,6 @@ */includes/Feed\.php */includes/filerepo/file/LocalFile\.php */includes/gallery/PackedOverlayImageGallery\.php - */includes/HistoryBlob\.php */includes/htmlform/HTMLFormElement\.php */includes/libs/filebackend/FileBackendStore\.php */includes/libs/filebackend/FSFileBackend\.php diff --git a/autoload.php b/autoload.php index 94a99faaa7..5f031d03ad 100644 --- a/autoload.php +++ b/autoload.php @@ -298,7 +298,7 @@ $wgAutoloadLocalClasses = [ 'ComposerVendorHtaccessCreator' => __DIR__ . '/includes/composer/ComposerVendorHtaccessCreator.php', 'ComposerVersionNormalizer' => __DIR__ . '/includes/composer/ComposerVersionNormalizer.php', 'CompressOld' => __DIR__ . '/maintenance/storage/compressOld.php', - 'ConcatenatedGzipHistoryBlob' => __DIR__ . '/includes/HistoryBlob.php', + 'ConcatenatedGzipHistoryBlob' => __DIR__ . '/includes/historyblob/ConcatenatedGzipHistoryBlob.php', 'Config' => __DIR__ . '/includes/config/Config.php', 'ConfigException' => __DIR__ . '/includes/config/ConfigException.php', 'ConfigFactory' => __DIR__ . '/includes/config/ConfigFactory.php', @@ -398,7 +398,7 @@ $wgAutoloadLocalClasses = [ 'Diff' => __DIR__ . '/includes/diff/DairikiDiff.php', 'DiffEngine' => __DIR__ . '/includes/diff/DiffEngine.php', 'DiffFormatter' => __DIR__ . '/includes/diff/DiffFormatter.php', - 'DiffHistoryBlob' => __DIR__ . '/includes/HistoryBlob.php', + 'DiffHistoryBlob' => __DIR__ . '/includes/historyblob/DiffHistoryBlob.php', 'DiffOp' => __DIR__ . '/includes/diff/DairikiDiff.php', 'DiffOpAdd' => __DIR__ . '/includes/diff/DairikiDiff.php', 'DiffOpChange' => __DIR__ . '/includes/diff/DairikiDiff.php', @@ -628,9 +628,9 @@ $wgAutoloadLocalClasses = [ 'HashSiteStore' => __DIR__ . '/includes/site/HashSiteStore.php', 'HashtableReplacer' => __DIR__ . '/includes/libs/replacers/HashtableReplacer.php', 'HistoryAction' => __DIR__ . '/includes/actions/HistoryAction.php', - 'HistoryBlob' => __DIR__ . '/includes/HistoryBlob.php', - 'HistoryBlobCurStub' => __DIR__ . '/includes/HistoryBlob.php', - 'HistoryBlobStub' => __DIR__ . '/includes/HistoryBlob.php', + 'HistoryBlob' => __DIR__ . '/includes/historyblob/HistoryBlob.php', + 'HistoryBlobCurStub' => __DIR__ . '/includes/historyblob/HistoryBlobCurStub.php', + 'HistoryBlobStub' => __DIR__ . '/includes/historyblob/HistoryBlobStub.php', 'HistoryPager' => __DIR__ . '/includes/actions/pagers/HistoryPager.php', 'Hooks' => __DIR__ . '/includes/Hooks.php', 'Html' => __DIR__ . '/includes/Html.php', @@ -1707,8 +1707,8 @@ $wgAutoloadLocalClasses = [ 'ZhConverter' => __DIR__ . '/languages/classes/LanguageZh.php', 'ZipDirectoryReader' => __DIR__ . '/includes/utils/ZipDirectoryReader.php', 'ZipDirectoryReaderError' => __DIR__ . '/includes/utils/ZipDirectoryReaderError.php', - 'concatenatedgziphistoryblob' => __DIR__ . '/includes/HistoryBlob.php', - 'historyblobcurstub' => __DIR__ . '/includes/HistoryBlob.php', - 'historyblobstub' => __DIR__ . '/includes/HistoryBlob.php', + 'concatenatedgziphistoryblob' => __DIR__ . '/includes/historyblob/ConcatenatedGzipHistoryBlob.php', + 'historyblobcurstub' => __DIR__ . '/includes/historyblob/HistoryBlobCurStub.php', + 'historyblobstub' => __DIR__ . '/includes/historyblob/HistoryBlobStub.php', 'profile_point' => __DIR__ . '/profileinfo.php', ]; diff --git a/includes/HistoryBlob.php b/includes/HistoryBlob.php deleted file mode 100644 index bca6c7e5bc..0000000000 --- a/includes/HistoryBlob.php +++ /dev/null @@ -1,711 +0,0 @@ -uncompress(); - $hash = md5( $text ); - if ( !isset( $this->mItems[$hash] ) ) { - $this->mItems[$hash] = $text; - $this->mSize += strlen( $text ); - } - return $hash; - } - - /** - * @param string $hash - * @return array|bool - */ - public function getItem( $hash ) { - $this->uncompress(); - if ( array_key_exists( $hash, $this->mItems ) ) { - return $this->mItems[$hash]; - } else { - return false; - } - } - - /** - * @param string $text - * @return void - */ - public function setText( $text ) { - $this->uncompress(); - $this->mDefaultHash = $this->addItem( $text ); - } - - /** - * @return array|bool - */ - public function getText() { - $this->uncompress(); - return $this->getItem( $this->mDefaultHash ); - } - - /** - * Remove an item - * - * @param string $hash - */ - public function removeItem( $hash ) { - $this->mSize -= strlen( $this->mItems[$hash] ); - unset( $this->mItems[$hash] ); - } - - /** - * Compress the bulk data in the object - */ - public function compress() { - if ( !$this->mCompressed ) { - $this->mItems = gzdeflate( serialize( $this->mItems ) ); - $this->mCompressed = true; - } - } - - /** - * Uncompress bulk data - */ - public function uncompress() { - if ( $this->mCompressed ) { - $this->mItems = unserialize( gzinflate( $this->mItems ) ); - $this->mCompressed = false; - } - } - - /** - * @return array - */ - function __sleep() { - $this->compress(); - return [ 'mVersion', 'mCompressed', 'mItems', 'mDefaultHash' ]; - } - - function __wakeup() { - $this->uncompress(); - } - - /** - * Helper function for compression jobs - * Returns true until the object is "full" and ready to be committed - * - * @return bool - */ - public function isHappy() { - return $this->mSize < $this->mMaxSize - && count( $this->mItems ) < $this->mMaxCount; - } -} - -/** - * Pointer object for an item within a CGZ blob stored in the text table. - */ -class HistoryBlobStub { - /** - * @var array One-step cache variable to hold base blobs; operations that - * pull multiple revisions may often pull multiple times from the same - * blob. By keeping the last-used one open, we avoid redundant - * unserialization and decompression overhead. - */ - protected static $blobCache = []; - - /** @var int */ - public $mOldId; - - /** @var string */ - public $mHash; - - /** @var string */ - public $mRef; - - /** - * @param string $hash The content hash of the text - * @param int $oldid The old_id for the CGZ object - */ - function __construct( $hash = '', $oldid = 0 ) { - $this->mHash = $hash; - } - - /** - * Sets the location (old_id) of the main object to which this object - * points - * @param int $id - */ - function setLocation( $id ) { - $this->mOldId = $id; - } - - /** - * Sets the location (old_id) of the referring object - * @param string $id - */ - function setReferrer( $id ) { - $this->mRef = $id; - } - - /** - * Gets the location of the referring object - * @return string - */ - function getReferrer() { - return $this->mRef; - } - - /** - * @return string|false - */ - function getText() { - if ( isset( self::$blobCache[$this->mOldId] ) ) { - $obj = self::$blobCache[$this->mOldId]; - } else { - $dbr = wfGetDB( DB_REPLICA ); - $row = $dbr->selectRow( - 'text', - [ 'old_flags', 'old_text' ], - [ 'old_id' => $this->mOldId ] - ); - - if ( !$row ) { - return false; - } - - $flags = explode( ',', $row->old_flags ); - if ( in_array( 'external', $flags ) ) { - $url = $row->old_text; - $parts = explode( '://', $url, 2 ); - if ( !isset( $parts[1] ) || $parts[1] == '' ) { - return false; - } - $row->old_text = ExternalStore::fetchFromURL( $url ); - - } - - if ( !in_array( 'object', $flags ) ) { - return false; - } - - if ( in_array( 'gzip', $flags ) ) { - // This shouldn't happen, but a bug in the compress script - // may at times gzip-compress a HistoryBlob object row. - $obj = unserialize( gzinflate( $row->old_text ) ); - } else { - $obj = unserialize( $row->old_text ); - } - - if ( !is_object( $obj ) ) { - // Correct for old double-serialization bug. - $obj = unserialize( $obj ); - } - - // Save this item for reference; if pulling many - // items in a row we'll likely use it again. - $obj->uncompress(); - self::$blobCache = [ $this->mOldId => $obj ]; - } - - return $obj->getItem( $this->mHash ); - } - - /** - * Get the content hash - * - * @return string - */ - function getHash() { - return $this->mHash; - } -} - -/** - * To speed up conversion from 1.4 to 1.5 schema, text rows can refer to the - * leftover cur table as the backend. This avoids expensively copying hundreds - * of megabytes of data during the conversion downtime. - * - * Serialized HistoryBlobCurStub objects will be inserted into the text table - * on conversion if $wgLegacySchemaConversion is set to true. - */ -class HistoryBlobCurStub { - /** @var int */ - public $mCurId; - - /** - * @param int $curid The cur_id pointed to - */ - function __construct( $curid = 0 ) { - $this->mCurId = $curid; - } - - /** - * Sets the location (cur_id) of the main object to which this object - * points - * - * @param int $id - */ - function setLocation( $id ) { - $this->mCurId = $id; - } - - /** - * @return string|bool - */ - function getText() { - $dbr = wfGetDB( DB_REPLICA ); - $row = $dbr->selectRow( 'cur', [ 'cur_text' ], [ 'cur_id' => $this->mCurId ] ); - if ( !$row ) { - return false; - } - return $row->cur_text; - } -} - -/** - * Diff-based history compression - * Requires xdiff 1.5+ and zlib - */ -class DiffHistoryBlob implements HistoryBlob { - /** @var array Uncompressed item cache */ - public $mItems = []; - - /** @var int Total uncompressed size */ - public $mSize = 0; - - /** - * @var array Array of diffs. If a diff D from A to B is notated D = B - A, - * and Z is an empty string: - * - * { item[map[i]] - item[map[i-1]] where i > 0 - * diff[i] = { - * { item[map[i]] - Z where i = 0 - */ - public $mDiffs; - - /** @var array The diff map, see above */ - public $mDiffMap; - - /** @var int The key for getText() - */ - public $mDefaultKey; - - /** @var string Compressed storage */ - public $mCompressed; - - /** @var bool True if the object is locked against further writes */ - public $mFrozen = false; - - /** - * @var int The maximum uncompressed size before the object becomes sad - * Should be less than max_allowed_packet - */ - public $mMaxSize = 10000000; - - /** @var int The maximum number of text items before the object becomes sad */ - public $mMaxCount = 100; - - /** Constants from xdiff.h */ - const XDL_BDOP_INS = 1; - const XDL_BDOP_CPY = 2; - const XDL_BDOP_INSB = 3; - - function __construct() { - if ( !function_exists( 'gzdeflate' ) ) { - throw new MWException( "Need zlib support to read or write DiffHistoryBlob\n" ); - } - } - - /** - * @throws MWException - * @param string $text - * @return int - */ - function addItem( $text ) { - if ( $this->mFrozen ) { - throw new MWException( __METHOD__ . ": Cannot add more items after sleep/wakeup" ); - } - - $this->mItems[] = $text; - $this->mSize += strlen( $text ); - $this->mDiffs = null; // later - return count( $this->mItems ) - 1; - } - - /** - * @param string $key - * @return string - */ - function getItem( $key ) { - return $this->mItems[$key]; - } - - /** - * @param string $text - */ - function setText( $text ) { - $this->mDefaultKey = $this->addItem( $text ); - } - - /** - * @return string - */ - function getText() { - return $this->getItem( $this->mDefaultKey ); - } - - /** - * @throws MWException - */ - function compress() { - if ( !function_exists( 'xdiff_string_rabdiff' ) ) { - throw new MWException( "Need xdiff 1.5+ support to write DiffHistoryBlob\n" ); - } - if ( isset( $this->mDiffs ) ) { - // Already compressed - return; - } - if ( $this->mItems === [] ) { - return; - } - - // Create two diff sequences: one for main text and one for small text - $sequences = [ - 'small' => [ - 'tail' => '', - 'diffs' => [], - 'map' => [], - ], - 'main' => [ - 'tail' => '', - 'diffs' => [], - 'map' => [], - ], - ]; - $smallFactor = 0.5; - - $mItemsCount = count( $this->mItems ); - for ( $i = 0; $i < $mItemsCount; $i++ ) { - $text = $this->mItems[$i]; - if ( $i == 0 ) { - $seqName = 'main'; - } else { - $mainTail = $sequences['main']['tail']; - if ( strlen( $text ) < strlen( $mainTail ) * $smallFactor ) { - $seqName = 'small'; - } else { - $seqName = 'main'; - } - } - $seq =& $sequences[$seqName]; - $tail = $seq['tail']; - $diff = $this->diff( $tail, $text ); - $seq['diffs'][] = $diff; - $seq['map'][] = $i; - $seq['tail'] = $text; - } - unset( $seq ); // unlink dangerous alias - - // Knit the sequences together - $tail = ''; - $this->mDiffs = []; - $this->mDiffMap = []; - foreach ( $sequences as $seq ) { - if ( $seq['diffs'] === [] ) { - continue; - } - if ( $tail === '' ) { - $this->mDiffs[] = $seq['diffs'][0]; - } else { - $head = $this->patch( '', $seq['diffs'][0] ); - $this->mDiffs[] = $this->diff( $tail, $head ); - } - $this->mDiffMap[] = $seq['map'][0]; - $diffsCount = count( $seq['diffs'] ); - for ( $i = 1; $i < $diffsCount; $i++ ) { - $this->mDiffs[] = $seq['diffs'][$i]; - $this->mDiffMap[] = $seq['map'][$i]; - } - $tail = $seq['tail']; - } - } - - /** - * @param string $t1 - * @param string $t2 - * @return string - */ - function diff( $t1, $t2 ) { - # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff - # "String is not zero-terminated" - Wikimedia\suppressWarnings(); - $diff = xdiff_string_rabdiff( $t1, $t2 ) . ''; - Wikimedia\restoreWarnings(); - return $diff; - } - - /** - * @param string $base - * @param string $diff - * @return bool|string - */ - function patch( $base, $diff ) { - if ( function_exists( 'xdiff_string_bpatch' ) ) { - Wikimedia\suppressWarnings(); - $text = xdiff_string_bpatch( $base, $diff ) . ''; - Wikimedia\restoreWarnings(); - return $text; - } - - # Pure PHP implementation - - $header = unpack( 'Vofp/Vcsize', substr( $diff, 0, 8 ) ); - - # Check the checksum if hash extension is available - $ofp = $this->xdiffAdler32( $base ); - if ( $ofp !== false && $ofp !== substr( $diff, 0, 4 ) ) { - wfDebug( __METHOD__ . ": incorrect base checksum\n" ); - return false; - } - if ( $header['csize'] != strlen( $base ) ) { - wfDebug( __METHOD__ . ": incorrect base length\n" ); - return false; - } - - $p = 8; - $out = ''; - while ( $p < strlen( $diff ) ) { - $x = unpack( 'Cop', substr( $diff, $p, 1 ) ); - $op = $x['op']; - ++$p; - switch ( $op ) { - case self::XDL_BDOP_INS: - $x = unpack( 'Csize', substr( $diff, $p, 1 ) ); - $p++; - $out .= substr( $diff, $p, $x['size'] ); - $p += $x['size']; - break; - case self::XDL_BDOP_INSB: - $x = unpack( 'Vcsize', substr( $diff, $p, 4 ) ); - $p += 4; - $out .= substr( $diff, $p, $x['csize'] ); - $p += $x['csize']; - break; - case self::XDL_BDOP_CPY: - $x = unpack( 'Voff/Vcsize', substr( $diff, $p, 8 ) ); - $p += 8; - $out .= substr( $base, $x['off'], $x['csize'] ); - break; - default: - wfDebug( __METHOD__ . ": invalid op\n" ); - return false; - } - } - return $out; - } - - /** - * Compute a binary "Adler-32" checksum as defined by LibXDiff, i.e. with - * the bytes backwards and initialised with 0 instead of 1. See T36428. - * - * @param string $s - * @return string|bool False if the hash extension is not available - */ - function xdiffAdler32( $s ) { - if ( !function_exists( 'hash' ) ) { - return false; - } - - static $init; - if ( $init === null ) { - $init = str_repeat( "\xf0", 205 ) . "\xee" . str_repeat( "\xf0", 67 ) . "\x02"; - } - - // The real Adler-32 checksum of $init is zero, so it initialises the - // state to zero, as it is at the start of LibXDiff's checksum - // algorithm. Appending the subject string then simulates LibXDiff. - return strrev( hash( 'adler32', $init . $s, true ) ); - } - - function uncompress() { - if ( !$this->mDiffs ) { - return; - } - $tail = ''; - $mDiffsCount = count( $this->mDiffs ); - for ( $diffKey = 0; $diffKey < $mDiffsCount; $diffKey++ ) { - $textKey = $this->mDiffMap[$diffKey]; - $text = $this->patch( $tail, $this->mDiffs[$diffKey] ); - $this->mItems[$textKey] = $text; - $tail = $text; - } - } - - /** - * @return array - */ - function __sleep() { - $this->compress(); - if ( $this->mItems === [] ) { - $info = false; - } else { - // Take forward differences to improve the compression ratio for sequences - $map = ''; - $prev = 0; - foreach ( $this->mDiffMap as $i ) { - if ( $map !== '' ) { - $map .= ','; - } - $map .= $i - $prev; - $prev = $i; - } - $info = [ - 'diffs' => $this->mDiffs, - 'map' => $map - ]; - } - if ( isset( $this->mDefaultKey ) ) { - $info['default'] = $this->mDefaultKey; - } - $this->mCompressed = gzdeflate( serialize( $info ) ); - return [ 'mCompressed' ]; - } - - function __wakeup() { - // addItem() doesn't work if mItems is partially filled from mDiffs - $this->mFrozen = true; - $info = unserialize( gzinflate( $this->mCompressed ) ); - unset( $this->mCompressed ); - - if ( !$info ) { - // Empty object - return; - } - - if ( isset( $info['default'] ) ) { - $this->mDefaultKey = $info['default']; - } - $this->mDiffs = $info['diffs']; - if ( isset( $info['base'] ) ) { - // Old format - $this->mDiffMap = range( 0, count( $this->mDiffs ) - 1 ); - array_unshift( $this->mDiffs, - pack( 'VVCV', 0, 0, self::XDL_BDOP_INSB, strlen( $info['base'] ) ) . - $info['base'] ); - } else { - // New format - $map = explode( ',', $info['map'] ); - $cur = 0; - $this->mDiffMap = []; - foreach ( $map as $i ) { - $cur += $i; - $this->mDiffMap[] = $cur; - } - } - $this->uncompress(); - } - - /** - * Helper function for compression jobs - * Returns true until the object is "full" and ready to be committed - * - * @return bool - */ - function isHappy() { - return $this->mSize < $this->mMaxSize - && count( $this->mItems ) < $this->mMaxCount; - } - -} - -// phpcs:ignore Generic.CodeAnalysis.UnconditionalIfStatement.Found -if ( false ) { - // Blobs generated by MediaWiki < 1.5 on PHP 4 were serialized with the - // class name coerced to lowercase. We can improve efficiency by adding - // autoload entries for the lowercase variants of these classes (T166759). - // The code below is never executed, but it is picked up by the AutoloadGenerator - // parser, which scans for class_alias() calls. - class_alias( ConcatenatedGzipHistoryBlob::class, 'concatenatedgziphistoryblob' ); - class_alias( HistoryBlobCurStub::class, 'historyblobcurstub' ); - class_alias( HistoryBlobStub::class, 'historyblobstub' ); -} diff --git a/includes/historyblob/ConcatenatedGzipHistoryBlob.php b/includes/historyblob/ConcatenatedGzipHistoryBlob.php new file mode 100644 index 0000000000..f6ca2f5a36 --- /dev/null +++ b/includes/historyblob/ConcatenatedGzipHistoryBlob.php @@ -0,0 +1,146 @@ +uncompress(); + $hash = md5( $text ); + if ( !isset( $this->mItems[$hash] ) ) { + $this->mItems[$hash] = $text; + $this->mSize += strlen( $text ); + } + return $hash; + } + + /** + * @param string $hash + * @return array|bool + */ + public function getItem( $hash ) { + $this->uncompress(); + if ( array_key_exists( $hash, $this->mItems ) ) { + return $this->mItems[$hash]; + } else { + return false; + } + } + + /** + * @param string $text + * @return void + */ + public function setText( $text ) { + $this->uncompress(); + $this->mDefaultHash = $this->addItem( $text ); + } + + /** + * @return array|bool + */ + public function getText() { + $this->uncompress(); + return $this->getItem( $this->mDefaultHash ); + } + + /** + * Remove an item + * + * @param string $hash + */ + public function removeItem( $hash ) { + $this->mSize -= strlen( $this->mItems[$hash] ); + unset( $this->mItems[$hash] ); + } + + /** + * Compress the bulk data in the object + */ + public function compress() { + if ( !$this->mCompressed ) { + $this->mItems = gzdeflate( serialize( $this->mItems ) ); + $this->mCompressed = true; + } + } + + /** + * Uncompress bulk data + */ + public function uncompress() { + if ( $this->mCompressed ) { + $this->mItems = unserialize( gzinflate( $this->mItems ) ); + $this->mCompressed = false; + } + } + + /** + * @return array + */ + function __sleep() { + $this->compress(); + return [ 'mVersion', 'mCompressed', 'mItems', 'mDefaultHash' ]; + } + + function __wakeup() { + $this->uncompress(); + } + + /** + * Helper function for compression jobs + * Returns true until the object is "full" and ready to be committed + * + * @return bool + */ + public function isHappy() { + return $this->mSize < $this->mMaxSize + && count( $this->mItems ) < $this->mMaxCount; + } +} + +// phpcs:ignore Generic.CodeAnalysis.UnconditionalIfStatement.Found +if ( false ) { + // Blobs generated by MediaWiki < 1.5 on PHP 4 were serialized with the + // class name coerced to lowercase. We can improve efficiency by adding + // autoload entries for the lowercase variants of these classes (T166759). + // The code below is never executed, but it is picked up by the AutoloadGenerator + // parser, which scans for class_alias() calls. + class_alias( ConcatenatedGzipHistoryBlob::class, 'concatenatedgziphistoryblob' ); +} diff --git a/includes/historyblob/DiffHistoryBlob.php b/includes/historyblob/DiffHistoryBlob.php new file mode 100644 index 0000000000..8d92fe5312 --- /dev/null +++ b/includes/historyblob/DiffHistoryBlob.php @@ -0,0 +1,377 @@ + 0 + * diff[i] = { + * { item[map[i]] - Z where i = 0 + */ + public $mDiffs; + + /** @var array The diff map, see above */ + public $mDiffMap; + + /** @var int The key for getText() + */ + public $mDefaultKey; + + /** @var string Compressed storage */ + public $mCompressed; + + /** @var bool True if the object is locked against further writes */ + public $mFrozen = false; + + /** + * @var int The maximum uncompressed size before the object becomes sad + * Should be less than max_allowed_packet + */ + public $mMaxSize = 10000000; + + /** @var int The maximum number of text items before the object becomes sad */ + public $mMaxCount = 100; + + /** Constants from xdiff.h */ + const XDL_BDOP_INS = 1; + const XDL_BDOP_CPY = 2; + const XDL_BDOP_INSB = 3; + + function __construct() { + if ( !function_exists( 'gzdeflate' ) ) { + throw new MWException( "Need zlib support to read or write DiffHistoryBlob\n" ); + } + } + + /** + * @throws MWException + * @param string $text + * @return int + */ + function addItem( $text ) { + if ( $this->mFrozen ) { + throw new MWException( __METHOD__ . ": Cannot add more items after sleep/wakeup" ); + } + + $this->mItems[] = $text; + $this->mSize += strlen( $text ); + $this->mDiffs = null; // later + return count( $this->mItems ) - 1; + } + + /** + * @param string $key + * @return string + */ + function getItem( $key ) { + return $this->mItems[$key]; + } + + /** + * @param string $text + */ + function setText( $text ) { + $this->mDefaultKey = $this->addItem( $text ); + } + + /** + * @return string + */ + function getText() { + return $this->getItem( $this->mDefaultKey ); + } + + /** + * @throws MWException + */ + function compress() { + if ( !function_exists( 'xdiff_string_rabdiff' ) ) { + throw new MWException( "Need xdiff 1.5+ support to write DiffHistoryBlob\n" ); + } + if ( isset( $this->mDiffs ) ) { + // Already compressed + return; + } + if ( $this->mItems === [] ) { + return; + } + + // Create two diff sequences: one for main text and one for small text + $sequences = [ + 'small' => [ + 'tail' => '', + 'diffs' => [], + 'map' => [], + ], + 'main' => [ + 'tail' => '', + 'diffs' => [], + 'map' => [], + ], + ]; + $smallFactor = 0.5; + + $mItemsCount = count( $this->mItems ); + for ( $i = 0; $i < $mItemsCount; $i++ ) { + $text = $this->mItems[$i]; + if ( $i == 0 ) { + $seqName = 'main'; + } else { + $mainTail = $sequences['main']['tail']; + if ( strlen( $text ) < strlen( $mainTail ) * $smallFactor ) { + $seqName = 'small'; + } else { + $seqName = 'main'; + } + } + $seq =& $sequences[$seqName]; + $tail = $seq['tail']; + $diff = $this->diff( $tail, $text ); + $seq['diffs'][] = $diff; + $seq['map'][] = $i; + $seq['tail'] = $text; + } + unset( $seq ); // unlink dangerous alias + + // Knit the sequences together + $tail = ''; + $this->mDiffs = []; + $this->mDiffMap = []; + foreach ( $sequences as $seq ) { + if ( $seq['diffs'] === [] ) { + continue; + } + if ( $tail === '' ) { + $this->mDiffs[] = $seq['diffs'][0]; + } else { + $head = $this->patch( '', $seq['diffs'][0] ); + $this->mDiffs[] = $this->diff( $tail, $head ); + } + $this->mDiffMap[] = $seq['map'][0]; + $diffsCount = count( $seq['diffs'] ); + for ( $i = 1; $i < $diffsCount; $i++ ) { + $this->mDiffs[] = $seq['diffs'][$i]; + $this->mDiffMap[] = $seq['map'][$i]; + } + $tail = $seq['tail']; + } + } + + /** + * @param string $t1 + * @param string $t2 + * @return string + */ + function diff( $t1, $t2 ) { + # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff + # "String is not zero-terminated" + Wikimedia\suppressWarnings(); + $diff = xdiff_string_rabdiff( $t1, $t2 ) . ''; + Wikimedia\restoreWarnings(); + return $diff; + } + + /** + * @param string $base + * @param string $diff + * @return bool|string + */ + function patch( $base, $diff ) { + if ( function_exists( 'xdiff_string_bpatch' ) ) { + Wikimedia\suppressWarnings(); + $text = xdiff_string_bpatch( $base, $diff ) . ''; + Wikimedia\restoreWarnings(); + return $text; + } + + # Pure PHP implementation + + $header = unpack( 'Vofp/Vcsize', substr( $diff, 0, 8 ) ); + + # Check the checksum if hash extension is available + $ofp = $this->xdiffAdler32( $base ); + if ( $ofp !== false && $ofp !== substr( $diff, 0, 4 ) ) { + wfDebug( __METHOD__ . ": incorrect base checksum\n" ); + return false; + } + if ( $header['csize'] != strlen( $base ) ) { + wfDebug( __METHOD__ . ": incorrect base length\n" ); + return false; + } + + $p = 8; + $out = ''; + while ( $p < strlen( $diff ) ) { + $x = unpack( 'Cop', substr( $diff, $p, 1 ) ); + $op = $x['op']; + ++$p; + switch ( $op ) { + case self::XDL_BDOP_INS: + $x = unpack( 'Csize', substr( $diff, $p, 1 ) ); + $p++; + $out .= substr( $diff, $p, $x['size'] ); + $p += $x['size']; + break; + case self::XDL_BDOP_INSB: + $x = unpack( 'Vcsize', substr( $diff, $p, 4 ) ); + $p += 4; + $out .= substr( $diff, $p, $x['csize'] ); + $p += $x['csize']; + break; + case self::XDL_BDOP_CPY: + $x = unpack( 'Voff/Vcsize', substr( $diff, $p, 8 ) ); + $p += 8; + $out .= substr( $base, $x['off'], $x['csize'] ); + break; + default: + wfDebug( __METHOD__ . ": invalid op\n" ); + return false; + } + } + return $out; + } + + /** + * Compute a binary "Adler-32" checksum as defined by LibXDiff, i.e. with + * the bytes backwards and initialised with 0 instead of 1. See T36428. + * + * @param string $s + * @return string|bool False if the hash extension is not available + */ + function xdiffAdler32( $s ) { + if ( !function_exists( 'hash' ) ) { + return false; + } + + static $init; + if ( $init === null ) { + $init = str_repeat( "\xf0", 205 ) . "\xee" . str_repeat( "\xf0", 67 ) . "\x02"; + } + + // The real Adler-32 checksum of $init is zero, so it initialises the + // state to zero, as it is at the start of LibXDiff's checksum + // algorithm. Appending the subject string then simulates LibXDiff. + return strrev( hash( 'adler32', $init . $s, true ) ); + } + + function uncompress() { + if ( !$this->mDiffs ) { + return; + } + $tail = ''; + $mDiffsCount = count( $this->mDiffs ); + for ( $diffKey = 0; $diffKey < $mDiffsCount; $diffKey++ ) { + $textKey = $this->mDiffMap[$diffKey]; + $text = $this->patch( $tail, $this->mDiffs[$diffKey] ); + $this->mItems[$textKey] = $text; + $tail = $text; + } + } + + /** + * @return array + */ + function __sleep() { + $this->compress(); + if ( $this->mItems === [] ) { + $info = false; + } else { + // Take forward differences to improve the compression ratio for sequences + $map = ''; + $prev = 0; + foreach ( $this->mDiffMap as $i ) { + if ( $map !== '' ) { + $map .= ','; + } + $map .= $i - $prev; + $prev = $i; + } + $info = [ + 'diffs' => $this->mDiffs, + 'map' => $map + ]; + } + if ( isset( $this->mDefaultKey ) ) { + $info['default'] = $this->mDefaultKey; + } + $this->mCompressed = gzdeflate( serialize( $info ) ); + return [ 'mCompressed' ]; + } + + function __wakeup() { + // addItem() doesn't work if mItems is partially filled from mDiffs + $this->mFrozen = true; + $info = unserialize( gzinflate( $this->mCompressed ) ); + unset( $this->mCompressed ); + + if ( !$info ) { + // Empty object + return; + } + + if ( isset( $info['default'] ) ) { + $this->mDefaultKey = $info['default']; + } + $this->mDiffs = $info['diffs']; + if ( isset( $info['base'] ) ) { + // Old format + $this->mDiffMap = range( 0, count( $this->mDiffs ) - 1 ); + array_unshift( $this->mDiffs, + pack( 'VVCV', 0, 0, self::XDL_BDOP_INSB, strlen( $info['base'] ) ) . + $info['base'] ); + } else { + // New format + $map = explode( ',', $info['map'] ); + $cur = 0; + $this->mDiffMap = []; + foreach ( $map as $i ) { + $cur += $i; + $this->mDiffMap[] = $cur; + } + } + $this->uncompress(); + } + + /** + * Helper function for compression jobs + * Returns true until the object is "full" and ready to be committed + * + * @return bool + */ + function isHappy() { + return $this->mSize < $this->mMaxSize + && count( $this->mItems ) < $this->mMaxCount; + } + +} diff --git a/includes/historyblob/HistoryBlob.php b/includes/historyblob/HistoryBlob.php new file mode 100644 index 0000000000..36c7c8f75e --- /dev/null +++ b/includes/historyblob/HistoryBlob.php @@ -0,0 +1,67 @@ +mCurId = $curid; + } + + /** + * Sets the location (cur_id) of the main object to which this object + * points + * + * @param int $id + */ + function setLocation( $id ) { + $this->mCurId = $id; + } + + /** + * @return string|bool + */ + function getText() { + $dbr = wfGetDB( DB_REPLICA ); + $row = $dbr->selectRow( 'cur', [ 'cur_text' ], [ 'cur_id' => $this->mCurId ] ); + if ( !$row ) { + return false; + } + return $row->cur_text; + } +} + +// phpcs:ignore Generic.CodeAnalysis.UnconditionalIfStatement.Found +if ( false ) { + // Blobs generated by MediaWiki < 1.5 on PHP 4 were serialized with the + // class name coerced to lowercase. We can improve efficiency by adding + // autoload entries for the lowercase variants of these classes (T166759). + // The code below is never executed, but it is picked up by the AutoloadGenerator + // parser, which scans for class_alias() calls. + class_alias( HistoryBlobCurStub::class, 'historyblobcurstub' ); +} diff --git a/includes/historyblob/HistoryBlobStub.php b/includes/historyblob/HistoryBlobStub.php new file mode 100644 index 0000000000..4995d3b3f0 --- /dev/null +++ b/includes/historyblob/HistoryBlobStub.php @@ -0,0 +1,150 @@ +mHash = $hash; + } + + /** + * Sets the location (old_id) of the main object to which this object + * points + * @param int $id + */ + function setLocation( $id ) { + $this->mOldId = $id; + } + + /** + * Sets the location (old_id) of the referring object + * @param string $id + */ + function setReferrer( $id ) { + $this->mRef = $id; + } + + /** + * Gets the location of the referring object + * @return string + */ + function getReferrer() { + return $this->mRef; + } + + /** + * @return string|false + */ + function getText() { + if ( isset( self::$blobCache[$this->mOldId] ) ) { + $obj = self::$blobCache[$this->mOldId]; + } else { + $dbr = wfGetDB( DB_REPLICA ); + $row = $dbr->selectRow( + 'text', + [ 'old_flags', 'old_text' ], + [ 'old_id' => $this->mOldId ] + ); + + if ( !$row ) { + return false; + } + + $flags = explode( ',', $row->old_flags ); + if ( in_array( 'external', $flags ) ) { + $url = $row->old_text; + $parts = explode( '://', $url, 2 ); + if ( !isset( $parts[1] ) || $parts[1] == '' ) { + return false; + } + $row->old_text = ExternalStore::fetchFromURL( $url ); + + } + + if ( !in_array( 'object', $flags ) ) { + return false; + } + + if ( in_array( 'gzip', $flags ) ) { + // This shouldn't happen, but a bug in the compress script + // may at times gzip-compress a HistoryBlob object row. + $obj = unserialize( gzinflate( $row->old_text ) ); + } else { + $obj = unserialize( $row->old_text ); + } + + if ( !is_object( $obj ) ) { + // Correct for old double-serialization bug. + $obj = unserialize( $obj ); + } + + // Save this item for reference; if pulling many + // items in a row we'll likely use it again. + $obj->uncompress(); + self::$blobCache = [ $this->mOldId => $obj ]; + } + + return $obj->getItem( $this->mHash ); + } + + /** + * Get the content hash + * + * @return string + */ + function getHash() { + return $this->mHash; + } +} + +// phpcs:ignore Generic.CodeAnalysis.UnconditionalIfStatement.Found +if ( false ) { + // Blobs generated by MediaWiki < 1.5 on PHP 4 were serialized with the + // class name coerced to lowercase. We can improve efficiency by adding + // autoload entries for the lowercase variants of these classes (T166759). + // The code below is never executed, but it is picked up by the AutoloadGenerator + // parser, which scans for class_alias() calls. + class_alias( HistoryBlobStub::class, 'historyblobstub' ); +}