From: Tim Starling Date: Tue, 15 Jan 2019 03:15:18 +0000 (+1100) Subject: Better detection for old MS Office files X-Git-Tag: 1.34.0-rc.0~3027^2 X-Git-Url: http://git.cyclocoop.org/%22%20.%20generer_url_ecrire%28%22auteur_infos%22%2C%20%22id_auteur=%24id%22%29%20.%20%22?a=commitdiff_plain;h=f12db3804882272794b4dafc72db1c5d787375bf;p=lhc%2Fweb%2Fwiklou.git Better detection for old MS Office files * Introduce MSCompoundFileReader, which reads the CFB directory and detects the file type from well-known names in the root directory * Do not detect a ZIP file if the EOCDR is not at the end. Other containers, especially CFB files, may contain ZIP files embedded within them in the last 64KB, but this is not a security concern unless the EOCDR is exactly at the end of the file. Bug: T40432 Change-Id: Id5b1a258ccf3c3c8951e32f6b7a5b1bafe941082 --- diff --git a/autoload.php b/autoload.php index 91be2e71b9..4481cd74ad 100644 --- a/autoload.php +++ b/autoload.php @@ -816,6 +816,7 @@ $wgAutoloadLocalClasses = [ 'LonelyPagesPage' => __DIR__ . '/includes/specials/SpecialLonelypages.php', 'LongPagesPage' => __DIR__ . '/includes/specials/SpecialLongpages.php', 'MIMEsearchPage' => __DIR__ . '/includes/specials/SpecialMIMEsearch.php', + 'MSCompoundFileReader' => __DIR__ . '/includes/libs/mime/MSCompoundFileReader.php', 'MWCallableUpdate' => __DIR__ . '/includes/deferred/MWCallableUpdate.php', 'MWContentSerializationException' => __DIR__ . '/includes/exception/MWContentSerializationException.php', 'MWCryptHKDF' => __DIR__ . '/includes/utils/MWCryptHKDF.php', diff --git a/includes/libs/mime/MSCompoundFileReader.php b/includes/libs/mime/MSCompoundFileReader.php new file mode 100644 index 0000000000..51407a76e0 --- /dev/null +++ b/includes/libs/mime/MSCompoundFileReader.php @@ -0,0 +1,355 @@ + 'application/vnd.ms-excel', + '00020820-0000-0000-C000-000000000046' => 'application/vnd.ms-excel', + '00020906-0000-0000-C000-000000000046' => 'application/msword', + '64818D10-4F9B-11CF-86EA-00AA00B929E8' => 'application/vnd.ms-powerpoint', + ]; + + /** + * Read a file by name + * + * @param string $fileName The full path to the file + * @return array An associative array of information about the file: + * - valid: true if the file is valid, false otherwise + * - error: An error message in English, should be present if valid=false + * - errorCode: One of the self::ERROR_* constants + * - mime: The MIME type detected from the directory contents + * - mimeFromClsid: The MIME type detected from the CLSID on the root + * directory entry + */ + public static function readFile( $fileName ) { + $handle = fopen( $fileName, 'r' ); + if ( $handle === false ) { + return [ + 'valid' => false, + 'error' => 'file does not exist', + 'errorCode' => self::ERROR_FILE_OPEN + ]; + } + return self::readHandle( $handle ); + } + + /** + * Read from an open seekable handle + * + * @param resource $fileHandle The file handle + * @return array An associative array of information about the file: + * - valid: true if the file is valid, false otherwise + * - error: An error message in English, should be present if valid=false + * - errorCode: One of the self::ERROR_* constants + * - mime: The MIME type detected from the directory contents + * - mimeFromClsid: The MIME type detected from the CLSID on the root + * directory entry + */ + public static function readHandle( $fileHandle ) { + $reader = new self( $fileHandle ); + $info = [ + 'valid' => $reader->valid, + 'mime' => $reader->mime, + 'mimeFromClsid' => $reader->mimeFromClsid + ]; + if ( $reader->error ) { + $info['error'] = $reader->error; + $info['errorCode'] = $reader->errorCode; + } + return $info; + } + + private function __construct( $fileHandle ) { + $this->file = $fileHandle; + try { + $this->init(); + } catch ( RuntimeException $e ) { + $this->valid = false; + $this->error = $e->getMessage(); + $this->errorCode = $e->getCode(); + } + } + + private function init() { + $this->header = $this->unpackOffset( 0, [ + 'header_signature' => 8, + 'header_clsid' => 16, + 'minor_version' => 2, + 'major_version' => 2, + 'byte_order' => 2, + 'sector_shift' => 2, + 'mini_sector_shift' => 2, + 'reserved' => 6, + 'num_dir_sectors' => 4, + 'num_fat_sectors' => 4, + 'first_dir_sector' => 4, + 'transaction_signature_number' => 4, + 'mini_stream_cutoff_size' => 4, + 'first_mini_fat_sector' => 4, + 'num_mini_fat_sectors' => 4, + 'first_difat_sector' => 4, + 'num_difat_sectors' => 4, + 'difat' => 436, + ] ); + if ( $this->header['header_signature'] !== "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" ) { + $this->error( 'invalid signature: ' . bin2hex( $this->header['header_signature'] ), + self::ERROR_INVALID_SIGNATURE ); + } + $this->sectorLength = 1 << $this->header['sector_shift']; + $this->readDifat(); + $this->readDirectory(); + + $this->valid = true; + } + + private function sectorOffset( $sectorId ) { + return $this->sectorLength * ( $sectorId + 1 ); + } + + private function decodeClsid( $binaryClsid ) { + $parts = unpack( 'Va/vb/vc/C8d', $binaryClsid ); + return sprintf( "%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X", + $parts['a'], + $parts['b'], + $parts['c'], + $parts['d1'], + $parts['d2'], + $parts['d3'], + $parts['d4'], + $parts['d5'], + $parts['d6'], + $parts['d7'], + $parts['d8'] + ); + } + + private function unpackOffset( $offset, $struct ) { + $block = $this->readOffset( $offset, array_sum( $struct ) ); + return $this->unpack( $block, 0, $struct ); + } + + private function unpackSector( $sectorNumber, $struct ) { + $offset = $this->sectorOffset( $sectorNumber ); + return $this->unpackOffset( $offset, array_sum( $struct ) ); + } + + private function unpack( $block, $offset, $struct ) { + $data = []; + foreach ( $struct as $key => $length ) { + if ( $length > 4 ) { + $data[$key] = substr( $block, $offset, $length ); + } else { + $data[$key] = $this->bin2dec( $block, $offset, $length ); + } + $offset += $length; + } + return $data; + } + + private function bin2dec( $str, $offset, $length ) { + $value = 0; + for ( $i = $length - 1; $i >= 0; $i-- ) { + $value *= 256; + $value += ord( $str[$offset + $i] ); + } + return $value; + } + + private function readOffset( $offset, $length ) { + $this->fseek( $offset ); + Wikimedia\suppressWarnings(); + $block = fread( $this->file, $length ); + Wikimedia\restoreWarnings(); + if ( $block === false ) { + $this->error( 'error reading from file', self::ERROR_READ ); + } + if ( strlen( $block ) !== $length ) { + $this->error( 'unable to read the required number of bytes from the file', + self::ERROR_READ_PAST_END ); + } + return $block; + } + + private function readSector( $sectorId ) { + return $this->readOffset( $this->sectorOffset( $sectorId ), 1 << $this->header['sector_shift'] ); + } + + private function error( $message, $code ) { + throw new RuntimeException( $message, $code ); + } + + private function fseek( $offset ) { + Wikimedia\suppressWarnings(); + $result = fseek( $this->file, $offset ); + Wikimedia\restoreWarnings(); + if ( $result !== 0 ) { + $this->error( "unable to seek to offset $offset", self::ERROR_SEEK ); + } + } + + private function readDifat() { + $binaryDifat = $this->header['difat']; + $nextDifatSector = $this->header['first_difat_sector']; + for ( $i = 0; $i < $this->header['num_difat_sectors']; $i++ ) { + $block = $this->readSector( $nextDifatSector ); + $binaryDifat .= substr( $block, 0, $this->sectorLength - 4 ); + $nextDifatSector = $this->bin2dec( $block, $this->sectorLength - 4, 4 ); + if ( $nextDifatSector == 0xFFFFFFFE ) { + break; + } + } + + $this->difat = []; + for ( $pos = 0; $pos < strlen( $binaryDifat ); $pos += 4 ) { + $fatSector = $this->bin2dec( $binaryDifat, $pos, 4 ); + if ( $fatSector < 0xFFFFFFFC ) { + $this->difat[] = $fatSector; + } else { + break; + } + } + } + + private function getNextSectorIdFromFat( $sectorId ) { + $entriesPerSector = intdiv( $this->sectorLength, 4 ); + $fatSectorId = intdiv( $sectorId, $entriesPerSector ); + $fatSectorArray = $this->getFatSector( $fatSectorId ); + return $fatSectorArray[$sectorId % $entriesPerSector]; + } + + private function getFatSector( $fatSectorId ) { + if ( !isset( $this->fat[$fatSectorId] ) ) { + $fat = []; + if ( !isset( $this->difat[$fatSectorId] ) ) { + $this->error( 'FAT sector requested beyond the end of the DIFAT', self::ERROR_INVALID_FORMAT ); + } + $absoluteSectorId = $this->difat[$fatSectorId]; + $block = $this->readSector( $absoluteSectorId ); + for ( $pos = 0; $pos < strlen( $block ); $pos += 4 ) { + $fat[] = $this->bin2dec( $block, $pos, 4 ); + } + $this->fat[$fatSectorId] = $fat; + } + return $this->fat[$fatSectorId]; + } + + private function readDirectory() { + $dirSectorId = $this->header['first_dir_sector']; + $binaryDir = ''; + $seenSectorIds = []; + while ( $dirSectorId !== 0xFFFFFFFE ) { + if ( isset( $seenSectorIds[$dirSectorId] ) ) { + $this->error( 'FAT loop detected', self::ERROR_INVALID_FORMAT ); + } + $seenSectorIds[$dirSectorId] = true; + + $binaryDir .= $this->readSector( $dirSectorId ); + $dirSectorId = $this->getNextSectorIdFromFat( $dirSectorId ); + } + + $struct = [ + 'name_raw' => 64, + 'name_length' => 2, + 'object_type' => 1, + 'color' => 1, + 'sid_left' => 4, + 'sid_right' => 4, + 'sid_child' => 4, + 'clsid' => 16, + 'state_bits' => 4, + 'create_time_low' => 4, + 'create_time_high' => 4, + 'modify_time_low' => 4, + 'modify_time_high' => 4, + 'first_sector' => 4, + 'size_low' => 4, + 'size_high' => 4, + ]; + $entryLength = array_sum( $struct ); + + for ( $pos = 0; $pos < strlen( $binaryDir ); $pos += $entryLength ) { + $entry = $this->unpack( $binaryDir, $pos, $struct ); + + // According to [MS-CFB] size_high may contain garbage due to a + // bug in a writer, it's best to pretend it is zero + $entry['size_high'] = 0; + + $type = $entry['object_type']; + if ( $type == self::TYPE_UNALLOCATED ) { + continue; + } + + $name = iconv( 'UTF-16', 'UTF-8', substr( $entry['name_raw'], 0, $entry['name_length'] - 2 ) ); + + $clsid = $this->decodeClsid( $entry['clsid'] ); + if ( $type == self::TYPE_ROOT && isset( self::$mimesByClsid[$clsid] ) ) { + $this->mimeFromClsid = self::$mimesByClsid[$clsid]; + } + + if ( $name === 'Workbook' ) { + $this->mime = 'application/vnd.ms-excel'; + } elseif ( $name === 'WordDocument' ) { + $this->mime = 'application/msword'; + } elseif ( $name === 'PowerPoint Document' ) { + $this->mime = 'application/vnd.ms-powerpoint'; + } + } + } +} diff --git a/includes/libs/mime/MimeAnalyzer.php b/includes/libs/mime/MimeAnalyzer.php index b93084ad5c..a2075dc367 100644 --- a/includes/libs/mime/MimeAnalyzer.php +++ b/includes/libs/mime/MimeAnalyzer.php @@ -653,7 +653,6 @@ EOT; "Seeking $tailLength bytes from EOF failed in " . __METHOD__ ); } $tail = $tailLength ? fread( $f, $tailLength ) : ''; - fclose( $f ); $this->logger->info( __METHOD__ . ": analyzing head and tail of $file for magic numbers.\n" ); @@ -724,6 +723,12 @@ EOT; return "image/webp"; } + /* Look for MS Compound Binary (OLE) files */ + if ( strncmp( $head, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8 ) == 0 ) { + $this->logger->info( __METHOD__ . ': recognized MS CFB (OLE) file' ); + return $this->detectMicrosoftBinaryType( $f ); + } + /** * Look for PHP. Check for this before HTML/XML... Warning: this is a * heuristic, and won't match a file with a lot of non-PHP before. It @@ -797,9 +802,16 @@ EOT; } // Check for ZIP variants (before getimagesize) - if ( strpos( $tail, "PK\x05\x06" ) !== false ) { - $this->logger->info( __METHOD__ . ": ZIP header present in $file\n" ); - return $this->detectZipType( $head, $tail, $ext ); + $eocdrPos = strpos( $tail, "PK\x05\x06" ); + if ( $eocdrPos !== false ) { + $this->logger->info( __METHOD__ . ": ZIP signature present in $file\n" ); + // Check if it really is a ZIP file, make sure the EOCDR is at the end (T40432) + $commentLength = unpack( "n", $tail, $eocdrPos + 20 )[0]; + if ( $eocdrPos + 22 + $commentLength !== strlen( $tail ) ) { + $this->logger->info( __METHOD__ . ": ZIP EOCDR not at end. Not a ZIP file." ); + } else { + return $this->detectZipType( $head, $tail, $ext ); + } } // Check for STL (3D) files @@ -945,6 +957,26 @@ EOT; return $mime; } + /** + * Detect the type of a Microsoft Compound Binary a.k.a. OLE file. + * These are old style pre-ODF files such as .doc and .xls + * + * @param resource $handle An opened seekable file handle + * @return string The detected MIME type + */ + function detectMicrosoftBinaryType( $handle ) { + $info = MSCompoundFileReader::readHandle( $handle ); + if ( !$info['valid'] ) { + $this->logger->info( __METHOD__ . ': invalid file format' ); + return 'unknown/unknown'; + } + if ( !$info['mime'] ) { + $this->logger->info( __METHOD__ . ": unrecognised document subtype" ); + return 'unknown/unknown'; + } + return $info['mime']; + } + /** * Internal MIME type detection. Detection is done using the fileinfo * extension if it is available. It can be overriden by callback, which could diff --git a/includes/utils/ZipDirectoryReader.php b/includes/utils/ZipDirectoryReader.php index 46f1aaab47..e7846f43e2 100644 --- a/includes/utils/ZipDirectoryReader.php +++ b/includes/utils/ZipDirectoryReader.php @@ -228,7 +228,9 @@ class ZipDirectoryReader { $this->eocdr['EOCDR size'] = $structSize + $this->eocdr['file comment length']; if ( $structSize + $this->eocdr['file comment length'] != strlen( $block ) - $sigPos ) { - $this->error( 'zip-bad', 'trailing bytes after the end of the file comment' ); + // T40432: MS binary documents frequently embed ZIP files + $this->error( 'zip-wrong-format', 'there is a ZIP signature but it is not at ' . + 'the end of the file. It could be an OLE file with a ZIP file embedded.' ); } if ( $this->eocdr['disk'] !== 0 || $this->eocdr['CD start disk'] !== 0 diff --git a/tests/phpunit/data/MSCompoundFileReader/calc.xls b/tests/phpunit/data/MSCompoundFileReader/calc.xls new file mode 100644 index 0000000000..f15e2c1aed Binary files /dev/null and b/tests/phpunit/data/MSCompoundFileReader/calc.xls differ diff --git a/tests/phpunit/data/MSCompoundFileReader/dir-beyond-end.xls b/tests/phpunit/data/MSCompoundFileReader/dir-beyond-end.xls new file mode 100644 index 0000000000..b2aebf4427 Binary files /dev/null and b/tests/phpunit/data/MSCompoundFileReader/dir-beyond-end.xls differ diff --git a/tests/phpunit/data/MSCompoundFileReader/excel2016-compat97.xls b/tests/phpunit/data/MSCompoundFileReader/excel2016-compat97.xls new file mode 100755 index 0000000000..8895407d50 Binary files /dev/null and b/tests/phpunit/data/MSCompoundFileReader/excel2016-compat97.xls differ diff --git a/tests/phpunit/data/MSCompoundFileReader/fat-loop.xls b/tests/phpunit/data/MSCompoundFileReader/fat-loop.xls new file mode 100644 index 0000000000..ef8e1adbe2 Binary files /dev/null and b/tests/phpunit/data/MSCompoundFileReader/fat-loop.xls differ diff --git a/tests/phpunit/data/MSCompoundFileReader/gnumeric.xls b/tests/phpunit/data/MSCompoundFileReader/gnumeric.xls new file mode 100644 index 0000000000..4e4995d493 Binary files /dev/null and b/tests/phpunit/data/MSCompoundFileReader/gnumeric.xls differ diff --git a/tests/phpunit/data/MSCompoundFileReader/impress.ppt b/tests/phpunit/data/MSCompoundFileReader/impress.ppt new file mode 100644 index 0000000000..8e39c4b758 Binary files /dev/null and b/tests/phpunit/data/MSCompoundFileReader/impress.ppt differ diff --git a/tests/phpunit/data/MSCompoundFileReader/invalid-signature.xls b/tests/phpunit/data/MSCompoundFileReader/invalid-signature.xls new file mode 100644 index 0000000000..b8d9c2c50d Binary files /dev/null and b/tests/phpunit/data/MSCompoundFileReader/invalid-signature.xls differ diff --git a/tests/phpunit/data/MSCompoundFileReader/powerpoint2016-compat97.ppt b/tests/phpunit/data/MSCompoundFileReader/powerpoint2016-compat97.ppt new file mode 100755 index 0000000000..1fcafbba85 Binary files /dev/null and b/tests/phpunit/data/MSCompoundFileReader/powerpoint2016-compat97.ppt differ diff --git a/tests/phpunit/data/MSCompoundFileReader/word2016-compat97.doc b/tests/phpunit/data/MSCompoundFileReader/word2016-compat97.doc new file mode 100755 index 0000000000..edd25e9722 Binary files /dev/null and b/tests/phpunit/data/MSCompoundFileReader/word2016-compat97.doc differ diff --git a/tests/phpunit/data/MSCompoundFileReader/writer.doc b/tests/phpunit/data/MSCompoundFileReader/writer.doc new file mode 100644 index 0000000000..b51df91ce9 Binary files /dev/null and b/tests/phpunit/data/MSCompoundFileReader/writer.doc differ diff --git a/tests/phpunit/data/media/zip-in-doc.doc b/tests/phpunit/data/media/zip-in-doc.doc new file mode 100644 index 0000000000..0b4c574eb0 Binary files /dev/null and b/tests/phpunit/data/media/zip-in-doc.doc differ diff --git a/tests/phpunit/includes/libs/mime/MSCompoundFileReaderTest.php b/tests/phpunit/includes/libs/mime/MSCompoundFileReaderTest.php new file mode 100644 index 0000000000..4509a61eb7 --- /dev/null +++ b/tests/phpunit/includes/libs/mime/MSCompoundFileReaderTest.php @@ -0,0 +1,60 @@ +assertTrue( $info['valid'] ); + $this->assertSame( $expectedMime, $info['mime'] ); + } + + public static function provideInvalid() { + return [ + [ 'dir-beyond-end.xls', 'ERROR_READ_PAST_END' ], + [ 'fat-loop.xls', 'ERROR_INVALID_FORMAT' ], + [ 'invalid-signature.xls', 'ERROR_INVALID_SIGNATURE' ], + ]; + } + + /** @dataProvider provideInvalid */ + public function testReadFileInvalid( $fileName, $expectedError ) { + global $IP; + + $info = MSCompoundFileReader::readFile( "$IP/tests/phpunit/data/MSCompoundFileReader/$fileName" ); + $this->assertFalse( $info['valid'] ); + $this->assertSame( constant( MSCompoundFileReader::class . '::' . $expectedError ), + $info['errorCode'] ); + } +} diff --git a/tests/phpunit/includes/libs/mime/MimeAnalyzerTest.php b/tests/phpunit/includes/libs/mime/MimeAnalyzerTest.php index fbe5a2bada..194781207e 100644 --- a/tests/phpunit/includes/libs/mime/MimeAnalyzerTest.php +++ b/tests/phpunit/includes/libs/mime/MimeAnalyzerTest.php @@ -128,4 +128,13 @@ class MimeAnalyzerTest extends PHPUnit\Framework\TestCase { $actualType = $this->doGuessMimeType( [ $file, 'mp3' ] ); $this->assertEquals( 'audio/mpeg', $actualType ); } + + /** + * A ZIP file embedded in the middle of a .doc file is still a Word Document. + */ + function testZipInDoc() { + $file = __DIR__ . '/../../../data/media/zip-in-doc.doc'; + $actualType = $this->doGuessMimeType( [ $file, 'doc' ] ); + $this->assertEquals( 'application/msword', $actualType ); + } } diff --git a/tests/phpunit/includes/utils/ZipDirectoryReaderTest.php b/tests/phpunit/includes/utils/ZipDirectoryReaderTest.php index 9f18e5af35..a1a3fd7309 100644 --- a/tests/phpunit/includes/utils/ZipDirectoryReaderTest.php +++ b/tests/phpunit/includes/utils/ZipDirectoryReaderTest.php @@ -61,7 +61,8 @@ class ZipDirectoryReaderTest extends PHPUnit\Framework\TestCase { } public function testTrailingBytes() { - $this->readZipAssertError( 'trail.zip', 'zip-bad', + // Due to T40432 this is now zip-wrong-format instead of zip-bad + $this->readZipAssertError( 'trail.zip', 'zip-wrong-format', 'Trailing bytes error' ); }