3 * Copyright 2019 Wikimedia Foundation
5 * Licensed under the Apache License, Version 2.0 (the "License"); you may
6 * not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software distributed
12 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
13 * OF ANY KIND, either express or implied. See the License for the
14 * specific language governing permissions and limitations under the License.
18 * Read the directory of a Microsoft Compound File Binary file, a.k.a. an OLE
19 * file, and detect the MIME type.
22 * - MS-CFB https://msdn.microsoft.com/en-us/library/dd942138.aspx
23 * - MS-XLS https://msdn.microsoft.com/en-us/library/cc313154.aspx
24 * - MS-PPT https://msdn.microsoft.com/en-us/library/cc313106.aspx
25 * - MS-DOC https://msdn.microsoft.com/en-us/library/cc313153.aspx
26 * - Python olefile https://github.com/decalage2/olefile
27 * - OpenOffice.org's Documentation of the Microsoft Compound Document
28 * File Format https://www.openoffice.org/sc/compdocfileformat.pdf
32 class MSCompoundFileReader
{
36 private $mimeFromClsid;
39 private $valid = false;
41 private $sectorLength;
46 const TYPE_UNALLOCATED
= 0;
47 const TYPE_STORAGE
= 1;
48 const TYPE_STREAM
= 2;
51 const ERROR_FILE_OPEN
= 1;
54 const ERROR_INVALID_SIGNATURE
= 4;
55 const ERROR_READ_PAST_END
= 5;
56 const ERROR_INVALID_FORMAT
= 6;
58 private static $mimesByClsid = [
59 // From http://justsolve.archiveteam.org/wiki/Microsoft_Compound_File
60 '00020810-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
61 '00020820-0000-0000-C000-000000000046' => 'application/vnd.ms-excel',
62 '00020906-0000-0000-C000-000000000046' => 'application/msword',
63 '64818D10-4F9B-11CF-86EA-00AA00B929E8' => 'application/vnd.ms-powerpoint',
69 * @param string $fileName The full path to the file
70 * @return array An associative array of information about the file:
71 * - valid: true if the file is valid, false otherwise
72 * - error: An error message in English, should be present if valid=false
73 * - errorCode: One of the self::ERROR_* constants
74 * - mime: The MIME type detected from the directory contents
75 * - mimeFromClsid: The MIME type detected from the CLSID on the root
public static function readFile( $fileName ) {
	// Compound files are binary data, so request binary mode explicitly.
	$handle = fopen( $fileName, 'rb' );
	if ( $handle === false ) {
		return [
			'valid' => false,
			'error' => 'file does not exist',
			'errorCode' => self::ERROR_FILE_OPEN
		];
	}
	try {
		return self::readHandle( $handle );
	} finally {
		// This method opened the handle, so it is responsible for releasing
		// it, whether parsing succeeded or not.
		fclose( $handle );
	}
}
91 * Read from an open seekable handle
93 * @param resource $fileHandle The file handle
94 * @return array An associative array of information about the file:
95 * - valid: true if the file is valid, false otherwise
96 * - error: An error message in English, should be present if valid=false
97 * - errorCode: One of the self::ERROR_* constants
98 * - mime: The MIME type detected from the directory contents
99 * - mimeFromClsid: The MIME type detected from the CLSID on the root
public static function readHandle( $fileHandle ) {
	// Constructing the reader parses the file; the outcome is recorded on
	// the instance and copied into the result array below.
	$parsed = new self( $fileHandle );

	$result = [];
	$result['valid'] = $parsed->valid;
	$result['mime'] = $parsed->mime;
	$result['mimeFromClsid'] = $parsed->mimeFromClsid;

	// The error keys are only added when an error message was recorded.
	if ( !$parsed->error ) {
		return $result;
	}
	$result['error'] = $parsed->error;
	$result['errorCode'] = $parsed->errorCode;
	return $result;
}
/**
 * Parse the given handle, recording any failure on the instance instead of
 * letting the exception propagate to the caller.
 *
 * @param resource $fileHandle The file handle to read from
 */
private function __construct( $fileHandle ) {
	$this->file = $fileHandle;
	try {
		$this->init();
	} catch ( RuntimeException $failure ) {
		// Translate the exception into the state reported to callers.
		$this->valid = false;
		$this->errorCode = $failure->getCode();
		$this->error = $failure->getMessage();
	}
}
127 private function init() {
128 $this->header
= $this->unpackOffset( 0, [
129 'header_signature' => 8,
130 'header_clsid' => 16,
131 'minor_version' => 2,
132 'major_version' => 2,
135 'mini_sector_shift' => 2,
137 'num_dir_sectors' => 4,
138 'num_fat_sectors' => 4,
139 'first_dir_sector' => 4,
140 'transaction_signature_number' => 4,
141 'mini_stream_cutoff_size' => 4,
142 'first_mini_fat_sector' => 4,
143 'num_mini_fat_sectors' => 4,
144 'first_difat_sector' => 4,
145 'num_difat_sectors' => 4,
148 if ( $this->header
['header_signature'] !== "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" ) {
149 $this->error( 'invalid signature: ' . bin2hex( $this->header
['header_signature'] ),
150 self
::ERROR_INVALID_SIGNATURE
);
152 $this->sectorLength
= 1 << $this->header
['sector_shift'];
154 $this->readDirectory();
/**
 * Translate a sector ID into an absolute byte offset in the file.
 *
 * The ID is shifted up by one, so sector 0 begins one sector length into
 * the file.
 *
 * @param int $sectorId
 * @return int
 */
private function sectorOffset( $sectorId ) {
	$slot = $sectorId + 1;
	return $this->sectorLength * $slot;
}
/**
 * Convert a binary CLSID into the uppercase hexadecimal text form used for
 * the keys of self::$mimesByClsid.
 *
 * The first three fields are unpacked as little-endian integers (32, 16
 * and 16 bits); the remaining eight bytes are printed in the order given.
 *
 * @param string $binaryClsid
 * @return string
 */
private function decodeClsid( $binaryClsid ) {
	$fields = unpack( 'Va/vb/vc/C8d', $binaryClsid );

	// unpack() names the eight trailing bytes d1..d8.
	$values = [];
	foreach ( [ 'a', 'b', 'c', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8' ] as $field ) {
		$values[] = $fields[$field];
	}

	return vsprintf( "%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X", $values );
}
/**
 * Read a structure located at an absolute file offset and decode it.
 *
 * @param int $offset Byte offset of the structure
 * @param array $struct Map of field name to field length in bytes
 * @return array Decoded fields, as produced by unpack()
 */
private function unpackOffset( $offset, $struct ) {
	// The structure occupies the sum of its field lengths.
	$byteCount = array_sum( $struct );
	$raw = $this->readOffset( $offset, $byteCount );
	return $this->unpack( $raw, 0, $struct );
}
/**
 * Read a structure located at the start of a sector and decode it.
 *
 * @param int $sectorNumber
 * @param array $struct Map of field name to field length in bytes
 * @return array Decoded fields, as produced by unpackOffset()
 */
private function unpackSector( $sectorNumber, $struct ) {
	$offset = $this->sectorOffset( $sectorNumber );
	// Hand over the field map itself. unpackOffset() sums the lengths on its
	// own, so passing a pre-computed sum would make it call array_sum() on
	// an integer and leave it without the field names to decode.
	return $this->unpackOffset( $offset, $struct );
}
190 private function unpack( $block, $offset, $struct ) {
192 foreach ( $struct as $key => $length ) {
194 $data[$key] = substr( $block, $offset, $length );
196 $data[$key] = $this->bin2dec( $block, $offset, $length );
203 private function bin2dec( $str, $offset, $length ) {
205 for ( $i = $length - 1; $i >= 0; $i-- ) {
207 $value +
= ord( $str[$offset +
$i] );
/**
 * Read an exact number of bytes starting at an absolute file offset.
 *
 * Throws (via error()) when the read fails or returns a different number
 * of bytes than requested.
 *
 * @param int $offset
 * @param int $length
 * @return string
 */
private function readOffset( $offset, $length ) {
	$this->fseek( $offset );

	// Warnings from fread() are suppressed; failures are reported through
	// error() instead.
	Wikimedia\suppressWarnings();
	$bytes = fread( $this->file, $length );
	Wikimedia\restoreWarnings();

	if ( $bytes === false ) {
		$this->error( 'error reading from file', self::ERROR_READ );
	}

	$received = strlen( $bytes );
	if ( $received !== $length ) {
		$this->error(
			'unable to read the required number of bytes from the file',
			self::ERROR_READ_PAST_END
		);
	}

	return $bytes;
}
/**
 * Read the complete contents of one sector.
 *
 * @param int $sectorId
 * @return string
 */
private function readSector( $sectorId ) {
	$start = $this->sectorOffset( $sectorId );
	// The sector size is two to the power of the header's sector_shift.
	$byteCount = 1 << $this->header['sector_shift'];
	return $this->readOffset( $start, $byteCount );
}
/**
 * Abort parsing by throwing a RuntimeException.
 *
 * @param string $message
 * @param int $code Callers in this class pass one of the self::ERROR_* constants
 * @throws RuntimeException Always
 */
private function error( $message, $code ) {
	$exception = new RuntimeException( $message, $code );
	throw $exception;
}
/**
 * Move the file pointer to an absolute offset, throwing (via error()) if
 * the underlying fseek() call does not report success.
 *
 * @param int $offset
 */
private function fseek( $offset ) {
	// Warnings from fseek() are suppressed; failure is reported through
	// error() instead.
	Wikimedia\suppressWarnings();
	$status = fseek( $this->file, $offset );
	Wikimedia\restoreWarnings();

	// The built-in fseek() returns 0 on success.
	if ( $status === 0 ) {
		return;
	}
	$this->error( "unable to seek to offset $offset", self::ERROR_SEEK );
}
244 private function readDifat() {
245 $binaryDifat = $this->header
['difat'];
246 $nextDifatSector = $this->header
['first_difat_sector'];
247 for ( $i = 0; $i < $this->header
['num_difat_sectors']; $i++
) {
248 $block = $this->readSector( $nextDifatSector );
249 $binaryDifat .= substr( $block, 0, $this->sectorLength
- 4 );
250 $nextDifatSector = $this->bin2dec( $block, $this->sectorLength
- 4, 4 );
251 if ( $nextDifatSector == 0xFFFFFFFE ) {
257 for ( $pos = 0; $pos < strlen( $binaryDifat ); $pos +
= 4 ) {
258 $fatSector = $this->bin2dec( $binaryDifat, $pos, 4 );
259 if ( $fatSector < 0xFFFFFFFC ) {
260 $this->difat
[] = $fatSector;
/**
 * Look up the FAT entry for a sector, i.e. the ID of the sector that
 * follows it in its chain.
 *
 * @param int $sectorId
 * @return int
 */
private function getNextSectorIdFromFat( $sectorId ) {
	// Each FAT entry is four bytes long.
	$entriesPerSector = intdiv( $this->sectorLength, 4 );
	if ( $entriesPerSector < 1 ) {
		// The sector length is derived from a header field in the file. If it
		// is under four bytes, the divisions below would throw a
		// DivisionByZeroError, which is not a RuntimeException and so would
		// not be converted into an error result by the constructor.
		$this->error( 'sector length is too small to hold a FAT entry',
			self::ERROR_INVALID_FORMAT );
	}
	$fatSectorId = intdiv( $sectorId, $entriesPerSector );
	$fatSectorArray = $this->getFatSector( $fatSectorId );
	return $fatSectorArray[$sectorId % $entriesPerSector];
}
/**
 * Get the decoded entries of one FAT sector, reading and caching the
 * sector on first use.
 *
 * @param int $fatSectorId Index into the DIFAT
 * @return int[] The four-byte entries of the sector, decoded by bin2dec()
 */
private function getFatSector( $fatSectorId ) {
	// Serve repeat requests from the cache.
	if ( isset( $this->fat[$fatSectorId] ) ) {
		return $this->fat[$fatSectorId];
	}

	if ( !isset( $this->difat[$fatSectorId] ) ) {
		$this->error( 'FAT sector requested beyond the end of the DIFAT',
			self::ERROR_INVALID_FORMAT );
	}

	// The DIFAT maps the FAT sector index to its sector ID in the file.
	$raw = $this->readSector( $this->difat[$fatSectorId] );

	$entries = [];
	$rawLength = strlen( $raw );
	for ( $cursor = 0; $cursor < $rawLength; $cursor += 4 ) {
		$entries[] = $this->bin2dec( $raw, $cursor, 4 );
	}

	$this->fat[$fatSectorId] = $entries;
	return $entries;
}
290 private function readDirectory() {
291 $dirSectorId = $this->header
['first_dir_sector'];
294 while ( $dirSectorId !== 0xFFFFFFFE ) {
295 if ( isset( $seenSectorIds[$dirSectorId] ) ) {
296 $this->error( 'FAT loop detected', self
::ERROR_INVALID_FORMAT
);
298 $seenSectorIds[$dirSectorId] = true;
300 $binaryDir .= $this->readSector( $dirSectorId );
301 $dirSectorId = $this->getNextSectorIdFromFat( $dirSectorId );
314 'create_time_low' => 4,
315 'create_time_high' => 4,
316 'modify_time_low' => 4,
317 'modify_time_high' => 4,
322 $entryLength = array_sum( $struct );
324 for ( $pos = 0; $pos < strlen( $binaryDir ); $pos +
= $entryLength ) {
325 $entry = $this->unpack( $binaryDir, $pos, $struct );
327 // According to [MS-CFB] size_high may contain garbage due to a
328 // bug in a writer, it's best to pretend it is zero
329 $entry['size_high'] = 0;
331 $type = $entry['object_type'];
332 if ( $type == self
::TYPE_UNALLOCATED
) {
336 $name = iconv( 'UTF-16', 'UTF-8', substr( $entry['name_raw'], 0, $entry['name_length'] - 2 ) );
338 $clsid = $this->decodeClsid( $entry['clsid'] );
339 if ( $type == self
::TYPE_ROOT
&& isset( self
::$mimesByClsid[$clsid] ) ) {
340 $this->mimeFromClsid
= self
::$mimesByClsid[$clsid];
343 if ( $name === 'Workbook' ) {
344 $this->mime
= 'application/vnd.ms-excel';
345 } elseif ( $name === 'WordDocument' ) {
346 $this->mime
= 'application/msword';
347 } elseif ( $name === 'PowerPoint Document' ) {
348 $this->mime
= 'application/vnd.ms-powerpoint';