-<?php\r
-\r
-/**\r
- * This class simulates Microsoft Internet Explorer's terribly broken and \r
- * insecure MIME type detection algorithm. It can be used to check web uploads\r
- * with an apparently safe type, to see if IE will reinterpret them to produce \r
- * something dangerous.\r
- *\r
- * It is full of bugs and strange design choices should not under any \r
- * circumstances be used to determine a MIME type to present to a user or \r
- * client. (Apple Safari developers, this means you too.)\r
- *\r
- * This class is based on a disassembly of IE 5.0, 6.0 and 7.0. Although I have \r
- * attempted to ensure that this code works in exactly the same way as Internet \r
- * Explorer, it does not share any source code, or creative choices such as \r
- * variable names, thus I (Tim Starling) claim copyright on it. \r
- *\r
- * It may be redistributed without restriction. To aid reuse, this class does\r
- * not depend on any MediaWiki module.\r
- */\r
-class IEContentAnalyzer {\r
- /**\r
- * Relevant data taken from the type table in IE 5\r
- */\r
- protected $baseTypeTable = array(\r
- 'ambiguous' /*1*/ => array(\r
- 'text/plain', \r
- 'application/octet-stream', \r
- 'application/x-netcdf', // [sic]\r
- ),\r
- 'text' /*3*/ => array(\r
- 'text/richtext', 'image/x-bitmap', 'application/postscript', 'application/base64',\r
- 'application/macbinhex40', 'application/x-cdf', 'text/scriptlet'\r
- ),\r
- 'binary' /*4*/ => array(\r
- 'application/pdf', 'audio/x-aiff', 'audio/basic', 'audio/wav', 'image/gif',\r
- 'image/pjpeg', 'image/jpeg', 'image/tiff', 'image/x-png', 'image/png', 'image/bmp', \r
- 'image/x-jg', 'image/x-art', 'image/x-emf', 'image/x-wmf', 'video/avi', \r
- 'video/x-msvideo', 'video/mpeg', 'application/x-compressed',\r
- 'application/x-zip-compressed', 'application/x-gzip-compressed', 'application/java',\r
- 'application/x-msdownload'\r
- ),\r
- 'html' /*5*/ => array( 'text/html' ),\r
- );\r
-\r
- /**\r
- * Changes to the type table in later versions of IE\r
- */\r
- protected $addedTypes = array(\r
- 'ie07' => array(\r
- 'text' => array( 'text/xml', 'application/xml' )\r
- ),\r
- );\r
-\r
- /**\r
- * An approximation of the "Content Type" values in HKEY_CLASSES_ROOT in a\r
- * typical Windows installation.\r
- *\r
- * Used for extension to MIME type mapping if detection fails.\r
- */\r
- protected $registry = array(\r
- '.323' => 'text/h323',\r
- '.3g2' => 'video/3gpp2',\r
- '.3gp' => 'video/3gpp',\r
- '.3gp2' => 'video/3gpp2',\r
- '.3gpp' => 'video/3gpp',\r
- '.aac' => 'audio/aac',\r
- '.ac3' => 'audio/ac3',\r
- '.accda' => 'application/msaccess',\r
- '.accdb' => 'application/msaccess',\r
- '.accdc' => 'application/msaccess',\r
- '.accde' => 'application/msaccess',\r
- '.accdr' => 'application/msaccess',\r
- '.accdt' => 'application/msaccess',\r
- '.ade' => 'application/msaccess',\r
- '.adp' => 'application/msaccess',\r
- '.adts' => 'audio/aac',\r
- '.ai' => 'application/postscript',\r
- '.aif' => 'audio/aiff',\r
- '.aifc' => 'audio/aiff',\r
- '.aiff' => 'audio/aiff',\r
- '.amc' => 'application/x-mpeg',\r
- '.application' => 'application/x-ms-application',\r
- '.asf' => 'video/x-ms-asf',\r
- '.asx' => 'video/x-ms-asf',\r
- '.au' => 'audio/basic',\r
- '.avi' => 'video/avi',\r
- '.bmp' => 'image/bmp',\r
- '.caf' => 'audio/x-caf',\r
- '.cat' => 'application/vnd.ms-pki.seccat',\r
- '.cbo' => 'application/sha',\r
- '.cdda' => 'audio/aiff',\r
- '.cer' => 'application/x-x509-ca-cert',\r
- '.conf' => 'text/plain',\r
- '.crl' => 'application/pkix-crl',\r
- '.crt' => 'application/x-x509-ca-cert',\r
- '.css' => 'text/css',\r
- '.csv' => 'application/vnd.ms-excel',\r
- '.der' => 'application/x-x509-ca-cert',\r
- '.dib' => 'image/bmp',\r
- '.dif' => 'video/x-dv',\r
- '.dll' => 'application/x-msdownload',\r
- '.doc' => 'application/msword',\r
- '.docm' => 'application/vnd.ms-word.document.macroEnabled.12',\r
- '.docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',\r
- '.dot' => 'application/msword',\r
- '.dotm' => 'application/vnd.ms-word.template.macroEnabled.12',\r
- '.dotx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.template',\r
- '.dv' => 'video/x-dv',\r
- '.dwfx' => 'model/vnd.dwfx+xps',\r
- '.edn' => 'application/vnd.adobe.edn',\r
- '.eml' => 'message/rfc822',\r
- '.eps' => 'application/postscript',\r
- '.etd' => 'application/x-ebx',\r
- '.exe' => 'application/x-msdownload',\r
- '.fdf' => 'application/vnd.fdf',\r
- '.fif' => 'application/fractals',\r
- '.gif' => 'image/gif',\r
- '.gsm' => 'audio/x-gsm',\r
- '.hqx' => 'application/mac-binhex40',\r
- '.hta' => 'application/hta',\r
- '.htc' => 'text/x-component',\r
- '.htm' => 'text/html',\r
- '.html' => 'text/html',\r
- '.htt' => 'text/webviewhtml',\r
- '.hxa' => 'application/xml',\r
- '.hxc' => 'application/xml',\r
- '.hxd' => 'application/octet-stream',\r
- '.hxe' => 'application/xml',\r
- '.hxf' => 'application/xml',\r
- '.hxh' => 'application/octet-stream',\r
- '.hxi' => 'application/octet-stream',\r
- '.hxk' => 'application/xml',\r
- '.hxq' => 'application/octet-stream',\r
- '.hxr' => 'application/octet-stream',\r
- '.hxs' => 'application/octet-stream',\r
- '.hxt' => 'application/xml',\r
- '.hxv' => 'application/xml',\r
- '.hxw' => 'application/octet-stream',\r
- '.ico' => 'image/x-icon',\r
- '.iii' => 'application/x-iphone',\r
- '.ins' => 'application/x-internet-signup',\r
- '.iqy' => 'text/x-ms-iqy',\r
- '.isp' => 'application/x-internet-signup',\r
- '.jfif' => 'image/jpeg',\r
- '.jnlp' => 'application/x-java-jnlp-file',\r
- '.jpe' => 'image/jpeg',\r
- '.jpeg' => 'image/jpeg',\r
- '.jpg' => 'image/jpeg',\r
- '.jtx' => 'application/x-jtx+xps',\r
- '.latex' => 'application/x-latex',\r
- '.log' => 'text/plain',\r
- '.m1v' => 'video/mpeg',\r
- '.m2v' => 'video/mpeg',\r
- '.m3u' => 'audio/x-mpegurl',\r
- '.mac' => 'image/x-macpaint',\r
- '.man' => 'application/x-troff-man',\r
- '.mda' => 'application/msaccess',\r
- '.mdb' => 'application/msaccess',\r
- '.mde' => 'application/msaccess',\r
- '.mfp' => 'application/x-shockwave-flash',\r
- '.mht' => 'message/rfc822',\r
- '.mhtml' => 'message/rfc822',\r
- '.mid' => 'audio/mid',\r
- '.midi' => 'audio/mid',\r
- '.mod' => 'video/mpeg',\r
- '.mov' => 'video/quicktime',\r
- '.mp2' => 'video/mpeg',\r
- '.mp2v' => 'video/mpeg',\r
- '.mp3' => 'audio/mpeg',\r
- '.mp4' => 'video/mp4',\r
- '.mpa' => 'video/mpeg',\r
- '.mpe' => 'video/mpeg',\r
- '.mpeg' => 'video/mpeg',\r
- '.mpf' => 'application/vnd.ms-mediapackage',\r
- '.mpg' => 'video/mpeg',\r
- '.mpv2' => 'video/mpeg',\r
- '.mqv' => 'video/quicktime',\r
- '.NMW' => 'application/nmwb',\r
- '.nws' => 'message/rfc822',\r
- '.odc' => 'text/x-ms-odc',\r
- '.ols' => 'application/vnd.ms-publisher',\r
- '.p10' => 'application/pkcs10',\r
- '.p12' => 'application/x-pkcs12',\r
- '.p7b' => 'application/x-pkcs7-certificates',\r
- '.p7c' => 'application/pkcs7-mime',\r
- '.p7m' => 'application/pkcs7-mime',\r
- '.p7r' => 'application/x-pkcs7-certreqresp',\r
- '.p7s' => 'application/pkcs7-signature',\r
- '.pct' => 'image/pict',\r
- '.pdf' => 'application/pdf',\r
- '.pdx' => 'application/vnd.adobe.pdx',\r
- '.pfx' => 'application/x-pkcs12',\r
- '.pic' => 'image/pict',\r
- '.pict' => 'image/pict',\r
- '.pinstall' => 'application/x-picasa-detect',\r
- '.pko' => 'application/vnd.ms-pki.pko',\r
- '.png' => 'image/png',\r
- '.pnt' => 'image/x-macpaint',\r
- '.pntg' => 'image/x-macpaint',\r
- '.pot' => 'application/vnd.ms-powerpoint',\r
- '.potm' => 'application/vnd.ms-powerpoint.template.macroEnabled.12',\r
- '.potx' => 'application/vnd.openxmlformats-officedocument.presentationml.template',\r
- '.ppa' => 'application/vnd.ms-powerpoint',\r
- '.ppam' => 'application/vnd.ms-powerpoint.addin.macroEnabled.12',\r
- '.pps' => 'application/vnd.ms-powerpoint',\r
- '.ppsm' => 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12',\r
- '.ppsx' => 'application/vnd.openxmlformats-officedocument.presentationml.slideshow',\r
- '.ppt' => 'application/vnd.ms-powerpoint',\r
- '.pptm' => 'application/vnd.ms-powerpoint.presentation.macroEnabled.12',\r
- '.pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',\r
- '.prf' => 'application/pics-rules',\r
- '.ps' => 'application/postscript',\r
- '.pub' => 'application/vnd.ms-publisher',\r
- '.pwz' => 'application/vnd.ms-powerpoint',\r
- '.py' => 'text/plain',\r
- '.pyw' => 'text/plain',\r
- '.qht' => 'text/x-html-insertion',\r
- '.qhtm' => 'text/x-html-insertion',\r
- '.qt' => 'video/quicktime',\r
- '.qti' => 'image/x-quicktime',\r
- '.qtif' => 'image/x-quicktime',\r
- '.qtl' => 'application/x-quicktimeplayer',\r
- '.rat' => 'application/rat-file',\r
- '.rmf' => 'application/vnd.adobe.rmf',\r
- '.rmi' => 'audio/mid',\r
- '.rqy' => 'text/x-ms-rqy',\r
- '.rtf' => 'application/msword',\r
- '.sct' => 'text/scriptlet',\r
- '.sd2' => 'audio/x-sd2',\r
- '.sdp' => 'application/sdp',\r
- '.shtml' => 'text/html',\r
- '.sit' => 'application/x-stuffit',\r
- '.sldm' => 'application/vnd.ms-powerpoint.slide.macroEnabled.12',\r
- '.sldx' => 'application/vnd.openxmlformats-officedocument.presentationml.slide',\r
- '.slk' => 'application/vnd.ms-excel',\r
- '.snd' => 'audio/basic',\r
- '.so' => 'application/x-apachemodule',\r
- '.sol' => 'text/plain',\r
- '.sor' => 'text/plain',\r
- '.spc' => 'application/x-pkcs7-certificates',\r
- '.spl' => 'application/futuresplash',\r
- '.sst' => 'application/vnd.ms-pki.certstore',\r
- '.stl' => 'application/vnd.ms-pki.stl',\r
- '.swf' => 'application/x-shockwave-flash',\r
- '.thmx' => 'application/vnd.ms-officetheme',\r
- '.tif' => 'image/tiff',\r
- '.tiff' => 'image/tiff',\r
- '.txt' => 'text/plain',\r
- '.uls' => 'text/iuls',\r
- '.vcf' => 'text/x-vcard',\r
- '.vdx' => 'application/vnd.ms-visio.viewer',\r
- '.vsd' => 'application/vnd.ms-visio.viewer',\r
- '.vss' => 'application/vnd.ms-visio.viewer',\r
- '.vst' => 'application/vnd.ms-visio.viewer',\r
- '.vsx' => 'application/vnd.ms-visio.viewer',\r
- '.vtx' => 'application/vnd.ms-visio.viewer',\r
- '.wav' => 'audio/wav',\r
- '.wax' => 'audio/x-ms-wax',\r
- '.wbk' => 'application/msword',\r
- '.wdp' => 'image/vnd.ms-photo',\r
- '.wiz' => 'application/msword',\r
- '.wm' => 'video/x-ms-wm',\r
- '.wma' => 'audio/x-ms-wma',\r
- '.wmd' => 'application/x-ms-wmd',\r
- '.wmv' => 'video/x-ms-wmv',\r
- '.wmx' => 'video/x-ms-wmx',\r
- '.wmz' => 'application/x-ms-wmz',\r
- '.wpl' => 'application/vnd.ms-wpl',\r
- '.wsc' => 'text/scriptlet',\r
- '.wvx' => 'video/x-ms-wvx',\r
- '.xaml' => 'application/xaml+xml',\r
- '.xbap' => 'application/x-ms-xbap',\r
- '.xdp' => 'application/vnd.adobe.xdp+xml',\r
- '.xfdf' => 'application/vnd.adobe.xfdf',\r
- '.xht' => 'application/xhtml+xml',\r
- '.xhtml' => 'application/xhtml+xml',\r
- '.xla' => 'application/vnd.ms-excel',\r
- '.xlam' => 'application/vnd.ms-excel.addin.macroEnabled.12',\r
- '.xlk' => 'application/vnd.ms-excel',\r
- '.xll' => 'application/vnd.ms-excel',\r
- '.xlm' => 'application/vnd.ms-excel',\r
- '.xls' => 'application/vnd.ms-excel',\r
- '.xlsb' => 'application/vnd.ms-excel.sheet.binary.macroEnabled.12',\r
- '.xlsm' => 'application/vnd.ms-excel.sheet.macroEnabled.12',\r
- '.xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',\r
- '.xlt' => 'application/vnd.ms-excel',\r
- '.xltm' => 'application/vnd.ms-excel.template.macroEnabled.12',\r
- '.xltx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.template',\r
- '.xlw' => 'application/vnd.ms-excel',\r
- '.xml' => 'text/xml',\r
- '.xps' => 'application/vnd.ms-xpsdocument',\r
- '.xsl' => 'text/xml',\r
- );\r
-\r
- /** \r
- * IE versions which have been analysed to bring you this class, and for \r
- * which some substantive difference exists. These will appear as keys \r
- * in the return value of getRealMimesFromData(). The names are chosen to sort correctly.\r
- */\r
- protected $versions = array( 'ie05', 'ie06', 'ie07', 'ie07.strict', 'ie07.nohtml' );\r
-\r
- /**\r
- * Type table with versions expanded \r
- */\r
- protected $typeTable = array();\r
-\r
- /** constructor */\r
- function __construct() {\r
- // Construct versioned type arrays from the base type array plus additions \r
- $types = $this->baseTypeTable;\r
- foreach ( $this->versions as $version ) {\r
- if ( isset( $this->addedTypes[$version] ) ) {\r
- foreach ( $this->addedTypes[$version] as $format => $addedTypes ) {\r
- $types[$format] = array_merge( $types[$format], $addedTypes );\r
- }\r
- }\r
- $this->typeTable[$version] = $types;\r
- }\r
- }\r
-\r
- /**\r
- * Get the MIME types from getMimesFromData(), but convert the result from IE's \r
- * idiosyncratic private types into something other apps will understand.\r
- *\r
- * @param string $fileName The file name (unused at present)\r
- * @param string $chunk The first 256 bytes of the file\r
- * @param string $proposed The MIME type proposed by the server\r
- *\r
- * @return array Map of IE version to detected mime type\r
- */\r
- public function getRealMimesFromData( $fileName, $chunk, $proposed ) {\r
- $types = $this->getMimesFromData( $fileName, $chunk, $proposed );\r
- $types = array_map( array( $this, 'translateMimeType' ), $types );\r
- return $types;\r
- }\r
-\r
- /**\r
- * Translate a MIME type from IE's idiosyncratic private types into\r
- * more commonly understood type strings\r
- */\r
- public function translateMimeType( $type ) {\r
- static $table = array(\r
- 'image/pjpeg' => 'image/jpeg',\r
- 'image/x-png' => 'image/png',\r
- 'image/x-wmf' => 'application/x-msmetafile',\r
- 'image/bmp' => 'image/x-bmp',\r
- 'application/x-zip-compressed' => 'application/zip',\r
- 'application/x-compressed' => 'application/x-compress',\r
- 'application/x-gzip-compressed' => 'application/x-gzip',\r
- 'audio/mid' => 'audio/midi',\r
- );\r
- if ( isset( $table[$type] ) ) {\r
- $type = $table[$type];\r
- }\r
- return $type;\r
- }\r
-\r
- /**\r
- * Get the untranslated MIME types for all known versions\r
- *\r
- * @param string $fileName The file name (unused at present)\r
- * @param string $chunk The first 256 bytes of the file\r
- * @param string $proposed The MIME type proposed by the server\r
- *\r
- * @return array Map of IE version to detected mime type\r
- */\r
- public function getMimesFromData( $fileName, $chunk, $proposed ) {\r
- $types = array();\r
- foreach ( $this->versions as $version ) {\r
- $types[$version] = $this->getMimeTypeForVersion( $version, $fileName, $chunk, $proposed );\r
- }\r
- return $types;\r
- }\r
-\r
- /**\r
- * Get the MIME type for a given named version\r
- */\r
- protected function getMimeTypeForVersion( $version, $fileName, $chunk, $proposed ) {\r
- // Strip text after a semicolon\r
- $semiPos = strpos( $proposed, ';' );\r
- if ( $semiPos !== false ) {\r
- $proposed = substr( $proposed, 0, $semiPos );\r
- }\r
-\r
- $proposedFormat = $this->getDataFormat( $version, $proposed );\r
- if ( $proposedFormat == 'unknown'\r
- && $proposed != 'multipart/mixed'\r
- && $proposed != 'multipart/x-mixed-replace' )\r
- {\r
- return $proposed;\r
- }\r
- if ( strval( $chunk ) === '' ) {\r
- return $proposed;\r
- }\r
-\r
- // Truncate chunk at 255 bytes\r
- $chunk = substr( $chunk, 0, 255 );\r
-\r
- // IE does the Check*Headers() calls last, and instead does the following image \r
- // type checks by directly looking for the magic numbers. What I do here should \r
- // have the same effect since the magic number checks are identical in both cases.\r
- $result = $this->sampleData( $version, $chunk );\r
- $sampleFound = $result['found'];\r
- $counters = $result['counters'];\r
- $binaryType = $this->checkBinaryHeaders( $version, $chunk );\r
- $textType = $this->checkTextHeaders( $version, $chunk );\r
-\r
- if ( $proposed == 'text/html' && isset( $sampleFound['html'] ) ) {\r
- return 'text/html';\r
- }\r
- if ( $proposed == 'image/gif' && $binaryType == 'image/gif' ) {\r
- return 'image/gif';\r
- }\r
- if ( ( $proposed == 'image/pjpeg' || $proposed == 'image/jpeg' )\r
- && $binaryType == 'image/pjpeg' ) \r
- {\r
- return $proposed;\r
- }\r
- // PNG check added in IE 7\r
- if ( $version >= 'ie07'\r
- && ( $proposed == 'image/x-png' || $proposed == 'image/png' )\r
- && $binaryType == 'image/x-png' )\r
- {\r
- return $proposed;\r
- }\r
-\r
- // CDF was removed in IE 7 so it won't be in $sampleFound for later versions\r
- if ( isset( $sampleFound['cdf'] ) ) {\r
- return 'application/x-cdf';\r
- }\r
-\r
- // RSS and Atom were added in IE 7 so they won't be in $sampleFound for \r
- // previous versions\r
- if ( isset( $sampleFound['rss'] ) ) {\r
- return 'application/rss+xml';\r
- }\r
- if ( isset( $sampleFound['rdf-tag'] )\r
- && isset( $sampleFound['rdf-url'] )\r
- && isset( $sampleFound['rdf-purl'] ) )\r
- {\r
- return 'application/rss+xml';\r
- }\r
- if ( isset( $sampleFound['atom'] ) ) {\r
- return 'application/atom+xml';\r
- }\r
-\r
- if ( isset( $sampleFound['xml'] ) ) {\r
- // TODO: I'm not sure under what circumstances this flag is enabled\r
- if ( strpos( $version, 'strict' ) !== false ) {\r
- if ( $proposed == 'text/html' || $proposed == 'text/xml' ) {\r
- return 'text/xml';\r
- }\r
- } else {\r
- return 'text/xml';\r
- }\r
- }\r
- if ( isset( $sampleFound['html'] ) ) {\r
- // TODO: I'm not sure under what circumstances this flag is enabled\r
- if ( strpos( $version, 'nohtml' ) !== false ) {\r
- if ( $proposed == 'text/plain' ) {\r
- return 'text/html';\r
- }\r
- } else {\r
- return 'text/html';\r
- }\r
- }\r
- if ( isset( $sampleFound['xbm'] ) ) {\r
- return 'image/x-bitmap';\r
- }\r
- if ( isset( $sampleFound['binhex'] ) ) {\r
- return 'application/macbinhex40';\r
- }\r
- if ( isset( $sampleFound['scriptlet'] ) ) {\r
- if ( strpos( $version, 'strict' ) !== false ) {\r
- if ( $proposed == 'text/plain' || $proposed == 'text/scriptlet' ) {\r
- return 'text/scriptlet';\r
- }\r
- } else {\r
- return 'text/scriptlet';\r
- }\r
- }\r
-\r
- // Freaky heuristics to determine if the data is text or binary\r
- // The heuristic is of course broken for non-ASCII text\r
- if ( $counters['ctrl'] != 0 && ( $counters['ff'] + $counters['low'] ) \r
- < ( $counters['ctrl'] + $counters['high'] ) * 16 ) \r
- {\r
- $kindOfBinary = true;\r
- $type = $binaryType ? $binaryType : $textType;\r
- if ( $type === false ) {\r
- $type = 'application/octet-stream';\r
- }\r
- } else {\r
- $kindOfBinary = false;\r
- $type = $textType ? $textType : $binaryType;\r
- if ( $type === false ) {\r
- $type = 'text/plain';\r
- }\r
- }\r
-\r
- // Check if the output format is ambiguous\r
- // This generally means that detection failed, real types aren't ambiguous\r
- $detectedFormat = $this->getDataFormat( $version, $type );\r
- if ( $detectedFormat != 'ambiguous' ) {\r
- return $type;\r
- }\r
-\r
- if ( $proposedFormat != 'ambiguous' ) {\r
- // FormatAgreesWithData()\r
- if ( $proposedFormat == 'text' && !$kindOfBinary ) {\r
- return $proposed;\r
- }\r
- if ( $proposedFormat == 'binary' && $kindOfBinary ) {\r
- return $proposed;\r
- }\r
- if ( $proposedFormat == 'html' ) {\r
- return $proposed;\r
- }\r
- }\r
-\r
- // Find a MIME type by searching the registry for the file extension.\r
- $dotPos = strrpos( $fileName, '.' );\r
- if ( $dotPos === false ) {\r
- return $type;\r
- }\r
- $ext = substr( $fileName, $dotPos );\r
- if ( isset( $this->registry[$ext] ) ) {\r
- return $this->registry[$ext];\r
- }\r
-\r
- // TODO: If the extension has an application registered to it, IE will return \r
- // application/octet-stream. We'll skip that, so we could erroneously \r
- // return text/plain or application/x-netcdf where application/octet-stream\r
- // would be correct.\r
-\r
- return $type;\r
- }\r
-\r
- /**\r
- * Check for text headers at the start of the chunk\r
- * Confirmed same in 5 and 7.\r
- */\r
- private function checkTextHeaders( $version, $chunk ) {\r
- $chunk2 = substr( $chunk, 0, 2 );\r
- $chunk4 = substr( $chunk, 0, 4 );\r
- $chunk5 = substr( $chunk, 0, 5 );\r
- if ( $chunk4 == '%PDF' ) {\r
- return 'application/pdf';\r
- }\r
- if ( $chunk2 == '%!' ) {\r
- return 'application/postscript';\r
- }\r
- if ( $chunk5 == '{\\rtf' ) {\r
- return 'text/richtext';\r
- }\r
- if ( $chunk5 == 'begin' ) {\r
- return 'application/base64';\r
- }\r
- return false;\r
- }\r
-\r
- /**\r
- * Check for binary headers at the start of the chunk\r
- * Confirmed same in 5 and 7.\r
- */\r
- private function checkBinaryHeaders( $version, $chunk ) {\r
- $chunk2 = substr( $chunk, 0, 2 );\r
- $chunk3 = substr( $chunk, 0, 3 );\r
- $chunk4 = substr( $chunk, 0, 4 );\r
- $chunk5 = substr( $chunk, 0, 5 );\r
- $chunk8 = substr( $chunk, 0, 8 );\r
- if ( $chunk5 == 'GIF87' || $chunk5 == 'GIF89' ) {\r
- return 'image/gif';\r
- }\r
- if ( $chunk2 == "\xff\xd8" ) {\r
- return 'image/pjpeg'; // actually plain JPEG but this is what IE returns\r
- }\r
-\r
- if ( $chunk2 == 'BM' \r
- && substr( $chunk, 6, 2 ) == "\000\000"\r
- && substr( $chunk, 8, 2 ) != "\000\000" )\r
- {\r
- return 'image/bmp'; // another non-standard MIME\r
- }\r
- if ( $chunk4 == 'RIFF' \r
- && substr( $chunk, 8, 4 ) == 'WAVE' )\r
- {\r
- return 'audio/wav';\r
- }\r
- // These were integer literals in IE\r
- // Perhaps the author was not sure what the target endianness was\r
- if ( $chunk4 == ".sd\000"\r
- || $chunk4 == ".snd"\r
- || $chunk4 == "\000ds."\r
- || $chunk4 == "dns." )\r
- {\r
- return 'audio/basic';\r
- }\r
- if ( $chunk3 == "MM\000" ) {\r
- return 'image/tiff';\r
- }\r
- if ( $chunk2 == 'MZ' ) {\r
- return 'application/x-msdownload';\r
- }\r
- if ( $chunk8 == "\x89PNG\x0d\x0a\x1a\x0a" ) {\r
- return 'image/x-png'; // [sic]\r
- }\r
- if ( strlen( $chunk ) >= 5 ) {\r
- $byte2 = ord( $chunk[2] );\r
- $byte4 = ord( $chunk[4] );\r
- if ( $byte2 >= 3 && $byte2 <= 31 && $byte4 == 0 && $chunk2 == 'JG' ) {\r
- return 'image/x-jg';\r
- }\r
- }\r
- // More endian confusion?\r
- if ( $chunk4 == 'MROF' ) {\r
- return 'audio/x-aiff';\r
- }\r
- $chunk4_8 = substr( $chunk, 8, 4 );\r
- if ( $chunk4 == 'FORM' && ( $chunk4_8 == 'AIFF' || $chunk4_8 == 'AIFC' ) ) {\r
- return 'audio/x-aiff';\r
- }\r
- if ( $chunk4 == 'RIFF' && $chunk4_8 == 'AVI ' ) {\r
- return 'video/avi';\r
- }\r
- if ( $chunk4 == "\x00\x00\x01\xb3" || $chunk4 == "\x00\x00\x01\xba" ) {\r
- return 'video/mpeg';\r
- }\r
- if ( $chunk4 == "\001\000\000\000"\r
- && substr( $chunk, 40, 4 ) == ' EMF' )\r
- {\r
- return 'image/x-emf';\r
- }\r
- if ( $chunk4 == "\xd7\xcd\xc6\x9a" ) {\r
- return 'image/x-wmf';\r
- }\r
- if ( $chunk4 == "\xca\xfe\xba\xbe" ) {\r
- return 'application/java';\r
- }\r
- if ( $chunk2 == 'PK' ) {\r
- return 'application/x-zip-compressed';\r
- }\r
- if ( $chunk2 == "\x1f\x9d" ) {\r
- return 'application/x-compressed';\r
- }\r
- if ( $chunk2 == "\x1f\x8b" ) {\r
- return 'application/x-gzip-compressed';\r
- }\r
- // Skip redundant check for ZIP\r
- if ( $chunk5 == "MThd\000" ) {\r
- return 'audio/mid';\r
- }\r
- if ( $chunk4 == '%PDF' ) {\r
- return 'application/pdf';\r
- }\r
- return false;\r
- }\r
-\r
- /**\r
- * Do heuristic checks on the bulk of the data sample.\r
- * Search for HTML tags.\r
- */\r
- protected function sampleData( $version, $chunk ) {\r
- $found = array();\r
- $counters = array(\r
- 'ctrl' => 0,\r
- 'high' => 0,\r
- 'low' => 0,\r
- 'lf' => 0,\r
- 'cr' => 0,\r
- 'ff' => 0\r
- );\r
- $htmlTags = array(\r
- 'html',\r
- 'head',\r
- 'title',\r
- 'body',\r
- 'script',\r
- 'a href',\r
- 'pre',\r
- 'img',\r
- 'plaintext',\r
- 'table'\r
- );\r
- $rdfUrl = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';\r
- $rdfPurl = 'http://purl.org/rss/1.0/';\r
- $xbmMagic1 = '#define';\r
- $xbmMagic2 = '_width';\r
- $xbmMagic3 = '_bits';\r
- $binhexMagic = 'converted with BinHex';\r
-\r
- for ( $offset = 0; $offset < strlen( $chunk ); $offset++ ) {\r
- $curChar = $chunk[$offset];\r
- if ( $curChar == "\x0a" ) {\r
- $counters['lf']++;\r
- continue;\r
- } elseif ( $curChar == "\x0d" ) {\r
- $counters['cr']++;\r
- continue;\r
- } elseif ( $curChar == "\x0c" ) {\r
- $counters['ff']++;\r
- continue;\r
- } elseif ( $curChar == "\t" ) {\r
- $counters['low']++;\r
- continue;\r
- } elseif ( ord( $curChar ) < 32 ) {\r
- $counters['ctrl']++;\r
- continue;\r
- } elseif ( ord( $curChar ) >= 128 ) {\r
- $counters['high']++;\r
- continue;\r
- }\r
-\r
- $counters['low']++;\r
- if ( $curChar == '<' ) {\r
- // XML\r
- $remainder = substr( $chunk, $offset + 1 );\r
- if ( !strncasecmp( $remainder, '?XML', 4 ) ) {\r
- $nextChar = substr( $chunk, $offset + 5, 1 );\r
- if ( $nextChar == ':' || $nextChar == ' ' || $nextChar == "\t" ) {\r
- $found['xml'] = true;\r
- }\r
- }\r
- // Scriptlet (JSP)\r
- if ( !strncasecmp( $remainder, 'SCRIPTLET', 9 ) ) {\r
- $found['scriptlet'] = true;\r
- break;\r
- }\r
- // HTML\r
- foreach ( $htmlTags as $tag ) {\r
- if ( !strncasecmp( $remainder, $tag, strlen( $tag ) ) ) {\r
- $found['html'] = true;\r
- }\r
- }\r
- // Skip broken check for additional tags (HR etc.)\r
-\r
- // CHANNEL replaced by RSS, RDF and FEED in IE 7\r
- if ( $version < 'ie07' ) {\r
- if ( !strncasecmp( $remainder, 'CHANNEL', 7 ) ) {\r
- $found['cdf'] = true;\r
- }\r
- } else {\r
- // RSS\r
- if ( !strncasecmp( $remainder, 'RSS', 3 ) ) {\r
- $found['rss'] = true;\r
- break; // return from SampleData\r
- }\r
- if ( !strncasecmp( $remainder, 'rdf:RDF', 7 ) ) {\r
- $found['rdf-tag'] = true;\r
- // no break\r
- }\r
- if ( !strncasecmp( $remainder, 'FEED', 4 ) ) {\r
- $found['atom'] = true;\r
- break;\r
- }\r
- }\r
- continue;\r
- }\r
- // Skip broken check for -->\r
-\r
- // RSS URL checks\r
- // For some reason both URLs must appear before it is recognised\r
- $remainder = substr( $chunk, $offset );\r
- if ( !strncasecmp( $remainder, $rdfUrl, strlen( $rdfUrl ) ) ) {\r
- $found['rdf-url'] = true;\r
- if ( isset( $found['rdf-tag'] )\r
- && isset( $found['rdf-purl'] ) ) // [sic]\r
- {\r
- break;\r
- }\r
- continue;\r
- }\r
-\r
- if ( !strncasecmp( $remainder, $rdfPurl, strlen( $rdfPurl ) ) ) {\r
- if ( isset( $found['rdf-tag'] ) \r
- && isset( $found['rdf-url'] ) ) // [sic]\r
- {\r
- break;\r
- }\r
- continue;\r
- }\r
-\r
- // XBM checks\r
- if ( !strncasecmp( $remainder, $xbmMagic1, strlen( $xbmMagic1 ) ) ) {\r
- $found['xbm1'] = true;\r
- continue;\r
- }\r
- if ( $curChar == '_' ) {\r
- if ( isset( $found['xbm2'] ) ) {\r
- if ( !strncasecmp( $remainder, $xbmMagic3, strlen( $xbmMagic3 ) ) ) {\r
- $found['xbm'] = true;\r
- break;\r
- }\r
- } elseif ( isset( $found['xbm1'] ) ) {\r
- if ( !strncasecmp( $remainder, $xbmMagic2, strlen( $xbmMagic2 ) ) ) {\r
- $found['xbm2'] = true;\r
- }\r
- }\r
- }\r
-\r
- // BinHex\r
- if ( !strncasecmp( $remainder, $binhexMagic, strlen( $binhexMagic ) ) ) {\r
- $found['binhex'] = true;\r
- }\r
- }\r
- return array( 'found' => $found, 'counters' => $counters );\r
- }\r
-\r
- protected function getDataFormat( $version, $type ) {\r
- $types = $this->typeTable[$version];\r
- if ( $type == '(null)' || strval( $type ) === '' ) {\r
- return 'ambiguous';\r
- }\r
- foreach ( $types as $format => $list ) {\r
- if ( in_array( $type, $list ) ) {\r
- return $format;\r
- }\r
- }\r
- return 'unknown';\r
- }\r
-}\r
-\r
+<?php
+
+/**
+ * This class simulates Microsoft Internet Explorer's terribly broken and
+ * insecure MIME type detection algorithm. It can be used to check web uploads
+ * with an apparently safe type, to see if IE will reinterpret them to produce
+ * something dangerous.
+ *
+ * It is full of bugs and strange design choices, and should not under any
+ * circumstances be used to determine a MIME type to present to a user or
+ * client. (Apple Safari developers, this means you too.)
+ *
+ * This class is based on a disassembly of IE 5.0, 6.0 and 7.0. Although I have
+ * attempted to ensure that this code works in exactly the same way as Internet
+ * Explorer, it does not share any source code, or creative choices such as
+ * variable names, thus I (Tim Starling) claim copyright on it.
+ *
+ * It may be redistributed without restriction. To aid reuse, this class does
+ * not depend on any MediaWiki module.
+ */
+class IEContentAnalyzer {
+ /**
+ * Relevant data taken from the type table in IE 5
+ */
+ protected $baseTypeTable = array(
+ 'ambiguous' /*1*/ => array(
+ 'text/plain',
+ 'application/octet-stream',
+ 'application/x-netcdf', // [sic]
+ ),
+ 'text' /*3*/ => array(
+ 'text/richtext', 'image/x-bitmap', 'application/postscript', 'application/base64',
+ 'application/macbinhex40', 'application/x-cdf', 'text/scriptlet'
+ ),
+ 'binary' /*4*/ => array(
+ 'application/pdf', 'audio/x-aiff', 'audio/basic', 'audio/wav', 'image/gif',
+ 'image/pjpeg', 'image/jpeg', 'image/tiff', 'image/x-png', 'image/png', 'image/bmp',
+ 'image/x-jg', 'image/x-art', 'image/x-emf', 'image/x-wmf', 'video/avi',
+ 'video/x-msvideo', 'video/mpeg', 'application/x-compressed',
+ 'application/x-zip-compressed', 'application/x-gzip-compressed', 'application/java',
+ 'application/x-msdownload'
+ ),
+ 'html' /*5*/ => array( 'text/html' ),
+ );
+
+ /**
+ * Changes to the type table in later versions of IE
+ */
+ protected $addedTypes = array(
+ 'ie07' => array(
+ 'text' => array( 'text/xml', 'application/xml' )
+ ),
+ );
+
+ /**
+ * An approximation of the "Content Type" values in HKEY_CLASSES_ROOT in a
+ * typical Windows installation.
+ *
+ * Used for extension to MIME type mapping if detection fails.
+ */
+ protected $registry = array(
+ '.323' => 'text/h323',
+ '.3g2' => 'video/3gpp2',
+ '.3gp' => 'video/3gpp',
+ '.3gp2' => 'video/3gpp2',
+ '.3gpp' => 'video/3gpp',
+ '.aac' => 'audio/aac',
+ '.ac3' => 'audio/ac3',
+ '.accda' => 'application/msaccess',
+ '.accdb' => 'application/msaccess',
+ '.accdc' => 'application/msaccess',
+ '.accde' => 'application/msaccess',
+ '.accdr' => 'application/msaccess',
+ '.accdt' => 'application/msaccess',
+ '.ade' => 'application/msaccess',
+ '.adp' => 'application/msaccess',
+ '.adts' => 'audio/aac',
+ '.ai' => 'application/postscript',
+ '.aif' => 'audio/aiff',
+ '.aifc' => 'audio/aiff',
+ '.aiff' => 'audio/aiff',
+ '.amc' => 'application/x-mpeg',
+ '.application' => 'application/x-ms-application',
+ '.asf' => 'video/x-ms-asf',
+ '.asx' => 'video/x-ms-asf',
+ '.au' => 'audio/basic',
+ '.avi' => 'video/avi',
+ '.bmp' => 'image/bmp',
+ '.caf' => 'audio/x-caf',
+ '.cat' => 'application/vnd.ms-pki.seccat',
+ '.cbo' => 'application/sha',
+ '.cdda' => 'audio/aiff',
+ '.cer' => 'application/x-x509-ca-cert',
+ '.conf' => 'text/plain',
+ '.crl' => 'application/pkix-crl',
+ '.crt' => 'application/x-x509-ca-cert',
+ '.css' => 'text/css',
+ '.csv' => 'application/vnd.ms-excel',
+ '.der' => 'application/x-x509-ca-cert',
+ '.dib' => 'image/bmp',
+ '.dif' => 'video/x-dv',
+ '.dll' => 'application/x-msdownload',
+ '.doc' => 'application/msword',
+ '.docm' => 'application/vnd.ms-word.document.macroEnabled.12',
+ '.docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+ '.dot' => 'application/msword',
+ '.dotm' => 'application/vnd.ms-word.template.macroEnabled.12',
+ '.dotx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.template',
+ '.dv' => 'video/x-dv',
+ '.dwfx' => 'model/vnd.dwfx+xps',
+ '.edn' => 'application/vnd.adobe.edn',
+ '.eml' => 'message/rfc822',
+ '.eps' => 'application/postscript',
+ '.etd' => 'application/x-ebx',
+ '.exe' => 'application/x-msdownload',
+ '.fdf' => 'application/vnd.fdf',
+ '.fif' => 'application/fractals',
+ '.gif' => 'image/gif',
+ '.gsm' => 'audio/x-gsm',
+ '.hqx' => 'application/mac-binhex40',
+ '.hta' => 'application/hta',
+ '.htc' => 'text/x-component',
+ '.htm' => 'text/html',
+ '.html' => 'text/html',
+ '.htt' => 'text/webviewhtml',
+ '.hxa' => 'application/xml',
+ '.hxc' => 'application/xml',
+ '.hxd' => 'application/octet-stream',
+ '.hxe' => 'application/xml',
+ '.hxf' => 'application/xml',
+ '.hxh' => 'application/octet-stream',
+ '.hxi' => 'application/octet-stream',
+ '.hxk' => 'application/xml',
+ '.hxq' => 'application/octet-stream',
+ '.hxr' => 'application/octet-stream',
+ '.hxs' => 'application/octet-stream',
+ '.hxt' => 'application/xml',
+ '.hxv' => 'application/xml',
+ '.hxw' => 'application/octet-stream',
+ '.ico' => 'image/x-icon',
+ '.iii' => 'application/x-iphone',
+ '.ins' => 'application/x-internet-signup',
+ '.iqy' => 'text/x-ms-iqy',
+ '.isp' => 'application/x-internet-signup',
+ '.jfif' => 'image/jpeg',
+ '.jnlp' => 'application/x-java-jnlp-file',
+ '.jpe' => 'image/jpeg',
+ '.jpeg' => 'image/jpeg',
+ '.jpg' => 'image/jpeg',
+ '.jtx' => 'application/x-jtx+xps',
+ '.latex' => 'application/x-latex',
+ '.log' => 'text/plain',
+ '.m1v' => 'video/mpeg',
+ '.m2v' => 'video/mpeg',
+ '.m3u' => 'audio/x-mpegurl',
+ '.mac' => 'image/x-macpaint',
+ '.man' => 'application/x-troff-man',
+ '.mda' => 'application/msaccess',
+ '.mdb' => 'application/msaccess',
+ '.mde' => 'application/msaccess',
+ '.mfp' => 'application/x-shockwave-flash',
+ '.mht' => 'message/rfc822',
+ '.mhtml' => 'message/rfc822',
+ '.mid' => 'audio/mid',
+ '.midi' => 'audio/mid',
+ '.mod' => 'video/mpeg',
+ '.mov' => 'video/quicktime',
+ '.mp2' => 'video/mpeg',
+ '.mp2v' => 'video/mpeg',
+ '.mp3' => 'audio/mpeg',
+ '.mp4' => 'video/mp4',
+ '.mpa' => 'video/mpeg',
+ '.mpe' => 'video/mpeg',
+ '.mpeg' => 'video/mpeg',
+ '.mpf' => 'application/vnd.ms-mediapackage',
+ '.mpg' => 'video/mpeg',
+ '.mpv2' => 'video/mpeg',
+ '.mqv' => 'video/quicktime',
+ '.NMW' => 'application/nmwb',
+ '.nws' => 'message/rfc822',
+ '.odc' => 'text/x-ms-odc',
+ '.ols' => 'application/vnd.ms-publisher',
+ '.p10' => 'application/pkcs10',
+ '.p12' => 'application/x-pkcs12',
+ '.p7b' => 'application/x-pkcs7-certificates',
+ '.p7c' => 'application/pkcs7-mime',
+ '.p7m' => 'application/pkcs7-mime',
+ '.p7r' => 'application/x-pkcs7-certreqresp',
+ '.p7s' => 'application/pkcs7-signature',
+ '.pct' => 'image/pict',
+ '.pdf' => 'application/pdf',
+ '.pdx' => 'application/vnd.adobe.pdx',
+ '.pfx' => 'application/x-pkcs12',
+ '.pic' => 'image/pict',
+ '.pict' => 'image/pict',
+ '.pinstall' => 'application/x-picasa-detect',
+ '.pko' => 'application/vnd.ms-pki.pko',
+ '.png' => 'image/png',
+ '.pnt' => 'image/x-macpaint',
+ '.pntg' => 'image/x-macpaint',
+ '.pot' => 'application/vnd.ms-powerpoint',
+ '.potm' => 'application/vnd.ms-powerpoint.template.macroEnabled.12',
+ '.potx' => 'application/vnd.openxmlformats-officedocument.presentationml.template',
+ '.ppa' => 'application/vnd.ms-powerpoint',
+ '.ppam' => 'application/vnd.ms-powerpoint.addin.macroEnabled.12',
+ '.pps' => 'application/vnd.ms-powerpoint',
+ '.ppsm' => 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12',
+ '.ppsx' => 'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
+ '.ppt' => 'application/vnd.ms-powerpoint',
+ '.pptm' => 'application/vnd.ms-powerpoint.presentation.macroEnabled.12',
+ '.pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+ '.prf' => 'application/pics-rules',
+ '.ps' => 'application/postscript',
+ '.pub' => 'application/vnd.ms-publisher',
+ '.pwz' => 'application/vnd.ms-powerpoint',
+ '.py' => 'text/plain',
+ '.pyw' => 'text/plain',
+ '.qht' => 'text/x-html-insertion',
+ '.qhtm' => 'text/x-html-insertion',
+ '.qt' => 'video/quicktime',
+ '.qti' => 'image/x-quicktime',
+ '.qtif' => 'image/x-quicktime',
+ '.qtl' => 'application/x-quicktimeplayer',
+ '.rat' => 'application/rat-file',
+ '.rmf' => 'application/vnd.adobe.rmf',
+ '.rmi' => 'audio/mid',
+ '.rqy' => 'text/x-ms-rqy',
+ '.rtf' => 'application/msword',
+ '.sct' => 'text/scriptlet',
+ '.sd2' => 'audio/x-sd2',
+ '.sdp' => 'application/sdp',
+ '.shtml' => 'text/html',
+ '.sit' => 'application/x-stuffit',
+ '.sldm' => 'application/vnd.ms-powerpoint.slide.macroEnabled.12',
+ '.sldx' => 'application/vnd.openxmlformats-officedocument.presentationml.slide',
+ '.slk' => 'application/vnd.ms-excel',
+ '.snd' => 'audio/basic',
+ '.so' => 'application/x-apachemodule',
+ '.sol' => 'text/plain',
+ '.sor' => 'text/plain',
+ '.spc' => 'application/x-pkcs7-certificates',
+ '.spl' => 'application/futuresplash',
+ '.sst' => 'application/vnd.ms-pki.certstore',
+ '.stl' => 'application/vnd.ms-pki.stl',
+ '.swf' => 'application/x-shockwave-flash',
+ '.thmx' => 'application/vnd.ms-officetheme',
+ '.tif' => 'image/tiff',
+ '.tiff' => 'image/tiff',
+ '.txt' => 'text/plain',
+ '.uls' => 'text/iuls',
+ '.vcf' => 'text/x-vcard',
+ '.vdx' => 'application/vnd.ms-visio.viewer',
+ '.vsd' => 'application/vnd.ms-visio.viewer',
+ '.vss' => 'application/vnd.ms-visio.viewer',
+ '.vst' => 'application/vnd.ms-visio.viewer',
+ '.vsx' => 'application/vnd.ms-visio.viewer',
+ '.vtx' => 'application/vnd.ms-visio.viewer',
+ '.wav' => 'audio/wav',
+ '.wax' => 'audio/x-ms-wax',
+ '.wbk' => 'application/msword',
+ '.wdp' => 'image/vnd.ms-photo',
+ '.wiz' => 'application/msword',
+ '.wm' => 'video/x-ms-wm',
+ '.wma' => 'audio/x-ms-wma',
+ '.wmd' => 'application/x-ms-wmd',
+ '.wmv' => 'video/x-ms-wmv',
+ '.wmx' => 'video/x-ms-wmx',
+ '.wmz' => 'application/x-ms-wmz',
+ '.wpl' => 'application/vnd.ms-wpl',
+ '.wsc' => 'text/scriptlet',
+ '.wvx' => 'video/x-ms-wvx',
+ '.xaml' => 'application/xaml+xml',
+ '.xbap' => 'application/x-ms-xbap',
+ '.xdp' => 'application/vnd.adobe.xdp+xml',
+ '.xfdf' => 'application/vnd.adobe.xfdf',
+ '.xht' => 'application/xhtml+xml',
+ '.xhtml' => 'application/xhtml+xml',
+ '.xla' => 'application/vnd.ms-excel',
+ '.xlam' => 'application/vnd.ms-excel.addin.macroEnabled.12',
+ '.xlk' => 'application/vnd.ms-excel',
+ '.xll' => 'application/vnd.ms-excel',
+ '.xlm' => 'application/vnd.ms-excel',
+ '.xls' => 'application/vnd.ms-excel',
+ '.xlsb' => 'application/vnd.ms-excel.sheet.binary.macroEnabled.12',
+ '.xlsm' => 'application/vnd.ms-excel.sheet.macroEnabled.12',
+ '.xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+ '.xlt' => 'application/vnd.ms-excel',
+ '.xltm' => 'application/vnd.ms-excel.template.macroEnabled.12',
+ '.xltx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.template',
+ '.xlw' => 'application/vnd.ms-excel',
+ '.xml' => 'text/xml',
+ '.xps' => 'application/vnd.ms-xpsdocument',
+ '.xsl' => 'text/xml',
+ );
+
+ /**
+ * IE versions which have been analysed to bring you this class, and for
+ * which some substantive difference exists. These will appear as keys
+ * in the return value of getRealMimesFromData(). The names are chosen to sort correctly.
+ *
+ * The constructor walks this list in order and carries type additions
+ * forward from one entry to the next, and the detection code compares these
+ * names as plain strings (for example $version >= 'ie07'), so both the order
+ * of the entries and their spelling matter. The "strict" and "nohtml"
+ * substrings are looked for by getMimeTypeForVersion() to select alternate
+ * branches.
+ */
+ protected $versions = array( 'ie05', 'ie06', 'ie07', 'ie07.strict', 'ie07.nohtml' );
+
+ /**
+ * Type table with versions expanded
+ *
+ * Filled in by the constructor. Keyed by version name; each value has the
+ * same shape as $baseTypeTable (format name => list of MIME types). Read by
+ * getDataFormat().
+ */
+ protected $typeTable = array();
+
+ /**
+ * Build the per-version type table.
+ *
+ * Starts from $baseTypeTable and walks $versions in order, folding in
+ * whatever $addedTypes lists for that version. Additions accumulate, so the
+ * table stored for a version also holds everything added for the versions
+ * that precede it in $versions.
+ */
+ function __construct() {
+     $cumulative = $this->baseTypeTable;
+     foreach ( $this->versions as $versionName ) {
+         // Versions without an entry in $addedTypes contribute nothing new
+         $additions = isset( $this->addedTypes[$versionName] )
+             ? $this->addedTypes[$versionName]
+             : array();
+         foreach ( $additions as $formatName => $extraTypes ) {
+             $cumulative[$formatName] = array_merge( $cumulative[$formatName], $extraTypes );
+         }
+         // Snapshot of the table as it stands for this version
+         $this->typeTable[$versionName] = $cumulative;
+     }
+ }
+
+ /**
+ * Run getMimesFromData() and pass every detected type through
+ * translateMimeType(), so that IE's idiosyncratic private type names come
+ * back as names other apps will understand.
+ *
+ * @param string $fileName The file name (its extension is consulted as a
+ *   last resort by getMimeTypeForVersion())
+ * @param string $chunk The first 256 bytes of the file
+ * @param string $proposed The MIME type proposed by the server
+ *
+ * @return array Map of IE version to detected mime type
+ */
+ public function getRealMimesFromData( $fileName, $chunk, $proposed ) {
+     return array_map(
+         array( $this, 'translateMimeType' ),
+         $this->getMimesFromData( $fileName, $chunk, $proposed )
+     );
+ }
+
+ /**
+ * Map one of IE's private MIME type names onto the more commonly
+ * understood name for the same format.
+ *
+ * @param string $type MIME type as reported by the detection code
+ *
+ * @return string The replacement name if $type is in the translation
+ *   table, otherwise $type unchanged
+ */
+ public function translateMimeType( $type ) {
+     // IE-specific name => widely used name
+     static $publicNames = array(
+         'image/pjpeg' => 'image/jpeg',
+         'image/x-png' => 'image/png',
+         'image/x-wmf' => 'application/x-msmetafile',
+         'image/bmp' => 'image/x-bmp',
+         'application/x-zip-compressed' => 'application/zip',
+         'application/x-compressed' => 'application/x-compress',
+         'application/x-gzip-compressed' => 'application/x-gzip',
+         'audio/mid' => 'audio/midi',
+     );
+     return isset( $publicNames[$type] ) ? $publicNames[$type] : $type;
+ }
+
+ /**
+ * Run the detection once per entry in $versions and collect the raw
+ * (untranslated) results.
+ *
+ * @param string $fileName The file name (its extension is consulted as a
+ *   last resort by getMimeTypeForVersion())
+ * @param string $chunk The first 256 bytes of the file
+ * @param string $proposed The MIME type proposed by the server
+ *
+ * @return array Map of IE version to detected mime type
+ */
+ public function getMimesFromData( $fileName, $chunk, $proposed ) {
+     $detected = array();
+     foreach ( $this->versions as $ieVersion ) {
+         $detected[$ieVersion] = $this->getMimeTypeForVersion(
+             $ieVersion, $fileName, $chunk, $proposed
+         );
+     }
+     return $detected;
+ }
+
+ /**
+ * Get the MIME type for a given named version
+ *
+ * Outline of the decision sequence:
+ *  - a proposed type that is in none of the type tables (other than the two
+ *    multipart types checked below) is returned untouched, as is any
+ *    proposed type when the chunk is empty;
+ *  - a proposed HTML/GIF/JPEG (and, from IE 7, PNG) type that agrees with
+ *    the data is accepted;
+ *  - otherwise markers found by sampleData() decide, in a fixed priority;
+ *  - otherwise a text/binary heuristic plus the header checks pick a type;
+ *  - if that type is "ambiguous", the proposed type is used when its format
+ *    agrees with the data, else the file extension is looked up in
+ *    $registry.
+ *
+ * @param string $version One of the names listed in $versions
+ * @param string $fileName The file name; only the part from the last "."
+ *   onwards is used, for the final registry lookup
+ * @param string $chunk Leading bytes of the file; only the first 255 are
+ *   examined
+ * @param string $proposed The MIME type proposed by the server; anything
+ *   from the first semicolon onwards is ignored
+ *
+ * @return string The detected MIME type, untranslated
+ */
+ protected function getMimeTypeForVersion( $version, $fileName, $chunk, $proposed ) {
+     // Strip text after a semicolon
+     $semiPos = strpos( $proposed, ';' );
+     if ( $semiPos !== false ) {
+         $proposed = substr( $proposed, 0, $semiPos );
+     }
+
+     $proposedFormat = $this->getDataFormat( $version, $proposed );
+     if ( $proposedFormat == 'unknown'
+         && $proposed != 'multipart/mixed'
+         && $proposed != 'multipart/x-mixed-replace' )
+     {
+         return $proposed;
+     }
+     if ( strval( $chunk ) === '' ) {
+         return $proposed;
+     }
+
+     // Truncate chunk at 255 bytes
+     $chunk = substr( $chunk, 0, 255 );
+
+     // IE does the Check*Headers() calls last, and instead does the following image
+     // type checks by directly looking for the magic numbers. What I do here should
+     // have the same effect since the magic number checks are identical in both cases.
+     $result = $this->sampleData( $version, $chunk );
+     $sampleFound = $result['found'];
+     $counters = $result['counters'];
+     $binaryType = $this->checkBinaryHeaders( $version, $chunk );
+     $textType = $this->checkTextHeaders( $version, $chunk );
+
+     if ( $proposed == 'text/html' && isset( $sampleFound['html'] ) ) {
+         return 'text/html';
+     }
+     if ( $proposed == 'image/gif' && $binaryType == 'image/gif' ) {
+         return 'image/gif';
+     }
+     if ( ( $proposed == 'image/pjpeg' || $proposed == 'image/jpeg' )
+         && $binaryType == 'image/pjpeg' )
+     {
+         return $proposed;
+     }
+     // PNG check added in IE 7
+     // (plain string comparison: 'ie07', 'ie07.strict' and 'ie07.nohtml' all pass)
+     if ( $version >= 'ie07'
+         && ( $proposed == 'image/x-png' || $proposed == 'image/png' )
+         && $binaryType == 'image/x-png' )
+     {
+         return $proposed;
+     }
+
+     // CDF was removed in IE 7 so it won't be in $sampleFound for later versions
+     if ( isset( $sampleFound['cdf'] ) ) {
+         return 'application/x-cdf';
+     }
+
+     // RSS and Atom were added in IE 7 so they won't be in $sampleFound for
+     // previous versions
+     if ( isset( $sampleFound['rss'] ) ) {
+         return 'application/rss+xml';
+     }
+     if ( isset( $sampleFound['rdf-tag'] )
+         && isset( $sampleFound['rdf-url'] )
+         && isset( $sampleFound['rdf-purl'] ) )
+     {
+         return 'application/rss+xml';
+     }
+     if ( isset( $sampleFound['atom'] ) ) {
+         return 'application/atom+xml';
+     }
+
+     if ( isset( $sampleFound['xml'] ) ) {
+         // TODO: I'm not sure under what circumstances this flag is enabled
+         if ( strpos( $version, 'strict' ) !== false ) {
+             if ( $proposed == 'text/html' || $proposed == 'text/xml' ) {
+                 return 'text/xml';
+             }
+         } else {
+             return 'text/xml';
+         }
+     }
+     if ( isset( $sampleFound['html'] ) ) {
+         // TODO: I'm not sure under what circumstances this flag is enabled
+         if ( strpos( $version, 'nohtml' ) !== false ) {
+             if ( $proposed == 'text/plain' ) {
+                 return 'text/html';
+             }
+         } else {
+             return 'text/html';
+         }
+     }
+     if ( isset( $sampleFound['xbm'] ) ) {
+         return 'image/x-bitmap';
+     }
+     if ( isset( $sampleFound['binhex'] ) ) {
+         return 'application/macbinhex40';
+     }
+     if ( isset( $sampleFound['scriptlet'] ) ) {
+         if ( strpos( $version, 'strict' ) !== false ) {
+             if ( $proposed == 'text/plain' || $proposed == 'text/scriptlet' ) {
+                 return 'text/scriptlet';
+             }
+         } else {
+             return 'text/scriptlet';
+         }
+     }
+
+     // Freaky heuristics to determine if the data is text or binary
+     // The heuristic is of course broken for non-ASCII text
+     if ( $counters['ctrl'] != 0 && ( $counters['ff'] + $counters['low'] )
+         < ( $counters['ctrl'] + $counters['high'] ) * 16 )
+     {
+         // Binary-looking data: prefer a binary header match over a text one
+         $kindOfBinary = true;
+         $type = $binaryType ? $binaryType : $textType;
+         if ( $type === false ) {
+             $type = 'application/octet-stream';
+         }
+     } else {
+         // Text-looking data: prefer a text header match over a binary one
+         $kindOfBinary = false;
+         $type = $textType ? $textType : $binaryType;
+         if ( $type === false ) {
+             $type = 'text/plain';
+         }
+     }
+
+     // Check if the output format is ambiguous
+     // This generally means that detection failed, real types aren't ambiguous
+     $detectedFormat = $this->getDataFormat( $version, $type );
+     if ( $detectedFormat != 'ambiguous' ) {
+         return $type;
+     }
+
+     if ( $proposedFormat != 'ambiguous' ) {
+         // FormatAgreesWithData()
+         if ( $proposedFormat == 'text' && !$kindOfBinary ) {
+             return $proposed;
+         }
+         if ( $proposedFormat == 'binary' && $kindOfBinary ) {
+             return $proposed;
+         }
+         if ( $proposedFormat == 'html' ) {
+             return $proposed;
+         }
+     }
+
+     // Find a MIME type by searching the registry for the file extension.
+     $dotPos = strrpos( $fileName, '.' );
+     if ( $dotPos === false ) {
+         return $type;
+     }
+     $ext = substr( $fileName, $dotPos );
+     if ( isset( $this->registry[$ext] ) ) {
+         return $this->registry[$ext];
+     }
+     // Windows registry key names are not case sensitive, so "FOO.HTML" must
+     // find the ".html" entry (and "foo.nmw" the ".NMW" one). The exact-match
+     // lookup above is kept as the fast path; this scan only runs on a miss.
+     foreach ( $this->registry as $registeredExt => $registeredType ) {
+         if ( strcasecmp( $registeredExt, $ext ) === 0 ) {
+             return $registeredType;
+         }
+     }
+
+     // TODO: If the extension has an application registered to it, IE will return
+     // application/octet-stream. We'll skip that, so we could erroneously
+     // return text/plain or application/x-netcdf where application/octet-stream
+     // would be correct.
+
+     return $type;
+ }
+
+ /**
+ * Look for one of a handful of text-format signatures at the very start of
+ * the chunk.
+ * Confirmed same in 5 and 7.
+ *
+ * @param string $version IE version name (not consulted here)
+ * @param string $chunk Leading bytes of the file
+ *
+ * @return string|bool MIME type of the first signature that matches, or
+ *   false when none does
+ */
+ private function checkTextHeaders( $version, $chunk ) {
+     // Signature => type, tried in this order
+     $signatures = array(
+         '%PDF' => 'application/pdf',
+         '%!' => 'application/postscript',
+         '{\\rtf' => 'text/richtext',
+         'begin' => 'application/base64',
+     );
+     foreach ( $signatures as $magic => $mime ) {
+         if ( strncmp( $chunk, $magic, strlen( $magic ) ) === 0 ) {
+             return $mime;
+         }
+     }
+     return false;
+ }
+
+ /**
+ * Compare the start of the chunk against a fixed sequence of binary file
+ * signatures and report the type of the first one that matches.
+ * Confirmed same in 5 and 7.
+ *
+ * @param string $version IE version name (not consulted here)
+ * @param string $chunk Leading bytes of the file
+ *
+ * @return string|bool IE's name for the matching type, or false when no
+ *   signature matches
+ */
+ private function checkBinaryHeaders( $version, $chunk ) {
+     $head2 = substr( $chunk, 0, 2 );
+     $head4 = substr( $chunk, 0, 4 );
+     $head5 = substr( $chunk, 0, 5 );
+     // Four bytes at offset 8, compared below for RIFF and FORM data
+     $tagAt8 = substr( $chunk, 8, 4 );
+
+     if ( $head5 === 'GIF87' || $head5 === 'GIF89' ) {
+         return 'image/gif';
+     }
+     if ( $head2 === "\xff\xd8" ) {
+         // actually plain JPEG but this is what IE returns
+         return 'image/pjpeg';
+     }
+     if ( $head2 === 'BM'
+         && substr( $chunk, 6, 2 ) === "\000\000"
+         && substr( $chunk, 8, 2 ) !== "\000\000" )
+     {
+         // another non-standard MIME
+         return 'image/bmp';
+     }
+     if ( $head4 === 'RIFF' && $tagAt8 === 'WAVE' ) {
+         return 'audio/wav';
+     }
+     // These were integer literals in IE
+     // Perhaps the author was not sure what the target endianness was
+     if ( in_array( $head4, array( ".sd\000", ".snd", "\000ds.", "dns." ), true ) ) {
+         return 'audio/basic';
+     }
+     if ( substr( $chunk, 0, 3 ) === "MM\000" ) {
+         return 'image/tiff';
+     }
+     if ( $head2 === 'MZ' ) {
+         return 'application/x-msdownload';
+     }
+     if ( substr( $chunk, 0, 8 ) === "\x89PNG\x0d\x0a\x1a\x0a" ) {
+         // [sic]
+         return 'image/x-png';
+     }
+     // "JG", then a byte in the range 3..31 at offset 2 and a zero byte at offset 4
+     if ( $head2 === 'JG' && strlen( $chunk ) >= 5 ) {
+         $thirdByte = ord( $chunk[2] );
+         if ( $thirdByte >= 3 && $thirdByte <= 31 && ord( $chunk[4] ) === 0 ) {
+             return 'image/x-jg';
+         }
+     }
+     // More endian confusion?
+     if ( $head4 === 'MROF' ) {
+         return 'audio/x-aiff';
+     }
+     if ( $head4 === 'FORM' && ( $tagAt8 === 'AIFF' || $tagAt8 === 'AIFC' ) ) {
+         return 'audio/x-aiff';
+     }
+     if ( $head4 === 'RIFF' && $tagAt8 === 'AVI ' ) {
+         return 'video/avi';
+     }
+     if ( $head4 === "\x00\x00\x01\xb3" || $head4 === "\x00\x00\x01\xba" ) {
+         return 'video/mpeg';
+     }
+     if ( $head4 === "\001\000\000\000" && substr( $chunk, 40, 4 ) === ' EMF' ) {
+         return 'image/x-emf';
+     }
+     if ( $head4 === "\xd7\xcd\xc6\x9a" ) {
+         return 'image/x-wmf';
+     }
+     if ( $head4 === "\xca\xfe\xba\xbe" ) {
+         return 'application/java';
+     }
+     if ( $head2 === 'PK' ) {
+         return 'application/x-zip-compressed';
+     }
+     if ( $head2 === "\x1f\x9d" ) {
+         return 'application/x-compressed';
+     }
+     if ( $head2 === "\x1f\x8b" ) {
+         return 'application/x-gzip-compressed';
+     }
+     // Skip redundant check for ZIP
+     if ( $head5 === "MThd\000" ) {
+         return 'audio/mid';
+     }
+     if ( $head4 === '%PDF' ) {
+         return 'application/pdf';
+     }
+     return false;
+ }
+
+ /**
+ * Do heuristic checks on the bulk of the data sample.
+ * Search for HTML tags.
+ *
+ * Walks the chunk one byte at a time, classifying each byte into a counter
+ * and looking for markup and magic strings that start at that byte. Several
+ * markers end the scan early, in which case the counters only cover the
+ * bytes seen up to that point.
+ *
+ * @param string $version One of the names listed in $versions; names that
+ *   compare below 'ie07' look for CHANNEL, the others for RSS, rdf:RDF and
+ *   FEED
+ * @param string $chunk The data sample to scan
+ *
+ * @return array Two keys:
+ *   - 'found': array whose keys are the markers seen ('xml', 'scriptlet',
+ *     'html', 'cdf', 'rss', 'rdf-tag', 'rdf-url', 'rdf-purl', 'atom',
+ *     'xbm1', 'xbm2', 'xbm', 'binhex'), each mapped to true
+ *   - 'counters': byte counts keyed 'ctrl', 'high', 'low', 'lf', 'cr', 'ff'
+ */
+ protected function sampleData( $version, $chunk ) {
+     $found = array();
+     $counters = array(
+         'ctrl' => 0,
+         'high' => 0,
+         'low' => 0,
+         'lf' => 0,
+         'cr' => 0,
+         'ff' => 0
+     );
+     $htmlTags = array(
+         'html',
+         'head',
+         'title',
+         'body',
+         'script',
+         'a href',
+         'pre',
+         'img',
+         'plaintext',
+         'table'
+     );
+     $rdfUrl = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
+     $rdfPurl = 'http://purl.org/rss/1.0/';
+     $xbmMagic1 = '#define';
+     $xbmMagic2 = '_width';
+     $xbmMagic3 = '_bits';
+     $binhexMagic = 'converted with BinHex';
+
+     // $chunk is never modified below, so its length can be taken once
+     $chunkLength = strlen( $chunk );
+     for ( $offset = 0; $offset < $chunkLength; $offset++ ) {
+         $curChar = $chunk[$offset];
+         // Classify the byte. Everything except printable ASCII is only
+         // counted and takes no part in the marker searches.
+         if ( $curChar == "\x0a" ) {
+             $counters['lf']++;
+             continue;
+         } elseif ( $curChar == "\x0d" ) {
+             $counters['cr']++;
+             continue;
+         } elseif ( $curChar == "\x0c" ) {
+             $counters['ff']++;
+             continue;
+         } elseif ( $curChar == "\t" ) {
+             $counters['low']++;
+             continue;
+         } elseif ( ord( $curChar ) < 32 ) {
+             $counters['ctrl']++;
+             continue;
+         } elseif ( ord( $curChar ) >= 128 ) {
+             $counters['high']++;
+             continue;
+         }
+
+         $counters['low']++;
+         if ( $curChar == '<' ) {
+             // XML
+             $remainder = substr( $chunk, $offset + 1 );
+             if ( !strncasecmp( $remainder, '?XML', 4 ) ) {
+                 $nextChar = substr( $chunk, $offset + 5, 1 );
+                 if ( $nextChar == ':' || $nextChar == ' ' || $nextChar == "\t" ) {
+                     $found['xml'] = true;
+                 }
+             }
+             // Scriptlet (JSP)
+             if ( !strncasecmp( $remainder, 'SCRIPTLET', 9 ) ) {
+                 $found['scriptlet'] = true;
+                 break;
+             }
+             // HTML
+             foreach ( $htmlTags as $tag ) {
+                 if ( !strncasecmp( $remainder, $tag, strlen( $tag ) ) ) {
+                     $found['html'] = true;
+                 }
+             }
+             // Skip broken check for additional tags (HR etc.)
+
+             // CHANNEL replaced by RSS, RDF and FEED in IE 7
+             if ( $version < 'ie07' ) {
+                 if ( !strncasecmp( $remainder, 'CHANNEL', 7 ) ) {
+                     $found['cdf'] = true;
+                 }
+             } else {
+                 // RSS
+                 if ( !strncasecmp( $remainder, 'RSS', 3 ) ) {
+                     $found['rss'] = true;
+                     break; // return from SampleData
+                 }
+                 if ( !strncasecmp( $remainder, 'rdf:RDF', 7 ) ) {
+                     $found['rdf-tag'] = true;
+                     // no break
+                 }
+                 if ( !strncasecmp( $remainder, 'FEED', 4 ) ) {
+                     $found['atom'] = true;
+                     break;
+                 }
+             }
+             continue;
+         }
+         // Skip broken check for -->
+
+         // RSS URL checks
+         // For some reason both URLs must appear before it is recognised
+         $remainder = substr( $chunk, $offset );
+         if ( !strncasecmp( $remainder, $rdfUrl, strlen( $rdfUrl ) ) ) {
+             $found['rdf-url'] = true;
+             if ( isset( $found['rdf-tag'] )
+                 && isset( $found['rdf-purl'] ) ) // [sic]
+             {
+                 break;
+             }
+             continue;
+         }
+
+         if ( !strncasecmp( $remainder, $rdfPurl, strlen( $rdfPurl ) ) ) {
+             // Record the sighting, mirroring the rdf-url branch above.
+             // Without this the 'rdf-purl' flag is never set and the
+             // rdf-tag + rdf-url + rdf-purl test for RSS 1.0 can never pass.
+             $found['rdf-purl'] = true;
+             if ( isset( $found['rdf-tag'] )
+                 && isset( $found['rdf-url'] ) ) // [sic]
+             {
+                 break;
+             }
+             continue;
+         }
+
+         // XBM checks
+         // Three stages, each only reachable once the previous one has been
+         // seen: "#define", then "_width", then "_bits".
+         if ( !strncasecmp( $remainder, $xbmMagic1, strlen( $xbmMagic1 ) ) ) {
+             $found['xbm1'] = true;
+             continue;
+         }
+         if ( $curChar == '_' ) {
+             if ( isset( $found['xbm2'] ) ) {
+                 if ( !strncasecmp( $remainder, $xbmMagic3, strlen( $xbmMagic3 ) ) ) {
+                     $found['xbm'] = true;
+                     break;
+                 }
+             } elseif ( isset( $found['xbm1'] ) ) {
+                 if ( !strncasecmp( $remainder, $xbmMagic2, strlen( $xbmMagic2 ) ) ) {
+                     $found['xbm2'] = true;
+                 }
+             }
+         }
+
+         // BinHex
+         if ( !strncasecmp( $remainder, $binhexMagic, strlen( $binhexMagic ) ) ) {
+             $found['binhex'] = true;
+         }
+     }
+     return array( 'found' => $found, 'counters' => $counters );
+ }
+
+ /**
+ * Find which format group of the given version's type table a MIME type
+ * belongs to.
+ *
+ * @param string $version A key of $typeTable
+ * @param string $type The MIME type to classify
+ *
+ * @return string 'ambiguous' for an empty type or the literal '(null)';
+ *   otherwise the name of the first format whose list contains $type, or
+ *   'unknown' if no list does
+ */
+ protected function getDataFormat( $version, $type ) {
+     $formats = $this->typeTable[$version];
+     if ( $type == '(null)' || strval( $type ) === '' ) {
+         return 'ambiguous';
+     }
+     $matched = 'unknown';
+     foreach ( $formats as $formatName => $members ) {
+         if ( in_array( $type, $members ) ) {
+             $matched = $formatName;
+             break;
+         }
+     }
+     return $matched;
+ }
+}
+