From 69905ce9c7dc2a5dcc2f0c6fee424d307d297e8e Mon Sep 17 00:00:00 2001 From: Brian Wolff Date: Tue, 24 Jun 2014 16:15:32 -0300 Subject: [PATCH] Fix mime detection of easily-confused-with text/plain formats json, csv, and tsv are often detected as text/plain. However that's not right. This patch causes MediaWiki to look at the file extension of files detected as text/plain, and if the file extension is for a "textual" type, use the mime type associated with that extension. This change also changes the "does mime type match uploaded file extension" check to use the mime based on the file contents plus extension, as opposed to just the file contents. Various documentation suggests this is more appropriate (e.g. line 807 of MimeMagic.php). In my opinion we should use just the file contents when verifying file is not on blacklist, but use ext when verifying file type matches extension, and for decided what handler specific checks to run. Not the detect mime type with extension doesn't override the detected mime type with the extension, but only uses the extension if content based detection is ambigious or not specific enough. This patch should be reviewed by csteipp before merge for any potential security implications. Note: This is partially fixing a regression from 3846d1048766a7, where previously csv and json files were allowed to be uploaded, and that change prevented them Bug: 66036 Bug: 45424 Change-Id: Ib637fe6850a81b26f84dc8c00ab4772f3d3a1f34 --- includes/MimeMagic.php | 14 ++++----- includes/mime.info | 3 ++ includes/mime.types | 2 ++ includes/upload/UploadBase.php | 2 +- tests/phpunit/includes/MimeMagicTest.php | 39 ++++++++++++++++++++++++ 5 files changed, 51 insertions(+), 9 deletions(-) create mode 100644 tests/phpunit/includes/MimeMagicTest.php diff --git a/includes/MimeMagic.php b/includes/MimeMagic.php index f4d4697c38..f258fe1541 100644 --- a/includes/MimeMagic.php +++ b/includes/MimeMagic.php @@ -485,14 +485,6 @@ class MimeMagic { * by looking at the file extension. Typically, this method would be called on the * result of guessMimeType(). * - * Currently, this method does the following: - * - * If $mime is "unknown/unknown" and isRecognizableExtension( $ext ) returns false, - * return the result of guessTypesForExtension($ext). - * - * If $mime is "application/x-opc+zip" and isMatchingExtension( $ext, $mime ) - * gives true, return the result of guessTypesForExtension($ext). - * * @param string $mime The mime type, typically guessed from a file's content. * @param string $ext The file extension, as taken from the file name * @@ -518,6 +510,12 @@ class MimeMagic { ".$ext is not a known OPC extension.\n" ); $mime = 'application/zip'; } + } elseif ( $mime === 'text/plain' && $this->findMediaType( ".$ext" ) === MEDIATYPE_TEXT ) { + // Textual types are sometimes not recognized properly. + // If detected as text/plain, and has an extension which is textual + // improve to the extension's type. For example, csv and json are often + // misdetected as text/plain. + $mime = $this->guessTypesForExtension( $ext ); } if ( isset( $this->mMimeTypeAliases[$mime] ) ) { diff --git a/includes/mime.info b/includes/mime.info index c7981871c2..b9ad10339d 100644 --- a/includes/mime.info +++ b/includes/mime.info @@ -65,6 +65,9 @@ text/plain [TEXT] text/html application/xhtml+xml [TEXT] application/xml text/xml [TEXT] text [TEXT] +application/json [TEXT] +text/csv [TEXT] +text/tab-separated-values [TEXT] application/zip application/x-zip [ARCHIVE] application/x-gzip [ARCHIVE] diff --git a/includes/mime.types b/includes/mime.types index 61f7ff5810..718f2e2927 100644 --- a/includes/mime.types +++ b/includes/mime.types @@ -35,6 +35,7 @@ application/x-gzip gz application/x-hdf hdf application/x-jar jar application/x-javascript js +application/json json application/x-koan skp skd skt skm application/x-latex latex application/x-netcdf nc cdf @@ -109,6 +110,7 @@ model/mesh msh mesh silo model/vrml wrl vrml text/calendar ics ifb text/css css +text/csv csv text/html html htm text/plain txt text/richtext rtx diff --git a/includes/upload/UploadBase.php b/includes/upload/UploadBase.php index b8ca434038..5cb67c8d8a 100644 --- a/includes/upload/UploadBase.php +++ b/includes/upload/UploadBase.php @@ -435,7 +435,7 @@ abstract class UploadBase { } $this->mFileProps = FSFile::getPropsFromPath( $this->mTempPath, $this->mFinalExtension ); - $mime = $this->mFileProps['file-mime']; + $mime = $this->mFileProps['mime']; if ( $wgVerifyMimeType ) { # XXX: Missing extension will be caught by validateName() via getTitle() diff --git a/tests/phpunit/includes/MimeMagicTest.php b/tests/phpunit/includes/MimeMagicTest.php new file mode 100644 index 0000000000..9582fe7f38 --- /dev/null +++ b/tests/phpunit/includes/MimeMagicTest.php @@ -0,0 +1,39 @@ +mimeMagic = MimeMagic::singleton(); + parent::setUp(); + } + + /** + * @dataProvider providerImproveTypeFromExtension + * @param $ext String File extension (no leading dot) + * @param $oldMime String Initially detected mime + * @param $expectedMime String Mime type after taking extension into account + */ + function testImproveTypeFromExtension( $ext, $oldMime, $expectedMime ) { + $actualMime = $this->mimeMagic->improveTypeFromExtension( $oldMime, $ext ); + $this->assertEquals( $expectedMime, $actualMime ); + } + + function providerImproveTypeFromExtension() { + return array( + array( 'gif', 'image/gif', 'image/gif' ), + array( 'gif', 'unknown/unknown', 'unknown/unknown' ), + array( 'wrl', 'unknown/unknown', 'model/vrml' ), + array( 'txt', 'text/plain', 'text/plain' ), + array( 'csv', 'text/plain', 'text/csv' ), + array( 'tsv', 'text/plain', 'text/tab-separated-values' ), + array( 'json', 'text/plain', 'application/json' ), + array( 'foo', 'application/x-opc+zip', 'application/zip' ), + array( 'docx', 'application/x-opc+zip', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ), + array( 'djvu', 'image/x-djvu', 'image/vnd.djvu' ), + array( 'wav', 'audio/wav', 'audio/wav' ), + ); + } + +} -- 2.20.1