follow-up r86169 - 2 minor issues found while writing unit tests
authorBrian Wolff <bawolff@users.mediawiki.org>
Sun, 21 Aug 2011 17:16:57 +0000 (17:16 +0000)
committerBrian Wolff <bawolff@users.mediawiki.org>
Sun, 21 Aug 2011 17:16:57 +0000 (17:16 +0000)
# Some really obscure Exif properties did not have the Exif byte order taken into account
  and were being extracted with the bytes reversed (for example user comment when encoded as utf-16).
  Not a major issue as these properties are very rare in practise, but certainly not a good thing.
  ( One would think php's exif support would take care of that, but no it does not...)
# Change the fallback encoding for Gif comments to be windows-1252 instead of iso 8859-1. More
  to be consitent with jpg and iptc then anything else.

includes/media/BitmapMetadataHandler.php
includes/media/Exif.php
includes/media/GIFMetadataExtractor.php
includes/media/JpegMetadataExtractor.php
includes/media/Tiff.php

index 13188ec..d1caa67 100644 (file)
@@ -40,16 +40,16 @@ class BitmapMetadataHandler {
 
 
        /**
-        *
-        * get exif info using exif class.
+        * Get exif info using exif class.
         * Basically what used to be in BitmapHandler::getMetadata().
         * Just calls stuff in the Exif class.
         *
         * @param $filename string
         */
-       function getExif ( $filename ) {
-               if ( file_exists( $filename ) ) {
-                       $exif = new Exif( $filename );
+       function getExif ( $filename, $byteOrder ) {
+               global $wgShowEXIF;
+               if ( file_exists( $filename ) && $wgShowEXIF ) {
+                       $exif = new Exif( $filename, $byteOrder );
                        $data = $exif->getFilteredData();
                        if ( $data ) {
                                $this->addMetadata( $data, 'exif' );
@@ -117,7 +117,6 @@ class BitmapMetadataHandler {
        static function Jpeg ( $filename ) {
                $showXMP = function_exists( 'xml_parser_create_ns' );
                $meta = new self();
-               $meta->getExif( $filename );
 
                $seg = JpegMetadataExtractor::segmentSplitter( $filename );
                if ( isset( $seg['COM'] ) && isset( $seg['COM'][0] ) ) {
@@ -141,6 +140,9 @@ class BitmapMetadataHandler {
                                $meta->addMetadata( $array, $type );
                        }
                }
+               if ( isset( $seg['byteOrder'] ) ) {
+                       $meta->getExif( $filename, $seg['byteOrder'] );
+               }
                return $meta->getMetadataArray();
        }
 
@@ -208,4 +210,60 @@ class BitmapMetadataHandler {
                return $baseArray;
        }
 
+       /**
+        * This doesn't do much yet, but eventually I plan to add
+        * XMP support for Tiff. (PHP's exif support already extracts
+        * but needs some further processing because PHP's exif support
+        * is stupid...)
+        *
+        * @todo Add XMP support, so this function actually makes
+        * sense to put here.
+        *
+        * The various exceptions this throws are caught later.
+        * @param $filename String
+        * @return Array The metadata.
+        */
+       static public function Tiff ( $filename ) {
+               if ( file_exists( $filename ) ) {
+                       $byteOrder = self::getTiffByteOrder( $filename );
+                       if ( !$byteOrder ) {
+                               throw new MWException( "Error determining byte order of $filename" );
+                       }
+                       $exif = new Exif( $filename, $byteOrder );
+                       $data = $exif->getFilteredData();
+                       if ( $data ) {
+                               $data['MEDIAWIKI_EXIF_VERSION'] = Exif::version();
+                               return $data;
+                       } else {
+                               throw new MWException( "Could not extract data from tiff file $filename" );
+                       }
+               } else {
+                       throw new MWException( "File doesn't exist - $filename" );
+               }
+       }
+       /**
+        * Read the first 2 bytes of a tiff file to figure out
+        * Little Endian or Big Endian. Needed for exif stuff.
+        *
+        * @param $filename String The filename
+        * @return String 'BE' or 'LE' or false
+        */
+       static function getTiffByteOrder( $filename ) {
+               $fh = fopen( $filename, 'rb' );
+               if ( !$fh ) return false;
+               $head = fread( $fh, 2 );
+               fclose( $fh );
+
+               switch( $head ) {
+                       case 'II':
+                               return 'LE'; // II for intel.
+                       case 'MM':
+                               return 'BE'; // MM for motorla.
+                       default:
+                               return false; // Something went wrong.
+
+               }
+       }
+
+
 }
index 2836d31..345a6f1 100644 (file)
@@ -90,6 +90,11 @@ class Exif {
         */
        var $log = false;
 
+       /**
+        * The byte order of the file. Needed because php's
+        * extension doesn't fully process some obscure props.
+        */
+       private $byteOrder;
        //@}
 
        /**
@@ -102,7 +107,7 @@ class Exif {
         * DigitalZoomRatio = 0/0 is rejected. need to determine if that's valid.
         * possibly should treat 0/0 = 0. need to read exif spec on that.
         */
-       function __construct( $file ) {
+       function __construct( $file, $byteOrder = '' ) {
                /**
                 * Page numbers here refer to pages in the EXIF 2.2 standard
                 *
@@ -275,6 +280,16 @@ class Exif {
 
                $this->file = $file;
                $this->basename = wfBaseName( $this->file );
+               if ( $byteOrder === 'BE' || $byteOrder === 'LE' ) {
+                       $this->byteOrder = $byteOrder;
+               } else {
+                       // Only give a warning for b/c, since originally we didn't
+                       // require this. The number of things affected by this is
+                       // rather small.
+                       wfWarn( 'Exif class did not have byte order specified. '
+                        . 'Some properties may be decoded incorrectly.' );
+                       $this->byteOrder = 'BE'; // BE seems about twice as popular as LE in jpg's.
+               }
 
                $this->debugFile( $this->basename, __FUNCTION__, true );
                if( function_exists( 'exif_read_data' ) ) {
@@ -394,7 +409,16 @@ class Exif {
                                }
                                $newVal .= ord( substr($val, $i, 1) );
                        }
-                       $this->mFilteredExifData['GPSVersionID'] = $newVal;
+                       if ( $this->byteOrder === 'LE' ) {
+                               // Need to reverse the string
+                               $newVal2 = '';
+                               for ( $i = strlen( $newVal ) - 1; $i >= 0; $i-- ) {
+                                       $newVal2 .= substr( $newVal, $i, 1 );
+                               }
+                               $this->mFilteredExifData['GPSVersionID'] = $newVal2;
+                       } else {
+                               $this->mFilteredExifData['GPSVersionID'] = $newVal;
+                       }
                        unset( $this->mFilteredExifData['GPSVersion'] );
                }
 
@@ -415,7 +439,6 @@ class Exif {
                                unset($this->mFilteredExifData[$prop]);
                                return;
                        }
-
                        $charCode = substr( $this->mFilteredExifData[$prop], 0, 8);
                        $val = substr( $this->mFilteredExifData[$prop], 8);
                        
@@ -426,7 +449,7 @@ class Exif {
                                        $charset = "Shift-JIS";
                                        break;
                                case "UNICODE\x00":
-                                       $charset = "UTF-16";
+                                       $charset = "UTF-16" . $this->byteOrder;
                                        break;
                                default: //ascii or undefined.
                                        $charset = "";
index a3d3ced..02a9c60 100644 (file)
@@ -126,14 +126,14 @@ class GIFMetadataExtractor {
 
                                        // The standard says this should be ASCII, however its unclear if
                                        // thats true in practise. Check to see if its valid utf-8, if so
-                                       // assume its that, otherwise assume its iso-8859-1
+                                       // assume its that, otherwise assume its windows-1252 (iso-8859-1)
                                        $dataCopy = $data;
                                        // quickIsNFCVerify has the side effect of replacing any invalid characters
                                        UtfNormal::quickIsNFCVerify( $dataCopy );
 
                                        if ( $dataCopy !== $data ) {
                                                wfSuppressWarnings();
-                                               $data = iconv( 'ISO-8859-1', 'UTF-8', $data );
+                                               $data = iconv( 'windows-1252', 'UTF-8', $data );
                                                wfRestoreWarnings();
                                        }
 
index 984bbe0..51b0ffa 100644 (file)
@@ -28,7 +28,10 @@ class JpegMetadataExtractor {
 
                $segmentCount = 0;
 
-               $segments = array( 'XMP_ext' => array(), 'COM' => array() );
+               $segments = array(
+                       'XMP_ext' => array(),
+                       'COM' => array(),
+               );
 
                if ( !$filename ) {
                        throw new MWException( "No filename specified for " . __METHOD__ );
@@ -82,23 +85,34 @@ class JpegMetadataExtractor {
                                        wfDebug( __METHOD__ . ' Ignoring JPEG comment as is garbage.' );
                                }
 
-                       } elseif ( $buffer === "\xE1" && $showXMP ) {
+                       } elseif ( $buffer === "\xE1" ) {
                                // APP1 section (Exif, XMP, and XMP extended)
                                // only extract if XMP is enabled.
                                $temp = self::jpegExtractMarker( $fh );
-
                                // check what type of app segment this is.
-                               if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" ) {
+                               if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
                                        $segments["XMP"] = substr( $temp, 29 );
-                               } elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" ) {
+                               } elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP ) {
                                        $segments["XMP_ext"][] = substr( $temp, 35 );
-                               } elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" ) {
+                               } elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
                                        // Some images (especially flickr images) seem to have this.
                                        // I really have no idea what the deal is with them, but
                                        // whatever...
                                        $segments["XMP"] = substr( $temp, 29 );
                                        wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
                                                . "Using anyways.\n" ); 
+                               } elseif ( substr( $temp, 0, 6 ) === "Exif\0\0" ) {
+                                       // Just need to find out what the byte order is.
+                                       // because php's exif plugin sucks...
+                                       // This is a II for little Endian, MM for big. Not a unicode BOM.
+                                       $byteOrderMarker = substr( $temp, 6, 2 );
+                                       if ( $byteOrderMarker === 'MM' ) {
+                                               $segments['byteOrder'] = 'BE';
+                                       } elseif ( $byteOrderMarker === 'II' ) {
+                                               $segments['byteOrder'] = 'LE';
+                                       } else {
+                                               wfDebug( __METHOD__ . ' Invalid byte ordering?!' );
+                                       }
                                }
                        } elseif ( $buffer === "\xED" ) {
                                // APP13 - PSIR. IPTC and some photoshop stuff
index 2674267..0a10389 100644 (file)
@@ -56,13 +56,20 @@ class TiffHandler extends ExifBitmapHandler {
         */
        function getMetadata( $image, $filename ) {
                global $wgShowEXIF;
-               if ( $wgShowEXIF && file_exists( $filename ) ) {
-                       $exif = new Exif( $filename );
-                       $data = $exif->getFilteredData();
-                       if ( $data ) {
-                               $data['MEDIAWIKI_EXIF_VERSION'] = Exif::version();
-                               return serialize( $data );
-                       } else {
+               if ( $wgShowEXIF ) {
+                       try {
+                               $meta = BitmapMetadataHandler::Tiff( $filename );
+                               if ( !is_array( $meta ) ) {
+                                       // This should never happen, but doesn't hurt to be paranoid.
+                                       throw new MWException('Metadata array is not an array');
+                               }
+                               $meta['MEDIAWIKI_EXIF_VERSION'] = Exif::version();
+                               return serialize( $meta );
+                       }
+                       catch ( MWException $e ) {
+                               // BitmapMetadataHandler throws an exception in certain exceptional
+                               // cases like if file does not exist.
+                               wfDebug( __METHOD__ . ': ' . $e->getMessage() . "\n" );
                                return ExifBitmapHandler::BROKEN_FILE;
                        }
                } else {