www/plugins-dist/medias/lib/getid3/module.misc.pdf.php

   1 <?php
   2
   3 /////////////////////////////////////////////////////////////////
   4 /// getID3() by James Heinrich <info@getid3.org>               //
   5 //  available at https://github.com/JamesHeinrich/getID3       //
   6 //            or https://www.getid3.org                        //
   7 //            or http://getid3.sourceforge.net                 //
   8 //  see readme.txt for more details                            //
   9 /////////////////////////////////////////////////////////////////
  10 //                                                             //
  11 // module.misc.pdf.php                                         //
  12 // module for analyzing PDF files                              //
  13 // dependencies: NONE                                          //
  14 //                                                            ///
  15 /////////////////////////////////////////////////////////////////
  16
  // Stop right away when GETID3_INCLUDEPATH has not been defined, i.e. when this
  // module is requested directly instead of being loaded by getID3 itself
  // (prevents path-exposing attacks that access modules directly on public webservers).
  defined('GETID3_INCLUDEPATH') || exit;
  20
  /**
   * getID3 module for analyzing PDF files.
   *
   * Checks the "%PDF-x.y" header, follows the "startxref" pointer near the end of
   * the file to the Cross-Reference Table (XREF), and then walks the object offsets
   * listed there looking for a page-tree object whose "/Count" holds the page count.
   */
  class getid3_pdf extends getid3_handler
  {
      /**
       * Return full details of PDF Cross-Reference Table (XREF).
       * When false, the per-object 'offset', 'generation' and 'entry' tables are
       * removed from the result once analysis is finished.
       *
       * @var bool
       */
      public $returnXREF = false;

      /**
       * Analyze the file and populate $this->getid3->info['pdf'].
       *
       * @return bool false when the file does not start with a "%PDF-" header,
       *              true otherwise (even if later parsing steps reported errors)
       */
      public function Analyze() {
          $info = &$this->getid3->info;

          $this->fseek(0);
          if (!preg_match('#^%PDF-([0-9\\.]+)$#', rtrim($this->fgets()), $matches)) {
              $this->error('Did not find "%PDF" at the beginning of the PDF');
              return false;
          }
          $info['pdf']['header']['version'] = floatval($matches[1]);
          $info['fileformat'] = 'pdf';

          // The PDF Cross-Reference Table (XREF) is located near the end of the file.
          // Its starting offset is specified on the two lines just before "%%EOF":
          // the first line is "startxref", the second line is the byte offset of the XREF.
          // We know the length of "%%EOF" and "startxref", but the offset could be 2-10 bytes,
          // and we're not sure if the line ends are one or two bytes, so "startxref" might be
          // as little as 18(?) bytes from EOF, but it could be 30 bytes, so we start 40 bytes
          // back just to be safe and search for the data we want.
          $this->fseek(-40, SEEK_END);
          if (preg_match('#[\r\n]startxref[ \r\n]+([0-9]+)[ \r\n]+#', $this->fread(40), $matches)) {
              $info['pdf']['trailer']['startxref'] = intval($matches[1]);
              $this->parseXREF($info['pdf']['trailer']['startxref']);
              if (!empty($info['pdf']['xref']['offset'])) {
                  $this->scanForAdditionalXREFTables();
                  $this->extractPageCount();
                  if (!$this->returnXREF) {
                      unset($info['pdf']['xref']['offset'], $info['pdf']['xref']['generation'], $info['pdf']['xref']['entry']);
                  }
              } else {
                  $this->error('Did not find "xref" at offset '.$info['pdf']['trailer']['startxref']);
              }
          } else {
              $this->error('Did not find "startxref" in the last 40 bytes of the PDF');
          }

          $this->warning('PDF parsing incomplete in this version of getID3() ['.$this->getid3->version().']');
          return true;
      }

      /**
       * Brute-force scan for further XREF tables while the highest known object
       * number exceeds the number of XREF entries parsed so far.
       *
       * @return void
       */
      private function scanForAdditionalXREFTables() {
          $info = &$this->getid3->info;

          while (!$this->feof() && (max(array_keys($info['pdf']['xref']['offset'])) > $info['pdf']['xref']['count'])) {
              // Suspect that there may be another XREF table somewhere in the file.
              // Each pass restarts at the beginning of the file (an alternative would be to
              // start at the last known entry of the main XREF table) and parses the first
              // "xref" line whose offset has not been recorded yet. The scan ends when a pass
              // reaches the end of the file without finding a new table.
              $this->fseek(0);
              while (!$this->feof()) {
                  $XREFoffset = $this->ftell();
                  if (rtrim($this->fgets()) === 'xref') {
                      if (empty($info['pdf']['xref']['xref_offsets']) || !in_array($XREFoffset, $info['pdf']['xref']['xref_offsets'])) {
                          $this->parseXREF($XREFoffset);
                          break;
                      }
                  }
              }
          }
      }

      /**
       * Walk the objects listed in the XREF table and store the page count of the
       * first page-tree-like object found in $info['pdf']['pages'].
       *
       * @return void
       */
      private function extractPageCount() {
          $info = &$this->getid3->info;

          foreach ($info['pdf']['xref']['offset'] as $objectNumber => $offset) {
              if ($info['pdf']['xref']['entry'][$objectNumber] === 'f') {
                  // "free" object means "deleted", ignore
                  continue;
              }
              $this->fseek($offset);
              $line = rtrim($this->fgets());
              if (!preg_match('#^'.$objectNumber.' ([0-9]+) obj#', $line, $matches)) {
                  $this->error('Unexpected structure "'.$line.'" at offset '.$offset);
                  break;
              }
              if (strlen($line) > strlen($matches[0])) {
                  // object header is not on its own line, rewind file pointer to where the data starts
                  $this->fseek($offset + strlen($matches[0]));
              }
              $objectData = $this->readObjectBody($objectNumber);
              if (preg_match('#^<<[\r\n\s]*(/Type|/Pages|/Parent [0-9]+ [0-9]+ [A-Z]|/Count [0-9]+|/Kids *\\[[0-9A-Z ]+\\]|[\r\n\s])+[\r\n\s]*>>#', $objectData, $matches)) {
                  if (preg_match('#/Count ([0-9]+)#', $objectData, $matches)) {
                      $info['pdf']['pages'] = (int) $matches[1];
                      // For now this is the only data we're looking for in the PDF, so there is no need
                      // to loop through every object in the file (a large PDF may contain MANY objects).
                      // It MAY be possible that other objects elsewhere in the file define additional
                      // (or removed?) pages.
                      break;
                  }
              }
          }
      }

      /**
       * Collect the lines between the current file position and the next line
       * consisting solely of "endobj".
       *
       * @param int $objectNumber object number, used only in the warning message
       *
       * @return string the data read; incomplete if the file ended before "endobj"
       */
      private function readObjectBody($objectNumber) {
          $objectData = '';
          while (true) {
              $line = $this->fgets();
              if (($line === false) || ($line === '')) {
                  // Nothing could be read any more: the file ended before "endobj" was seen.
                  // Without this guard a truncated or malformed PDF would loop here forever.
                  $this->warning('Did not find "endobj" for object '.$objectNumber.' before the end of the PDF');
                  break;
              }
              if (rtrim($line) === 'endobj') {
                  break;
              }
              $objectData .= $line;
          }
          return $objectData;
      }

      /**
       * Parse the XREF table found at the given offset and merge its entries into
       * $info['pdf']['xref'] ('offset', 'generation' and 'entry', keyed by object number).
       *
       * Only the first subsection ("<first object number> <count>" followed by <count>
       * entries) of the table is read.
       *
       * @param int $XREFoffset byte offset of the "xref" keyword line
       *
       * @return bool true if the subsection was parsed completely, false otherwise
       */
      private function parseXREF($XREFoffset) {
          $info = &$this->getid3->info;

          $this->fseek($XREFoffset);
          if (rtrim($this->fgets()) !== 'xref') {
              $this->warning('failed to find expected XREF structure at offset '.$XREFoffset);
              return false;
          }

          // Record the offset before any further validation so that the brute-force scan
          // in Analyze() never tries to parse the same table twice.
          $info['pdf']['xref']['xref_offsets'][$XREFoffset] = $XREFoffset;

          // Validate the subsection header instead of blindly exploding it: a malformed line
          // would otherwise raise an undefined-offset notice, and a non-numeric first object
          // number would make the additions below fail on PHP 8.
          $subsectionHeader = rtrim($this->fgets());
          if (!preg_match('#^([0-9]+) +([0-9]+)$#', $subsectionHeader, $matches)) {
              $this->error('failed to parse XREF subsection header "'.$subsectionHeader.'" in XREF table at offset '.$XREFoffset);
              return false;
          }
          $firstObjectNumber = (int) $matches[1];
          $XREFcount         = (int) $matches[2];

          $info['pdf']['xref']['count'] = $XREFcount + (!empty($info['pdf']['xref']['count']) ? $info['pdf']['xref']['count'] : 0);
          for ($i = 0; $i < $XREFcount; $i++) {
              $line = rtrim($this->fgets());
              if (!preg_match('#^([0-9]+) ([0-9]+) ([nf])$#', $line, $matches)) {
                  $this->error('failed to parse XREF entry #'.$i.' in XREF table at offset '.$XREFoffset);
                  return false;
              }
              $objectNumber = $firstObjectNumber + $i;
              $info['pdf']['xref']['offset'][$objectNumber]     = (int) $matches[1];
              $info['pdf']['xref']['generation'][$objectNumber] = (int) $matches[2];
              $info['pdf']['xref']['entry'][$objectNumber]      =       $matches[3];
          }
          sort($info['pdf']['xref']['xref_offsets']);
          return true;
      }

  }