[SPIP] ~maj v3.2.9-->v3.2.11
[lhc/web/www.git] / www / plugins-dist / medias / lib / getid3 / module.misc.pdf.php
1 <?php
2
3 /////////////////////////////////////////////////////////////////
4 /// getID3() by James Heinrich <info@getid3.org> //
5 // available at https://github.com/JamesHeinrich/getID3 //
6 // or https://www.getid3.org //
7 // or http://getid3.sourceforge.net //
8 // see readme.txt for more details //
9 /////////////////////////////////////////////////////////////////
10 // //
11 // module.misc.pdf.php //
12 // module for analyzing PDF files //
13 // dependencies: NONE //
14 // ///
15 /////////////////////////////////////////////////////////////////
16
// Modules are only meant to be loaded by getID3 itself: when the include-path constant is missing
// the file is being requested directly (e.g. on a public webserver), so stop before anything can
// be emitted that would expose filesystem paths.
if (defined('GETID3_INCLUDEPATH') === false) {
	exit;
}
20
class getid3_pdf extends getid3_handler
{
	/**
	 * When true, the parsed Cross-Reference Table (XREF) details (offset / generation / entry)
	 * are kept in the result array; when false they are removed once analysis is finished.
	 *
	 * @var bool
	 */
	public $returnXREF = false;

	/**
	 * Identify a PDF file, record its header version and the "startxref" offset, parse the
	 * XREF table(s) and try to determine the page count from the page-tree object.
	 *
	 * Problems are reported through error() / warning(); the method only returns false when
	 * the "%PDF-x.y" signature is missing from the first line.
	 *
	 * @return bool
	 */
	public function Analyze() {
		$info = &$this->getid3->info;

		$this->fseek(0);
		if (!preg_match('#^%PDF-([0-9\\.]+)$#', rtrim($this->fgets()), $matches)) {
			$this->error('Did not find "%PDF" at the beginning of the PDF');
			return false;
		}
		$info['pdf']['header']['version'] = floatval($matches[1]);
		$info['fileformat']               = 'pdf';

		// the PDF Cross-Reference Table (XREF) is located near the end of the file
		// the starting offset is specified in the penultimate section, on the two lines just before "%%EOF"
		// the first line is "startxref", the second line is the byte offset of the XREF.
		// We know the length of "%%EOF" and "startxref", but the offset could be 2-10 bytes,
		// and we're not sure if the line ends are one or two bytes, so we might find "startxref" as little as 18(?) bytes
		// from EOF, but it could be 30 bytes, so we start 40 bytes back just to be safe and do a search for the data we want.
		$this->fseek(-40, SEEK_END);
		if (preg_match('#[\r\n]startxref[ \r\n]+([0-9]+)[ \r\n]+#', $this->fread(40), $matches)) {
			$info['pdf']['trailer']['startxref'] = intval($matches[1]);
			$this->parseXREF($info['pdf']['trailer']['startxref']);
			if (!empty($info['pdf']['xref']['offset'])) {
				// an object number higher than the number of entries parsed so far suggests that there
				// may be another XREF table somewhere in the file: brute-force scan for it
				while (!$this->feof() && (max(array_keys($info['pdf']['xref']['offset'])) > $info['pdf']['xref']['count'])) {
					$this->parseNextUnvisitedXREF();
				}
				$this->findPageCount();
				if (!$this->returnXREF) {
					unset($info['pdf']['xref']['offset'], $info['pdf']['xref']['generation'], $info['pdf']['xref']['entry']);
				}
			} else {
				$this->error('Did not find "xref" at offset '.$info['pdf']['trailer']['startxref']);
			}
		} else {
			$this->error('Did not find "startxref" in the last 40 bytes of the PDF');
		}

		$this->warning('PDF parsing incomplete in this version of getID3() ['.$this->getid3->version().']');
		return true;
	}

	/**
	 * Scan the file line by line from the beginning and parse the first "xref" table whose
	 * offset has not been recorded in $info['pdf']['xref']['xref_offsets'] yet.
	 *
	 * Returns after parsing one table, or when the end of the file is reached without finding one.
	 *
	 * @return void
	 */
	private function parseNextUnvisitedXREF() {
		$info = &$this->getid3->info;

		$this->fseek(0);
		while (!$this->feof()) {
			$XREFoffset = $this->ftell();
			if (rtrim($this->fgets()) != 'xref') {
				continue;
			}
			if (empty($info['pdf']['xref']['xref_offsets']) || !in_array($XREFoffset, $info['pdf']['xref']['xref_offsets'])) {
				$this->parseXREF($XREFoffset);
				return;
			}
		}
	}

	/**
	 * Walk the objects listed in the XREF data until the page-tree object is found and store
	 * its "/Count" value in $info['pdf']['pages'].
	 *
	 * Stops at the first object that yields a page count, and also (with an error) at the first
	 * object whose header is not where the XREF says it is or whose body is not terminated.
	 *
	 * @return void
	 */
	private function findPageCount() {
		$info = &$this->getid3->info;

		foreach ($info['pdf']['xref']['offset'] as $objectNumber => $offset) {
			if ($info['pdf']['xref']['entry'][$objectNumber] == 'f') {
				// "free" object means "deleted", ignore
				continue;
			}
			$this->fseek($offset);
			$line = rtrim($this->fgets());
			if (!preg_match('#^'.$objectNumber.' ([0-9]+) obj#', $line, $matches)) {
				$this->error('Unexpected structure "'.$line.'" at offset '.$offset);
				break;
			}
			if (strlen($line) > strlen($matches[0])) {
				// object header line not actually on its own line, rewind file pointer to start reading data
				$this->fseek($offset + strlen($matches[0]));
			}

			$objectData = $this->readObjectData();
			if ($objectData === false) {
				// truncated / corrupt file: without this check the read loop would never terminate
				$this->error('Unexpected end of file while reading object '.$objectNumber.' at offset '.$offset);
				break;
			}

			if (preg_match('#^<<[\r\n\s]*(/Type|/Pages|/Parent [0-9]+ [0-9]+ [A-Z]|/Count [0-9]+|/Kids *\\[[0-9A-Z ]+\\]|[\r\n\s])+[\r\n\s]*>>#', $objectData)) {
				if (preg_match('#/Count ([0-9]+)#', $objectData, $matches)) {
					$info['pdf']['pages'] = (int) $matches[1];
					// for now this is the only data we're looking for in the PDF, no need to loop through every
					// object in the file (and a large PDF may contain MANY objects). And it MAY be possible that
					// there are other objects elsewhere in the file that define additional (or removed?) pages
					break;
				}
			}
		}
	}

	/**
	 * Read lines from the current file position up to (not including) the line "endobj".
	 *
	 * @return string|false the collected object data, or false when the end of the file is
	 *                      reached before an "endobj" line is seen
	 */
	private function readObjectData() {
		$objectData = '';
		while (true) {
			$line = $this->fgets();
			if (rtrim($line) == 'endobj') {
				return $objectData;
			}
			// an empty read or an EOF condition means no "endobj" line can follow any more
			if (($line === false) || ($line === '') || $this->feof()) {
				return false;
			}
			$objectData .= $line;
		}
	}

	/**
	 * Parse one XREF table (a single "first count" subsection) located at the given offset and
	 * merge its entries into $info['pdf']['xref'].
	 *
	 * The offset is recorded in $info['pdf']['xref']['xref_offsets'] as soon as the "xref"
	 * keyword has been confirmed, even if the remainder of the table turns out to be malformed,
	 * so that a scan for further tables never revisits it.
	 *
	 * @param int $XREFoffset byte offset in the file at which the "xref" keyword is expected
	 *
	 * @return bool true when the whole table was parsed, false otherwise
	 */
	private function parseXREF($XREFoffset) {
		$info = &$this->getid3->info;

		$this->fseek($XREFoffset);
		if (rtrim($this->fgets()) != 'xref') {
			$this->warning('failed to find expected XREF structure at offset '.$XREFoffset);
			return false;
		}

		$info['pdf']['xref']['xref_offsets'][$XREFoffset] = $XREFoffset;

		// subsection header: "<first object number> <number of entries>"
		// validate it instead of blindly exploding it: a malformed line would otherwise raise
		// notices (missing second field) or a TypeError on PHP 8 (non-numeric object number)
		if (!preg_match('#^([0-9]+) +([0-9]+)#', rtrim($this->fgets()), $header)) {
			$this->error('failed to parse XREF subsection header in XREF table at offset '.$XREFoffset);
			return false;
		}
		$firstObjectNumber = (int) $header[1];
		$XREFcount         = (int) $header[2];

		$info['pdf']['xref']['count'] = $XREFcount + (!empty($info['pdf']['xref']['count']) ? $info['pdf']['xref']['count'] : 0);
		for ($i = 0; $i < $XREFcount; $i++) {
			$line = rtrim($this->fgets());
			if (!preg_match('#^([0-9]+) ([0-9]+) ([nf])$#', $line, $matches)) {
				$this->error('failed to parse XREF entry #'.$i.' in XREF table at offset '.$XREFoffset);
				return false;
			}
			$objectNumber = $firstObjectNumber + $i;
			$info['pdf']['xref']['offset'][$objectNumber]     = (int) $matches[1];
			$info['pdf']['xref']['generation'][$objectNumber] = (int) $matches[2];
			$info['pdf']['xref']['entry'][$objectNumber]      = $matches[3]; // "n" = in use, "f" = free
		}
		sort($info['pdf']['xref']['xref_offsets']);
		return true;
	}

}