Update formatting for media related classes
[lhc/web/wiklou.git] / includes / media / IPTC.php
1 <?php
2 /**
3 * Class for some IPTC functions.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * @file
21 * @ingroup Media
22 */
23
24 /**
25 * Class for some IPTC functions.
26 *
27 * @ingroup Media
28 */
29 class IPTC {
/**
 * Turn the output of iptcparse() into the metadata array format used by
 * MediaWiki. Generally called from BitmapMetadataHandler::doApp13.
 *
 * Tag 1:90 selects the character set; every other tag handled here is a
 * record 2 tag. Unrecognised tags, and tags that are handled together with
 * another tag, do not produce an entry of their own.
 *
 * @see http://www.iptc.org/std/IIM/4.1/specification/IIMV4.1.pdf
 *
 * @param string $rawData app13 block from jpeg containing iptc/iim data
 * @return array IPTC metadata keyed by field name. Empty when iptcparse()
 *   does not return an array, or when 1:90 holds a charset that
 *   getCharset() does not recognise.
 */
static function parse( $rawData ) {
	// Tags whose converted value is stored unchanged under a single field name.
	static $simpleFields = array(
		'2#120' => 'ImageDescription', // IPTC caption. Mapped with exif ImageDescription
		'2#116' => 'Copyright', // Mapped with exif Copyright
		'2#025' => 'Keywords',
		'2#101' => 'CountryDest', // country (shown)
		'2#095' => 'ProvinceOrStateDest', // state/province (shown)
		'2#090' => 'CityDest', // city (shown)
		'2#092' => 'SublocationDest', // sublocation (shown)
		'2#005' => 'ObjectName', // object name/title
		'2#040' => 'SpecialInstructions',
		'2#105' => 'Headline',
		// "Identifies the provider of the objectdata,
		// not necessarily the owner/creator".
		'2#110' => 'Credit',
		// "Identifies the original owner of the intellectual content of the
		// objectdata. This could be an agency, a member of an agency or
		// an individual."
		'2#115' => 'Source',
		'2#007' => 'EditStatus', // edit status (lead, correction, etc)
		'2#015' => 'iimCategory', // deprecated. max 3 letters in theory, often more
		'2#020' => 'iimSupplementalCategory', // deprecated
		'2#010' => 'Urgency', // 1-8. 1 most, 5 normal, 8 low priority
		// "Identifies objectdata that recurs often and predictably...
		// Example: Euroweather"
		'2#022' => 'FixtureIdentifier',
		// Content location code (iso 3166 + some custom things)
		// ex: TUR (for turkey), XUN (for UN), XSP (outer space)
		// See wikipedia article on iso 3166 and appendix D of iim std.
		'2#026' => 'LocationDestCode',
		// Content location name. Full printable name of location of photo.
		'2#027' => 'LocationDest',
		// Object cycle. a for morning (am), p for evening, b for both
		'2#075' => 'ObjectCycle',
		// Country/Primary location code.
		// "Indicates the code of the country/primary location where the
		// intellectual property of the objectdata was created"
		// unclear how this differs from 2#026
		'2#100' => 'CountryCodeDest',
		// "A code representing the location of original transmission
		// according to practises of the provider."
		'2#103' => 'OriginalTransmissionRef',
		'2#118' => 'Contact',
		// "Identification of the name of the person involved in the writing,
		// editing or correcting the objectdata or caption/abstract."
		'2#122' => 'Writer',
		'2#135' => 'LanguageCode',
	);

	// Date tag => array( matching time tag, output field ).
	// Incomplete dates are not accepted even though they are valid
	// according to spec. Should potentially store timezone as well.
	static $dateFields = array(
		// Date created (not date digitized). Maps to exif DateTimeOriginal
		'2#055' => array( '2#060', 'DateTimeOriginal' ),
		// Date converted to digital representation. Maps to exif DateTimeDigitized
		'2#062' => array( '2#063', 'DateTimeDigitized' ),
		'2#030' => array( '2#035', 'DateTimeReleased' ),
		'2#037' => array( '2#038', 'DateTimeExpires' ),
	);

	$parsed = iptcparse( $rawData );
	if ( !is_array( $parsed ) ) {
		return array();
	}
	$data = array();

	$c = '';
	// Charset info is contained in tag 1:90.
	if ( isset( $parsed['1#090'][0] ) ) {
		$c = self::getCharset( $parsed['1#090'][0] );
		if ( $c === false ) {
			// Unknown charset: refuse to parse.
			// Note: there is a difference between an unknown charset
			// and no charset being specified at all.
			return array();
		}
		unset( $parsed['1#090'] );
	}

	foreach ( $parsed as $tag => $val ) {
		if ( isset( $val[0] ) && trim( $val[0] ) == '' ) {
			wfDebugLog( 'iptc', "IPTC tag $tag had only whitespace as its value." );
			continue;
		}

		if ( isset( $simpleFields[$tag] ) ) {
			$data[$simpleFields[$tag]] = self::convIPTC( $val, $c );
			continue;
		}

		if ( isset( $dateFields[$tag] ) ) {
			list( $timeTag, $field ) = $dateFields[$tag];
			$time = isset( $parsed[$timeTag] ) ? $parsed[$timeTag] : array();
			$timestamp = self::timeHelper( $val, $time, $c );
			if ( $timestamp ) {
				$data[$field] = $timestamp;
			}
			continue;
		}

		switch ( $tag ) {
			case '2#080':
				// Byline. Mapped with exif Artist.
				// Merge with byline title (2:85) like how exif does it with
				// "Title, person". Not sure if this is the best approach since
				// we no longer have the two fields separate. Each byline title
				// entry corresponds to a specific byline.
				$bylines = self::convIPTC( $val, $c );
				$titles = isset( $parsed['2#085'] )
					? self::convIPTC( $parsed['2#085'], $c )
					: array();

				$titleCount = count( $titles );
				for ( $i = 0; $i < $titleCount; $i++ ) {
					// Theoretically this should always be set,
					// but it doesn't hurt to be careful.
					if ( isset( $bylines[$i] ) ) {
						$bylines[$i] = $titles[$i] . ', ' . $bylines[$i];
					}
				}
				$data['Artist'] = $bylines;
				break;

			case '2#065':
				// Originating Program.
				// Combine with Program version (2:70) if present.
				$software = self::convIPTC( $val, $c );

				if ( count( $software ) !== 1 ) {
					// According to the iim standard this cannot have multiple
					// values, so if there is more than one, something weird
					// is happening, and we skip it.
					wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
					break;
				}

				if ( isset( $parsed['2#070'] ) ) {
					// A version is set for the software.
					$softwareVersion = self::convIPTC( $parsed['2#070'], $c );
					$data['Software'] = array( array( $software[0], $softwareVersion[0] ) );
				} else {
					$data['Software'] = $software;
				}
				break;

			case '2#000':
				// iim version.
				// Unlike other tags, this is a 2-byte binary number.
				// Technically this is required if there is iptc data,
				// but in practise it isn't always there.
				if ( isset( $val[0] ) && strlen( $val[0] ) == 2 ) {
					// The length check is just to be paranoid.
					$data['iimVersion'] = ord( $val[0][0] ) * 256 + ord( $val[0][1] );
				}
				break;

			case '2#004':
				// IntellectualGenre.
				// The first 4 characters are an id code
				// that we're not really interested in.
				//
				// This prop is weird, since it's allowed to have multiple
				// values in iim 4.1, but not in the XMP stuff. We're going
				// to just extract the first value.
				$con = self::convIPTC( $val, $c );
				if ( !isset( $con[0] ) || strlen( $con[0] ) < 5 ) {
					wfDebugLog( 'iptc', 'IPTC: 2:04 too short. Ignoring.' );
					break;
				}
				$data['IntellectualGenre'] = substr( $con[0], 4 );
				break;

			case '2#012':
				// Subject News code - this is a compound field.
				// At the moment we only extract the subject news
				// code, which is an 8 digit (ascii) number
				// describing the subject matter of the content.
				$codes = self::convIPTC( $val, $c );
				foreach ( $codes as $ic ) {
					$fields = explode( ':', $ic, 3 );

					if ( count( $fields ) < 2 || $fields[0] !== 'IPTC' ) {
						// Same 'iptc' log group as every other message here.
						wfDebugLog( 'iptc', 'IPTC: Invalid 2:12 - ' . $ic );
						// Leaves the inner foreach: remaining codes are not examined.
						break;
					}
					// Each valid code overwrites the previous one.
					$data['SubjectNewsCode'] = $fields[1];
				}
				break;

			// Purposely does not do 2:125, 2:130, 2:131,
			// 2:47, 2:50, 2:45, 2:42, 2:8, 2:3
			// 2:200, 2:201, 2:202
			// or the audio stuff (2:150 to 2:154)

			case '2#070':
			case '2#060':
			case '2#063':
			case '2#085':
			case '2#038':
			case '2#035':
				// Ignore. Handled together with another tag above.
				break;

			default:
				wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ) );
				break;
		}
	}

	return $data;
}
348
/**
 * Convert an iptc date tag and time tag into the exif format.
 *
 * @todo Potentially this should also capture the timezone offset.
 * @param array $date The date tag; must hold exactly one value (CCYYMMDD)
 * @param array $time The time tag (HHMMSS+HHMM or HHMMSS-HHMM). When it
 *   does not hold exactly one value, only the date part is returned.
 * @param string $c Charset, passed through to convIPTC()
 * @return string|null Date (and time) in exif format, or null if the
 *   input is missing, malformed or cannot be converted.
 */
private static function timeHelper( $date, $time, $c ) {
	if ( count( $date ) !== 1 ) {
		// The standard says this should always be 1; just double checking.
		return null;
	}
	list( $dateValue ) = self::convIPTC( $date, $c );
	// convIPTC() does not trim on every code path, and the substr()
	// offsets below only line up if the value starts with the digits.
	$dateValue = trim( $dateValue );

	if ( count( $time ) === 1 ) {
		list( $timeValue ) = self::convIPTC( $time, $c );
		$timeValue = trim( $timeValue );
		$dateOnly = false;
	} else {
		$timeValue = '000000+0000'; // placeholder
		$dateOnly = true;
	}

	// Both patterns are anchored at the start because everything below
	// slices the strings at fixed offsets from position 0.
	$wellFormed = preg_match( '/^\d\d\d\d\d\d[-+]\d\d\d\d/', $timeValue )
		&& preg_match( '/^\d\d\d\d\d\d\d\d/', $dateValue )
		&& substr( $dateValue, 0, 4 ) !== '0000'
		&& substr( $dateValue, 4, 2 ) !== '00'
		&& substr( $dateValue, 6, 2 ) !== '00';

	if ( !$wellFormed ) {
		// Something is wrong.
		// Note, this rejects some valid dates according to the iptc spec,
		// for example: the date 00000400 means the photo was taken in
		// April, but the year and day are unknown. We don't process these
		// types of incomplete dates atm.
		wfDebugLog( 'iptc', "IPTC: invalid time ( $timeValue ) or date ( $dateValue )" );

		return null;
	}

	$unixTS = wfTimestamp( TS_UNIX, $dateValue . substr( $timeValue, 0, 6 ) );
	if ( $unixTS === false ) {
		wfDebugLog( 'iptc', "IPTC: can't convert date to TS_UNIX: $dateValue $timeValue." );

		return null;
	}

	// Offset in seconds, taken from the HHMM part after the sign character.
	$tz = ( intval( substr( $timeValue, 7, 2 ) ) * 60 * 60 )
		+ ( intval( substr( $timeValue, 9, 2 ) ) * 60 );

	if ( substr( $timeValue, 6, 1 ) === '-' ) {
		$tz = -$tz;
	}

	// NOTE(review): the signed offset is added to the parsed time rather
	// than subtracted from it - confirm this is the intended direction.
	$finalTimestamp = wfTimestamp( TS_EXIF, $unixTS + $tz );
	if ( $finalTimestamp === false ) {
		wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . ( $unixTS + $tz ) );

		return null;
	}

	if ( $dateOnly ) {
		// No usable time tag: return only the first 10 characters (the date).
		return substr( $finalTimestamp, 0, 10 );
	}

	return $finalTimestamp;
}
418
/**
 * Convert the charset of an iptc value, or of every value in a list.
 *
 * A non-array argument is converted directly; for an array each element
 * is converted on its own and the keys are kept.
 *
 * @param string|array $data The iptc string, or an array of them
 * @param string $charset The charset
 *
 * @return string|array Converted value(s), same shape as $data
 */
private static function convIPTC( $data, $charset ) {
	if ( !is_array( $data ) ) {
		return self::convIPTCHelper( $data, $charset );
	}

	foreach ( $data as $key => $item ) {
		$data[$key] = self::convIPTCHelper( $item, $charset );
	}

	return $data;
}
437
/**
 * Convert a single iptc value to UTF-8. Helper for convIPTC().
 *
 * With a charset, the value is passed through iconv() and trimmed; a
 * failed conversion yields an empty string. Without a charset, the value
 * is returned as is when UtfNormal::quickIsNFCVerify() leaves it
 * untouched, and is otherwise converted as if it were Windows-1252.
 *
 * @param string $data The iptc string
 * @param string $charset The charset; empty if none was specified
 *
 * @return string
 */
private static function convIPTCHelper( $data, $charset ) {
	if ( !$charset ) {
		// Treat as utf-8 if it is valid utf-8, otherwise pretend it is
		// windows-1252. Most of the time if there is no 1:90 tag, the
		// content is either ascii, latin1, or utf-8.
		$verified = $data;
		UtfNormal::quickIsNFCVerify( $verified ); // makes $verified valid utf-8
		if ( $verified === $data ) {
			// Validation didn't change anything.
			return $data;
		}

		return self::convIPTCHelper( $data, 'Windows-1252' );
	}

	wfSuppressWarnings();
	$converted = iconv( $charset, "UTF-8//IGNORE", $data );
	wfRestoreWarnings();

	if ( $converted === false ) {
		$converted = "";
		wfDebugLog( 'iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8" );
	}

	return trim( $converted );
}
468
/**
 * Take the value of the 1:90 tag and return a charset name.
 *
 * Warning, this function does not (and is not intended to) detect
 * all iso 2022 escape codes. In practise, the code for utf-8 is the
 * only code that seems to have wide use. It does detect that code.
 *
 * @param string $tag 1:90 tag.
 * @return string|bool Charset name, or false if the escape sequence is
 *   not one of the sequences listed here.
 */
static function getCharset( $tag ) {
	// According to the iim standard, the charset is defined by tag 1:90,
	// in which there are iso 2022 escape sequences to specify the character
	// set. The iim standard seems to encourage that all necessary escape
	// sequences are in the 1:90 tag, but says it doesn't have to be.
	//
	// This is in need of more testing probably. This is definitely not
	// complete. However, reading the docs of some other iptc software, it
	// appears that most iptc software only recognizes utf-8. If the 1:90 tag
	// is not present, content is usually ascii or iso-8859-1 (and sometimes
	// utf-8), but no guarantee.
	//
	// This also won't work if there is more than one escape sequence in the
	// 1:90 tag or if something is put in the G2 or G3 charsets, etc. It will
	// only reliably recognize utf-8.
	//
	// This is just going through the charsets mentioned in appendix C of the
	// iim standard.

	// \x1b = ESC.
	static $charsets = array(
		"\x1b%G" => 'UTF-8', // utf-8
		// Also call things that are compatible with utf-8, utf-8 (e.g. ascii)
		"\x1b(B" => 'UTF-8', // ascii
		"\x1b(@" => 'UTF-8', // iso-646-IRV (ascii in latest version, $ different in older version)
		"\x1b(A" => 'ISO646-GB', // like ascii, but british.
		"\x1b(C" => 'ISO-IR-8-1', // some obscure swedish/finnish encoding
		"\x1b(D" => 'ISO-IR-8-2',
		"\x1b(E" => 'ISO-IR-9-1', // some obscure danish/norwegian encoding
		"\x1b(F" => 'ISO-IR-9-2',
		"\x1b(G" => 'SEN_850200_B', // aka iso 646-SE; ascii-like
		"\x1b(I" => "ISO646-IT",
		"\x1b(L" => "ISO646-PT",
		"\x1b(Z" => "ISO646-ES",
		"\x1b([" => "GREEK7-OLD",
		"\x1b(K" => "ISO646-DE",
		"\x1b(N" => "ISO_5427", // cyrillic
		"\x1b(`" => "NS_4551-1", // iso646-NO
		"\x1b(f" => "NF_Z_62-010", // iso646-FR
		"\x1b(g" => "PT2", // iso646-PT2
		"\x1b(h" => "ES2",
		"\x1b(i" => "MSZ_7795.3", // iso646-HU
		"\x1b(w" => "CSA_Z243.4-1985-1",
		"\x1b(x" => "CSA_Z243.4-1985-2",
		"\x1b\$(B" => "JIS_C6226-1983",
		"\x1b\$B" => "JIS_C6226-1983",
		"\x1b&@\x1b\$B" => "JIS_C6226-1983",
		"\x1b&@\x1b\$(B" => "JIS_C6226-1983",
		// The iso-8859-* entries apply at least for the high code characters.
		"\x1b-A" => 'ISO-8859-1',
		"\x1b(@\x1b-A" => 'ISO-8859-1',
		"\x1b(B\x1b-A" => 'ISO-8859-1',
		"\x1b-B" => 'ISO-8859-2',
		"\x1b-C" => 'ISO-8859-3',
		"\x1b-D" => 'ISO-8859-4',
		"\x1b-E" => 'ISO-8859-5',
		"\x1b-F" => 'ISO-8859-6',
		"\x1b-G" => 'ISO-8859-7',
		"\x1b-H" => 'ISO-8859-8',
		"\x1b-I" => 'CSN_369103',
	);

	if ( isset( $charsets[$tag] ) ) {
		return $charsets[$tag];
	}

	// At this point just give up; the caller treats false as
	// "refuse to parse the iptc data".
	wfDebugLog( 'iptc', __METHOD__ . ' Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );

	return false;
}
601 }