extract djvu text (bug 18046); escape possible script with htmlspecialchars instead...
author: ThomasV <thomasv@users.mediawiki.org>
Wed, 29 Apr 2009 19:33:57 +0000 (19:33 +0000)
committer: ThomasV <thomasv@users.mediawiki.org>
Wed, 29 Apr 2009 19:33:57 +0000 (19:33 +0000)
includes/DefaultSettings.php
includes/media/DjVu.php

index 4d209bd..8342e46 100644 (file)
@@ -3534,6 +3534,13 @@ $wgDjvuDump = null;
 # $wgDjvuRenderer = 'ddjvu';
 $wgDjvuRenderer = null;
 
+/**
+ * Path of the djvutxt DJVU text extraction utility
+ * Enable this and $wgDjvuDump to enable text layer extraction from djvu files
+ */
+# $wgDjvuTxt = 'djvutxt';
+$wgDjvuTxt = null;
+
 /**
  * Path of the djvutoxml executable
  * This works like djvudump except much, much slower as of version 3.5.
index 66e954d..c2973f9 100644 (file)
@@ -52,6 +52,8 @@ class DjVuHandler extends ImageHandler {
                $m = false;
                if ( preg_match( '/^page(\d+)-(\d+)px$/', $str, $m ) ) {
                        return array( 'width' => $m[2], 'page' => $m[1] );
+               } else if ( preg_match( '/^page(\d+)-djvutxt$/', $str, $m ) ) {
+                       return array( 'djvutxt' => 1, 'page' => $m[1] );
                } else {
                        return false;
                }
@@ -64,8 +66,21 @@ class DjVuHandler extends ImageHandler {
                );
        }
 
+       /**
+        * Normalise transform parameters, special-casing text layer extraction.
+        *
+        * When the 'djvutxt' parameter is set and $wgDjvuTxt is configured, the
+        * request is for page text rather than an image: 'page' defaults to 1
+        * and 'width'/'height' are forced to 0. Every other request is handed
+        * to the parent implementation unchanged.
+        *
+        * @param $image Passed through to the parent implementation
+        * @param $params Array of transform parameters, modified in place
+        * @return true for a text extraction request, otherwise the result
+        *         of parent::normaliseParams()
+        */
+       function normaliseParams( $image, &$params ) {
+               global $wgDjvuTxt;
+               # Parameters parsed from an ordinary "pageN-Mpx" request carry no
+               # 'djvutxt' key, so test it with empty() rather than reading it
+               # directly, which would raise an undefined index notice.
+               if ( empty( $params['djvutxt'] ) || !$wgDjvuTxt ) {
+                       return parent::normaliseParams( $image, $params );
+               }
+               if ( !isset( $params['page'] ) ) {
+                       $params['page'] = 1;
+               }
+               # No image is rendered for a text request, so the size is zeroed.
+               $params['width'] = 0;
+               $params['height'] = 0;
+               return true;
+       }
+
        function doTransform( $image, $dstPath, $dstUrl, $params, $flags = 0 ) {
-               global $wgDjvuRenderer, $wgDjvuPostProcessor;
+               global $wgDjvuRenderer, $wgDjvuPostProcessor, $wgDjvuTxt;
 
                // Fetch XML and check it, to give a more informative error message than the one which
                // normaliseParams will inevitably give.
@@ -94,18 +109,36 @@ class DjVuHandler extends ImageHandler {
                        return new MediaTransformError( 'thumbnail_error', $width, $height, wfMsg( 'thumbnail_dest_directory' ) );
                }
 
-               # Use a subshell (brackets) to aggregate stderr from both pipeline commands
-               # before redirecting it to the overall stdout. This works in both Linux and Windows XP.
-               $cmd = '(' . wfEscapeShellArg( $wgDjvuRenderer ) . " -format=ppm -page={$page} -size={$width}x{$height} " .
-                       wfEscapeShellArg( $srcPath );
-               if ( $wgDjvuPostProcessor ) {
-                       $cmd .= " | {$wgDjvuPostProcessor}";
+               if( $params['djvutxt'] && $wgDjvuTxt ) {
+                       # Extract djvu text
+                       $cmd = wfEscapeShellArg( $wgDjvuTxt ) . " --page={$page} " . wfEscapeShellArg( $srcPath ) ;
+                       wfProfileIn( 'djvutxt' );
+                       wfDebug( __METHOD__.": $cmd\n" );
+                       $err = wfShellExec( $cmd, $retval );
+                       wfProfileOut( 'djvutxt' );
+                       # Escape html characters
+                       $txt = htmlspecialchars( $err );
+                       # Write result to file
+                       if($retval == 0) {
+                               $f = fopen($dstPath, 'w');
+                               fwrite($f, $txt);
+                               fclose($f);
+                       }
+               }
+               else {
+                       # Use a subshell (brackets) to aggregate stderr from both pipeline commands
+                       # before redirecting it to the overall stdout. This works in both Linux and Windows XP.
+                       $cmd = '(' . wfEscapeShellArg( $wgDjvuRenderer ) . " -format=ppm -page={$page} -size={$width}x{$height} " .
+                               wfEscapeShellArg( $srcPath );
+                       if ( $wgDjvuPostProcessor ) {
+                               $cmd .= " | {$wgDjvuPostProcessor}";
+                       }
+                       $cmd .= ' > ' . wfEscapeShellArg($dstPath) . ') 2>&1';
+                       wfProfileIn( 'ddjvu' );
+                       wfDebug( __METHOD__.": $cmd\n" );
+                       $err = wfShellExec( $cmd, $retval );
+                       wfProfileOut( 'ddjvu' );
                }
-               $cmd .= ' > ' . wfEscapeShellArg($dstPath) . ') 2>&1';
-               wfProfileIn( 'ddjvu' );
-               wfDebug( __METHOD__.": $cmd\n" );
-               $err = wfShellExec( $cmd, $retval );
-               wfProfileOut( 'ddjvu' );
 
                $removed = $this->removeBadFile( $dstPath, $retval );
                if ( $retval != 0 || $removed ) {