From: ThomasV Date: Mon, 20 Apr 2009 18:00:25 +0000 (+0000) Subject: extract text layer from djvu file (see bug 18046) X-Git-Tag: 1.31.0-rc.0~42093 X-Git-Url: http://git.cyclocoop.org/%24image?a=commitdiff_plain;h=3837ab94ce2d13ecac0cd2743dbf77ea307d5b35;p=lhc%2Fweb%2Fwiklou.git extract text layer from djvu file (see bug 18046) --- diff --git a/includes/DefaultSettings.php b/includes/DefaultSettings.php index b8f4426ffb..c541371589 100644 --- a/includes/DefaultSettings.php +++ b/includes/DefaultSettings.php @@ -1897,6 +1897,11 @@ $wgDiff3 = '/usr/bin/diff3'; */ $wgDiff = '/usr/bin/diff'; +/** + * Path to the GNU sed utility. + */ +$wgSed = '/bin/sed'; + /** * We can also compress text stored in the 'text' table. If this is set on, new * revisions will be compressed on page save if zlib support is available. Any @@ -3532,6 +3537,13 @@ $wgDjvuDump = null; # $wgDjvuRenderer = 'ddjvu'; $wgDjvuRenderer = null; +/** + * Path of the djvutxt DJVU text extraction utility + * Enable this and $wgDjvuDump to enable text layer extraction from djvu files + */ +# $wgDjvuTxt = 'djvutxt'; +$wgDjvuTxt = null; + /** * Path of the djvutoxml executable * This works like djvudump except much, much slower as of version 3.5. 
diff --git a/includes/media/DjVu.php b/includes/media/DjVu.php index 66e954d42c..0f282c2422 100644 --- a/includes/media/DjVu.php +++ b/includes/media/DjVu.php @@ -52,6 +52,8 @@ class DjVuHandler extends ImageHandler { $m = false; if ( preg_match( '/^page(\d+)-(\d+)px$/', $str, $m ) ) { return array( 'width' => $m[2], 'page' => $m[1] ); + } else if ( preg_match( '/^page(\d+)-djvutxt$/', $str, $m ) ) { + return array( 'djvutxt' => 1, 'page' => $m[1] ); } else { return false; } @@ -64,8 +66,22 @@ class DjVuHandler extends ImageHandler { ); } + + function normaliseParams( $image, &$params ) { + global $wgDjvuTxt; + if( $params['djvutxt'] && $wgDjvuTxt) { + if ( !isset( $params['page'] ) ) { + $params['page'] = 1; + } + $params['width'] = 0; + $params['height'] = 0; + return true; + } + else return parent::normaliseParams( $image, $params ); + } + function doTransform( $image, $dstPath, $dstUrl, $params, $flags = 0 ) { - global $wgDjvuRenderer, $wgDjvuPostProcessor; + global $wgDjvuRenderer, $wgDjvuPostProcessor, $wgDjvuTxt, $wgSed; // Fetch XML and check it, to give a more informative error message than the one which // normaliseParams will inevitably give. @@ -96,12 +112,22 @@ class DjVuHandler extends ImageHandler { # Use a subshell (brackets) to aggregate stderr from both pipeline commands # before redirecting it to the overall stdout. This works in both Linux and Windows XP. - $cmd = '(' . wfEscapeShellArg( $wgDjvuRenderer ) . " -format=ppm -page={$page} -size={$width}x{$height} " . - wfEscapeShellArg( $srcPath ); - if ( $wgDjvuPostProcessor ) { - $cmd .= " | {$wgDjvuPostProcessor}"; + + if( $params['djvutxt'] && $wgDjvuTxt && $wgSed ) { + #Read text from djvu + $cmd = '(' . wfEscapeShellArg( $wgDjvuTxt ) . " --page={$page} " . wfEscapeShellArg( $srcPath ); + #Escape < > & characters + $cmd .= ' | ' . wfEscapeShellArg( $wgSed ) . ' "s/\&/\&amp;/g ; s/</\&lt;/g ; s/>/\&gt;/g ; s/\"/\&quot;/g "'; + $cmd .= ' > ' . wfEscapeShellArg($dstPath) . ') 2>&1'; + } + else { + $cmd = '(' . 
wfEscapeShellArg( $wgDjvuRenderer ) . " -format=ppm -page={$page} -size={$width}x{$height} " . + wfEscapeShellArg( $srcPath ); + if ( $wgDjvuPostProcessor ) { + $cmd .= " | {$wgDjvuPostProcessor}"; + } + $cmd .= ' > ' . wfEscapeShellArg($dstPath) . ') 2>&1'; } - $cmd .= ' > ' . wfEscapeShellArg($dstPath) . ') 2>&1'; wfProfileIn( 'ddjvu' ); wfDebug( __METHOD__.": $cmd\n" ); $err = wfShellExec( $cmd, $retval );