From: ThomasV Date: Wed, 29 Apr 2009 19:33:57 +0000 (+0000) Subject: extract djvu text (bug 18046); escape possible script with htmlspecialchars instead... X-Git-Tag: 1.31.0-rc.0~41936 X-Git-Url: http://git.cyclocoop.org/%24self?a=commitdiff_plain;h=7a605cce46eb688dfb697487a1b1d300bef4361c;p=lhc%2Fweb%2Fwiklou.git extract djvu text (bug 18046); escape possible script with htmlspecialchars instead of sed --- diff --git a/includes/DefaultSettings.php b/includes/DefaultSettings.php index 4d209bdf25..8342e46908 100644 --- a/includes/DefaultSettings.php +++ b/includes/DefaultSettings.php @@ -3534,6 +3534,13 @@ $wgDjvuDump = null; # $wgDjvuRenderer = 'ddjvu'; $wgDjvuRenderer = null; +/** + * Path of the djvutxt DJVU text extraction utility + * Enable this and $wgDjvuDump to enable text layer extraction from djvu files + */ +# $wgDjvuTxt = 'djvutxt'; +$wgDjvuTxt = null; + /** * Path of the djvutoxml executable * This works like djvudump except much, much slower as of version 3.5. diff --git a/includes/media/DjVu.php b/includes/media/DjVu.php index 66e954d42c..c2973f9354 100644 --- a/includes/media/DjVu.php +++ b/includes/media/DjVu.php @@ -52,6 +52,8 @@ class DjVuHandler extends ImageHandler { $m = false; if ( preg_match( '/^page(\d+)-(\d+)px$/', $str, $m ) ) { return array( 'width' => $m[2], 'page' => $m[1] ); + } else if ( preg_match( '/^page(\d+)-djvutxt$/', $str, $m ) ) { + return array( 'djvutxt' => 1, 'page' => $m[1] ); } else { return false; } @@ -64,8 +66,21 @@ class DjVuHandler extends ImageHandler { ); } + function normaliseParams( $image, &$params ) { + global $wgDjvuTxt; + if( $params['djvutxt'] && $wgDjvuTxt) { + if ( !isset( $params['page'] ) ) { + $params['page'] = 1; + } + $params['width'] = 0; + $params['height'] = 0; + return true; + } + else return parent::normaliseParams( $image, $params ); + } + function doTransform( $image, $dstPath, $dstUrl, $params, $flags = 0 ) { - global $wgDjvuRenderer, $wgDjvuPostProcessor; + global $wgDjvuRenderer, $wgDjvuPostProcessor, $wgDjvuTxt; // Fetch XML and check it, to give a more informative error message than the one which // normaliseParams will inevitably give. @@ -94,18 +109,36 @@ class DjVuHandler extends ImageHandler { return new MediaTransformError( 'thumbnail_error', $width, $height, wfMsg( 'thumbnail_dest_directory' ) ); } - # Use a subshell (brackets) to aggregate stderr from both pipeline commands - # before redirecting it to the overall stdout. This works in both Linux and Windows XP. - $cmd = '(' . wfEscapeShellArg( $wgDjvuRenderer ) . " -format=ppm -page={$page} -size={$width}x{$height} " . - wfEscapeShellArg( $srcPath ); - if ( $wgDjvuPostProcessor ) { - $cmd .= " | {$wgDjvuPostProcessor}"; + if( $params['djvutxt'] && $wgDjvuTxt ) { + # Extract djvu text + $cmd = wfEscapeShellArg( $wgDjvuTxt ) . " --page={$page} " . wfEscapeShellArg( $srcPath ) ; + wfProfileIn( 'djvutxt' ); + wfDebug( __METHOD__.": $cmd\n" ); + $err = wfShellExec( $cmd, $retval ); + wfProfileOut( 'djvutxt' ); + # Escape html characters + $txt = htmlspecialchars( $err ); + # Write result to file + if($retval == 0) { + $f = fopen($dstPath, 'w'); + fwrite($f, $txt); + fclose($f); + } + } + else { + # Use a subshell (brackets) to aggregate stderr from both pipeline commands + # before redirecting it to the overall stdout. This works in both Linux and Windows XP. + $cmd = '(' . wfEscapeShellArg( $wgDjvuRenderer ) . " -format=ppm -page={$page} -size={$width}x{$height} " . + wfEscapeShellArg( $srcPath ); + if ( $wgDjvuPostProcessor ) { + $cmd .= " | {$wgDjvuPostProcessor}"; + } + $cmd .= ' > ' . wfEscapeShellArg($dstPath) . ') 2>&1'; + wfProfileIn( 'ddjvu' ); + wfDebug( __METHOD__.": $cmd\n" ); + $err = wfShellExec( $cmd, $retval ); + wfProfileOut( 'ddjvu' ); } - $cmd .= ' > ' . wfEscapeShellArg($dstPath) . ') 2>&1'; - wfProfileIn( 'ddjvu' ); - wfDebug( __METHOD__.": $cmd\n" ); - $err = wfShellExec( $cmd, $retval ); - wfProfileOut( 'ddjvu' ); $removed = $this->removeBadFile( $dstPath, $retval ); if ( $retval != 0 || $removed ) {