Merge pull request #1282 from mwoodiupui/DS-2159

[DS-2159] Deprecate XPDF in DSpace 5, remove in DSpace 6
This commit is contained in:
Mark H. Wood
2016-02-17 14:53:54 -05:00
4 changed files with 1 additions and 711 deletions

View File

@@ -1,166 +0,0 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.app.mediafilter;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;
import org.apache.log4j.Logger;
import org.dspace.content.Item;
import org.dspace.core.ConfigurationManager;
import org.dspace.core.Utils;
/**
* Text MediaFilter for PDF sources
*
* This filter produces extracted text suitable for building an index,
* but not for display to end users.
* It forks a process running the "pdftotext" program from the
* XPdf suite -- see http://www.foolabs.com/xpdf/
* This is a suite of open-source PDF tools that has been widely ported
* to Unix platforms and the ones we use (pdftoppm, pdftotext) even
* run on Win32.
*
* This was written for the FACADE project but it is not directly connected
* to any of the other FACADE-specific software. The FACADE UI expects
* to find thumbnail images for 3D PDFs generated by this filter.
*
* Requires DSpace config properties keys:
*
* xpdf.path.pdftotext -- path to "pdftotext" executable (required!)
*
* @author Larry Stone
* @see org.dspace.app.mediafilter.MediaFilter
*/
public class XPDF2Text extends MediaFilter
{
private static Logger log = Logger.getLogger(XPDF2Text.class);
// Command to get text from pdf; @infile@, @COMMAND@ are placeholders
protected static final String XPDF_PDFTOTEXT_COMMAND[] =
{
"@COMMAND@", "-q", "-enc", "UTF-8", "@infile@", "-"
};
// executable path that comes from DSpace config at runtime.
private String pdftotextPath = null;
@Override
public String getFilteredName(String oldFilename)
{
return oldFilename + ".txt";
}
@Override
public String getBundleName()
{
return "TEXT";
}
@Override
public String getFormatString()
{
return "Text";
}
@Override
public String getDescription()
{
return "Extracted Text";
}
@Override
public InputStream getDestinationStream(Item currentItem, InputStream sourceStream, boolean verbose)
throws Exception
{
// get configured value for path to XPDF command:
if (pdftotextPath == null)
{
pdftotextPath = ConfigurationManager.getProperty("xpdf.path.pdftotext");
if (pdftotextPath == null)
{
throw new IllegalStateException("No value for key \"xpdf.path.pdftotext\" in DSpace configuration! Should be path to XPDF pdftotext executable.");
}
}
File sourceTmp = File.createTempFile("DSfilt",".pdf");
sourceTmp.deleteOnExit(); // extra insurance, we'll delete it here.
int status = -1;
try
{
// make local temp copy of source PDF since PDF tools
// require a file for random access.
// XXX fixme could optimize if we ever get an interface to grab asset *files*
OutputStream sto = new FileOutputStream(sourceTmp);
Utils.copy(sourceStream, sto);
sto.close();
sourceStream.close();
String pdfCmd[] = XPDF_PDFTOTEXT_COMMAND.clone();
pdfCmd[0] = pdftotextPath;
pdfCmd[4] = sourceTmp.toString();
log.debug("Running command: "+Arrays.deepToString(pdfCmd));
Process pdfProc = Runtime.getRuntime().exec(pdfCmd);
InputStream stdout = pdfProc.getInputStream();
ByteArrayOutputStream baos = new ByteArrayOutputStream();
Utils.copy(new BufferedInputStream(stdout), baos);
stdout.close();
baos.close();
status = pdfProc.waitFor();
String msg = null;
if (status == 1)
{
msg = "pdftotext failed opening input: file=" + sourceTmp.toString();
}
else if (status == 3)
{
msg = "pdftotext permission failure (perhaps copying of text from this document is not allowed - check PDF file's internal permissions): file=" + sourceTmp.toString();
}
else if (status != 0)
{
msg = "pdftotext failed, maybe corrupt PDF? status=" + String.valueOf(status);
}
if (msg != null)
{
log.error(msg);
throw new IOException(msg);
}
return new ByteArrayInputStream(baos.toByteArray());
}
catch (InterruptedException e)
{
log.error("Failed in pdftotext subprocess: ",e);
throw e;
}
finally
{
if (!sourceTmp.delete())
{
log.error("Unable to delete temporary file");
}
if (status != 0)
{
log.error("PDF conversion proc failed, returns=" + status + ", file=" + sourceTmp);
}
}
}
}

View File

@@ -1,520 +0,0 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.app.mediafilter;
import java.awt.Graphics2D;
import java.awt.Color;
import java.awt.image.*;
import java.awt.RenderingHints;
import java.awt.Transparency;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.imageio.ImageIO;
import org.apache.log4j.Logger;
import org.dspace.content.Item;
import org.dspace.core.ConfigurationManager;
import org.dspace.core.Utils;
/**
* Thumbnail MediaFilter for PDF sources
*
* This filter generates thumbnail images for PDF documents, _including_
* 3D PDF documents with 2D "poster" images. Since the PDFBox library
* does not understand these, and fails to render a lot of other PDFs,
* this filter forks a process running the "pdftoppm" program from the
* XPdf suite -- see http://www.foolabs.com/xpdf/
* This is a suite of open-source PDF tools that has been widely ported
* to Unix platforms and the ones we use (pdftoppm, pdfinfo) even
* run on Win32.
*
* This was written for the FACADE project but it is not directly connected
* to any of the other FACADE-specific software. The FACADE UI expects
* to find thumbnail images for 3D PDFs generated by this filter.
*
* Requires DSpace config properties keys:
*
* xpdf.path.pdftoppm -- absolute path to "pdftoppm" executable (required!)
* xpdf.path.pdfinfo -- absolute path to "pdfinfo" executable (required!)
* thumbnail.maxwidth -- borrowed from thumbnails, max dim of generated image
*
* @author Larry Stone
* @see org.dspace.app.mediafilter.MediaFilter
*/
public class XPDF2Thumbnail extends MediaFilter
{
private static Logger log = Logger.getLogger(XPDF2Thumbnail.class);
// maximum size of either preview image dimension
protected static final int MAX_PX = 800;
// maxium DPI - use common screen res, 100dpi.
protected static final int MAX_DPI = 100;
// command to get image from PDF; @FILE@, @OUTPUT@ are placeholders
protected static final String XPDF_PDFTOPPM_COMMAND[] =
{
"@COMMAND@", "-q", "-f", "1", "-l", "1",
"-r", "@DPI@", "@FILE@", "@OUTPUTFILE@"
};
// command to get image from PDF; @FILE@, @OUTPUT@ are placeholders
protected static final String XPDF_PDFINFO_COMMAND[] =
{
"@COMMAND@", "-f", "1", "-l", "1", "-box", "@FILE@"
};
// executable path for "pdftoppm", comes from DSpace config at runtime.
protected String pdftoppmPath = null;
// executable path for "pdfinfo", comes from DSpace config at runtime.
protected String pdfinfoPath = null;
// match line in pdfinfo output that describes file's MediaBox
protected static final Pattern MEDIABOX_PATT = Pattern.compile(
"^Page\\s+\\d+\\s+MediaBox:\\s+([\\.\\d-]+)\\s+([\\.\\d-]+)\\s+([\\.\\d-]+)\\s+([\\.\\d-]+)");
// also from thumbnail.maxwidth in config
protected int xmax = 0;
// backup default for size, on the large side.
protected static final int DEFAULT_XMAX = 500;
@Override
public String getFilteredName(String oldFilename)
{
return oldFilename + ".jpg";
}
@Override
public String getBundleName()
{
return "THUMBNAIL";
}
@Override
public String getFormatString()
{
return "JPEG";
}
@Override
public String getDescription()
{
return "Generated Thumbnail";
}
// canonical MediaFilter method to generate the thumbnail as stream.
@Override
public InputStream getDestinationStream(Item currentItem, InputStream sourceStream, boolean verbose)
throws Exception
{
// get config params
float xmax = (float) ConfigurationManager
.getIntProperty("thumbnail.maxwidth");
float ymax = (float) ConfigurationManager
.getIntProperty("thumbnail.maxheight");
boolean blurring = (boolean) ConfigurationManager
.getBooleanProperty("thumbnail.blurring");
boolean hqscaling = (boolean) ConfigurationManager
.getBooleanProperty("thumbnail.hqscaling");
// sanity check: xpdf paths are required. can cache since it won't change
if (pdftoppmPath == null || pdfinfoPath == null)
{
pdftoppmPath = ConfigurationManager.getProperty("xpdf.path.pdftoppm");
pdfinfoPath = ConfigurationManager.getProperty("xpdf.path.pdfinfo");
if (pdftoppmPath == null)
{
throw new IllegalStateException("No value for key \"xpdf.path.pdftoppm\" in DSpace configuration! Should be path to XPDF pdftoppm executable.");
}
if (pdfinfoPath == null)
{
throw new IllegalStateException("No value for key \"xpdf.path.pdfinfo\" in DSpace configuration! Should be path to XPDF pdfinfo executable.");
}
if (xmax == 0)
{
xmax = DEFAULT_XMAX;
}
}
// make local file copy of source PDF since the PDF tools
// require a file for random access.
// XXX fixme would be nice to optimize this if we ever get
// XXX a DSpace method to access (optionally!) the _file_ of
// a Bitstream in the asset store, only when there is one of course.
File sourceTmp = File.createTempFile("DSfilt",".pdf");
sourceTmp.deleteOnExit();
int status = 0;
BufferedImage source = null;
try
{
OutputStream sto = new FileOutputStream(sourceTmp);
Utils.copy(sourceStream, sto);
sto.close();
sourceStream.close();
// First get max physical dim of bounding box of first page
// to compute the DPI to ask for.. otherwise some AutoCAD
// drawings can produce enormous files even at 75dpi, for
// 48" drawings..
// run pdfinfo, look for MediaBox description in the output, e.g.
// "Page 1 MediaBox: 0.00 0.00 612.00 792.00"
//
int dpi = 0;
String pdfinfoCmd[] = XPDF_PDFINFO_COMMAND.clone();
pdfinfoCmd[0] = pdfinfoPath;
pdfinfoCmd[pdfinfoCmd.length-1] = sourceTmp.toString();
BufferedReader lr = null;
try
{
MatchResult mediaBox = null;
Process pdfProc = Runtime.getRuntime().exec(pdfinfoCmd);
lr = new BufferedReader(new InputStreamReader(pdfProc.getInputStream()));
String line;
for (line = lr.readLine(); line != null; line = lr.readLine())
{
// if (line.matches(MEDIABOX_PATT))
Matcher mm = MEDIABOX_PATT.matcher(line);
if (mm.matches())
{
mediaBox = mm.toMatchResult();
}
}
int istatus = pdfProc.waitFor();
if (istatus != 0)
{
log.error("XPDF pdfinfo proc failed, exit status=" + istatus + ", file=" + sourceTmp);
}
if (mediaBox == null)
{
log.error("Sanity check: Did not find \"MediaBox\" line in output of XPDF pdfinfo, file="+sourceTmp);
throw new IllegalArgumentException("Failed to get MediaBox of PDF with pdfinfo, cannot compute thumbnail.");
}
else
{
double x0 = Double.parseDouble(mediaBox.group(1));
double y0 = Double.parseDouble(mediaBox.group(2));
double x1 = Double.parseDouble(mediaBox.group(3));
double y1 = Double.parseDouble(mediaBox.group(4));
int maxdim = (int)Math.max(Math.abs(x1 - x0), Math.abs(y1 - y0));
dpi = Math.min(MAX_DPI, (MAX_PX * 72 / maxdim));
log.debug("DPI: pdfinfo method got dpi="+dpi+" for max dim="+maxdim+" (points, 1/72\")");
}
}
catch (InterruptedException e)
{
log.error("Failed transforming file for preview: ",e);
throw new IllegalArgumentException("Failed transforming file for thumbnail: ",e);
}
catch (NumberFormatException e)
{
log.error("Failed interpreting pdfinfo results, check regexp: ",e);
throw new IllegalArgumentException("Failed transforming file for thumbnail: ",e);
}
finally
{
if (lr != null)
{
lr.close();
}
}
// Render page 1 using xpdf's pdftoppm
// Requires Sun JAI imageio additions to read ppm directly.
// this will get "-000001.ppm" appended to it by pdftoppm
File outPrefixF = File.createTempFile("prevu","out");
String outPrefix = outPrefixF.toString();
if (!outPrefixF.delete())
{
log.error("Unable to delete output file");
}
String pdfCmd[] = XPDF_PDFTOPPM_COMMAND.clone();
pdfCmd[0] = pdftoppmPath;
pdfCmd[pdfCmd.length-3] = String.valueOf(dpi);
pdfCmd[pdfCmd.length-2] = sourceTmp.toString();
pdfCmd[pdfCmd.length-1] = outPrefix;
File outf = new File(outPrefix+"-000001.ppm");
log.debug("Running xpdf command: "+Arrays.deepToString(pdfCmd));
try
{
Process pdfProc = Runtime.getRuntime().exec(pdfCmd);
status = pdfProc.waitFor();
if (!outf.exists()) outf = new File(outPrefix+"-00001.ppm");
if (!outf.exists()) outf = new File(outPrefix+"-0001.ppm");
if (!outf.exists()) outf = new File(outPrefix+"-001.ppm");
if (!outf.exists()) outf = new File(outPrefix+"-01.ppm");
if (!outf.exists()) outf = new File(outPrefix+"-1.ppm");
log.debug("PDFTOPPM output is: "+outf+", exists="+outf.exists());
source = ImageIO.read(outf);
}
catch (InterruptedException e)
{
log.error("Failed transforming file for preview: ",e);
throw new IllegalArgumentException("Failed transforming file for preview: ",e);
}
finally
{
if (!outf.delete())
{
log.error("Unable to delete file");
}
}
}
finally
{
if (!sourceTmp.delete())
{
log.error("Unable to delete temporary source");
}
if (status != 0)
{
log.error("PDF conversion proc failed, exit status=" + status + ", file=" + sourceTmp);
}
}
if (source == null)
{
throw new IOException("Unknown failure while transforming file to preview: no image produced.");
}
// read in bitstream's image
BufferedImage buf = source;
// now get the image dimensions
float xsize = (float) buf.getWidth(null);
float ysize = (float) buf.getHeight(null);
// if verbose flag is set, print out dimensions
// to STDOUT
if (verbose)
{
System.out.println("original size: " + xsize + "," + ysize);
}
// scale by x first if needed
if (xsize > xmax)
{
// calculate scaling factor so that xsize * scale = new size (max)
float scale_factor = xmax / xsize;
// if verbose flag is set, print out extracted text
// to STDOUT
if (verbose)
{
System.out.println("x scale factor: " + scale_factor);
}
// now reduce x size
// and y size
xsize = xsize * scale_factor;
ysize = ysize * scale_factor;
// if verbose flag is set, print out extracted text
// to STDOUT
if (verbose)
{
System.out.println("new size: " + xsize + "," + ysize);
}
}
// scale by y if needed
if (ysize > ymax)
{
float scale_factor = ymax / ysize;
// now reduce x size
// and y size
xsize = xsize * scale_factor;
ysize = ysize * scale_factor;
}
// if verbose flag is set, print details to STDOUT
if (verbose)
{
System.out.println("created thumbnail size: " + xsize + ", "
+ ysize);
}
// create an image buffer for the thumbnail with the new xsize, ysize
BufferedImage thumbnail = new BufferedImage((int) xsize, (int) ysize,
BufferedImage.TYPE_INT_RGB);
// Use blurring if selected in config.
// a little blur before scaling does wonders for keeping moire in check.
if (blurring)
{
// send the buffered image off to get blurred.
buf = getBlurredInstance((BufferedImage) buf);
}
// Use high quality scaling method if selected in config.
// this has a definite performance penalty.
if (hqscaling)
{
// send the buffered image off to get an HQ downscale.
buf = getScaledInstance((BufferedImage) buf, (int) xsize, (int) ysize,
(Object) RenderingHints.VALUE_INTERPOLATION_BICUBIC, (boolean) true);
}
// now render the image into the thumbnail buffer
Graphics2D g2d = thumbnail.createGraphics();
g2d.drawImage(buf, 0, 0, (int) xsize, (int) ysize, null);
// now create an input stream for the thumbnail buffer and return it
ByteArrayOutputStream baos = new ByteArrayOutputStream();
ImageIO.write(thumbnail, "jpeg", baos);
// now get the array
ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
return bais; // hope this gets written out before its garbage collected!
}
public String[] getInputMIMETypes()
{
return ImageIO.getReaderMIMETypes();
}
public String[] getInputDescriptions()
{
return null;
}
public String[] getInputExtensions()
{
// Temporarily disabled as JDK 1.6 only
// return ImageIO.getReaderFileSuffixes();
return null;
}
public BufferedImage getNormalizedInstance(BufferedImage buf)
{
int type = (buf.getTransparency() == Transparency.OPAQUE) ?
BufferedImage.TYPE_INT_RGB : BufferedImage.TYPE_INT_ARGB_PRE;
int w, h;
w = buf.getWidth();
h = buf.getHeight();
BufferedImage normal = new BufferedImage(w, h, type);
Graphics2D g2d = normal.createGraphics();
g2d.drawImage(buf, 0, 0, w, h, Color.WHITE, null);
g2d.dispose();
return normal;
}
public BufferedImage getBlurredInstance(BufferedImage buf)
{
/**
* Convenience method that returns a blurred instance of the
* provided {@code BufferedImage}.
*
*/
buf = getNormalizedInstance(buf);
// kernel for blur op
float[] matrix = {
0.111f, 0.111f, 0.111f,
0.111f, 0.111f, 0.111f,
0.111f, 0.111f, 0.111f,
};
// perform the blur and return the blurred version.
BufferedImageOp blur = new ConvolveOp( new Kernel(3, 3, matrix) );
BufferedImage blurbuf = blur.filter(buf, null);
return blurbuf;
}
/**
* Convenience method that returns a scaled instance of the
* provided {@code BufferedImage}.
*
* @param buf the original image to be scaled
* @param targetWidth the desired width of the scaled instance,
* in pixels
* @param targetHeight the desired height of the scaled instance,
* in pixels
* @param hint one of the rendering hints that corresponds to
* {@code RenderingHints.KEY_INTERPOLATION} (e.g.
* {@code RenderingHints.VALUE_INTERPOLATION_NEAREST_NEIGHBOR},
* {@code RenderingHints.VALUE_INTERPOLATION_BILINEAR},
* {@code RenderingHints.VALUE_INTERPOLATION_BICUBIC})
* @param higherQuality if true, this method will use a multi-step
* scaling technique that provides higher quality than the usual
* one-step technique (only useful in downscaling cases, where
* {@code targetWidth} or {@code targetHeight} is
* smaller than the original dimensions, and generally only when
* the {@code BILINEAR} hint is specified)
* @return a scaled version of the original {@code BufferedImage}
*/
public BufferedImage getScaledInstance(BufferedImage buf,
int targetWidth,
int targetHeight,
Object hint,
boolean higherQuality)
{
int type = (buf.getTransparency() == Transparency.OPAQUE) ?
BufferedImage.TYPE_INT_RGB : BufferedImage.TYPE_INT_ARGB;
BufferedImage scalebuf = (BufferedImage)buf;
int w, h;
if (higherQuality) {
// Use multi-step technique: start with original size, then
// scale down in multiple passes with drawImage()
// until the target size is reached
w = buf.getWidth();
h = buf.getHeight();
} else {
// Use one-step technique: scale directly from original
// size to target size with a single drawImage() call
w = targetWidth;
h = targetHeight;
}
do {
if (higherQuality && w > targetWidth) {
w /= 2;
if (w < targetWidth) {
w = targetWidth;
}
}
if (higherQuality && h > targetHeight) {
h /= 2;
if (h < targetHeight) {
h = targetHeight;
}
}
BufferedImage tmp = new BufferedImage(w, h, type);
Graphics2D g2d = tmp.createGraphics();
g2d.setRenderingHint(RenderingHints.KEY_INTERPOLATION, hint);
g2d.drawImage(scalebuf, 0, 0, w, h, Color.WHITE, null);
g2d.dispose();
scalebuf = tmp;
} while (w != targetWidth || h != targetHeight);
return scalebuf;
}
}

View File

@@ -388,7 +388,7 @@ filter.org.dspace.app.mediafilter.ExcelFilter.inputFormats = Microsoft Excel, Mi
#Publicly accessible thumbnails of restricted content.
#List the MediaFilter name's that would get publicly accessible permissions
#Any media filters not listed will instead inherit the permissions of the parent bitstream
#filter.org.dspace.app.mediafilter.publicPermission = JPEGFilter, XPDF2Thumbnail
#filter.org.dspace.app.mediafilter.publicPermission = JPEGFilter
#Custom settings for PDFFilter
# If true, all PDF extractions are written to temp files as they are indexed...this

View File

@@ -42,30 +42,6 @@
</dependency>
</dependencies>
</profile>
<!--
Using xpdf filters requires the inclusion
Of Sun dependencies these are not redistributable
and you will need to install them locally in your
maven repository prior to building your dspace instance
-->
<profile>
<id>xpdf-mediafilter-support</id>
<activation>
<activeByDefault>false</activeByDefault>
</activation>
<dependencies>
<dependency>
<groupId>com.sun.media</groupId>
<artifactId>jai_imageio</artifactId>
</dependency>
<dependency>
<groupId>javax.media</groupId>
<artifactId>jai_core</artifactId>
</dependency>
</dependencies>
</profile>
</profiles>
<!--