dspace-api: tell ImageMagick about the PDF CropBox

ImageMagick uses the MediaBox by default when rasterizing PDFs
because the PDF specification says that all PDFs *must* contain one.
This page box is the parent for all other boxes that a PDF *may*
contain, for example a CropBox, ArtBox, etc. In many cases these
are the same, but when they are not the CropBox is used to define
the area displayed to a user when they open the PDF on screen (as
opposed to when printing on paper).

If a PDF has a CropBox that is different to its MediaBox then we
should tell ImageMagick to use it.

Fixes: https://github.com/DSpace/DSpace/issues/8549
This commit is contained in:
Alan Orth
2022-11-27 13:27:50 +03:00
committed by Tim Donohue
parent 705f87a826
commit e6693f4232

View File

@@ -14,6 +14,9 @@ import java.io.InputStream;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException; import java.util.regex.PatternSyntaxException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.dspace.content.Bitstream; import org.dspace.content.Bitstream;
import org.dspace.content.Bundle; import org.dspace.content.Bundle;
import org.dspace.content.Item; import org.dspace.content.Item;
@@ -132,6 +135,26 @@ public abstract class ImageMagickThumbnailFilter extends MediaFilter {
op.density(Integer.valueOf(density)); op.density(Integer.valueOf(density));
} }
// Check the PDF's MediaBox and CropBox to see if they are the same.
// If not, then tell ImageMagick to use the CropBox when generating
// the thumbnail because the CropBox is generally used to define the
// area displayed when a user opens the PDF on a screen, whereas the
// MediaBox is used for print. Not all PDFs set these correctly, so
// we can use ImageMagick's default behavior unless we see an explicit
// CropBox. Note: we don't need to do anything special to detect if
// the CropBox is missing or empty because pdfbox will set it to the
// same size as the MediaBox if it doesn't exist. Also note that we
// only need to check the first page, since that's what we use for
// generating the thumbnail (PDDocument uses a zero-based index).
PDPage pdfPage = PDDocument.load(f).getPage(0);
PDRectangle pdfPageMediaBox = pdfPage.getMediaBox();
PDRectangle pdfPageCropBox = pdfPage.getCropBox();
// This option must come *before* we open the input file.
if (pdfPageCropBox != pdfPageMediaBox) {
op.define("pdf:use-cropbox=true");
}
String s = "[" + page + "]"; String s = "[" + page + "]";
op.addImage(f.getAbsolutePath() + s); op.addImage(f.getAbsolutePath() + s);
if (configurationService.getBooleanProperty(PRE + ".flatten", true)) { if (configurationService.getBooleanProperty(PRE + ".flatten", true)) {