mirror of
https://github.com/DSpace/DSpace.git
synced 2025-10-07 01:54:22 +00:00
Merge pull request #10895 from mwoodiupui/9733-8
[Port dspace-8_x] filter-media: make POI record buffer size adjustable.
This commit is contained in:
@@ -18,6 +18,7 @@ import java.nio.charset.StandardCharsets;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.apache.poi.util.IOUtils;
|
||||
import org.apache.tika.Tika;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
@@ -37,6 +38,8 @@ import org.xml.sax.SAXException;
|
||||
public class TikaTextExtractionFilter
|
||||
extends MediaFilter {
|
||||
private final static Logger log = LogManager.getLogger();
|
||||
private static final int DEFAULT_MAX_CHARS = 100_000;
|
||||
private static final int DEFAULT_MAX_ARRAY = 100_000_000;
|
||||
|
||||
@Override
|
||||
public String getFilteredName(String oldFilename) {
|
||||
@@ -70,9 +73,12 @@ public class TikaTextExtractionFilter
|
||||
}
|
||||
|
||||
// Not using temporary file. We'll use Tika's default in-memory parsing.
|
||||
// Get maximum characters to extract. Default is 100,000 chars, which is also Tika's default setting.
|
||||
String extractedText;
|
||||
int maxChars = configurationService.getIntProperty("textextractor.max-chars", 100000);
|
||||
// Get maximum characters to extract. Default is 100,000 chars, which is also Tika's default setting.
|
||||
int maxChars = configurationService.getIntProperty("textextractor.max-chars", DEFAULT_MAX_CHARS);
|
||||
// Get maximum size of structure that Tika will try to buffer.
|
||||
int maxArray = configurationService.getIntProperty("textextractor.max-array", DEFAULT_MAX_ARRAY);
|
||||
IOUtils.setByteArrayMaxOverride(maxArray);
|
||||
try {
|
||||
// Use Tika to extract text from input. Tika will automatically detect the file type.
|
||||
Tika tika = new Tika();
|
||||
@@ -80,13 +86,13 @@ public class TikaTextExtractionFilter
|
||||
extractedText = tika.parseToString(source);
|
||||
} catch (IOException e) {
|
||||
System.err.format("Unable to extract text from bitstream in Item %s%n", currentItem.getID().toString());
|
||||
e.printStackTrace();
|
||||
e.printStackTrace(System.err);
|
||||
log.error("Unable to extract text from bitstream in Item {}", currentItem.getID().toString(), e);
|
||||
throw e;
|
||||
} catch (OutOfMemoryError oe) {
|
||||
System.err.format("OutOfMemoryError occurred when extracting text from bitstream in Item %s. " +
|
||||
"You may wish to enable 'textextractor.use-temp-file'.%n", currentItem.getID().toString());
|
||||
oe.printStackTrace();
|
||||
oe.printStackTrace(System.err);
|
||||
log.error("OutOfMemoryError occurred when extracting text from bitstream in Item {}. " +
|
||||
"You may wish to enable 'textextractor.use-temp-file'.", currentItem.getID().toString(), oe);
|
||||
throw oe;
|
||||
@@ -167,6 +173,10 @@ public class TikaTextExtractionFilter
|
||||
}
|
||||
});
|
||||
|
||||
ConfigurationService configurationService = DSpaceServicesFactory.getInstance().getConfigurationService();
|
||||
int maxArray = configurationService.getIntProperty("textextractor.max-array", DEFAULT_MAX_ARRAY);
|
||||
IOUtils.setByteArrayMaxOverride(maxArray);
|
||||
|
||||
AutoDetectParser parser = new AutoDetectParser();
|
||||
Metadata metadata = new Metadata();
|
||||
// parse our source InputStream using the above custom handler
|
||||
|
@@ -523,6 +523,13 @@ filter.org.dspace.app.mediafilter.PDFBoxThumbnail.inputFormats = Adobe PDF
|
||||
# text ("filter-media -f") and then reindex your site ("index-discovery -b").
|
||||
#textextractor.use-temp-file = false
|
||||
|
||||
# Maximum size of a record buffer for text extraction. Set this if you are
|
||||
# seeing RecordFormatException calling out excessive array length from
|
||||
# 'dspace filter-media'. It is likely that you will need to increase the
|
||||
# size of the Java heap if you greatly increase this value -- see JAVA_OPTS
|
||||
# in 'bin/dspace' or 'bin/dspace.bat'.
|
||||
#textextractor.max-array = 100000000
|
||||
|
||||
# Custom settings for ImageMagick Thumbnail Filters
|
||||
# ImageMagick and GhostScript must be installed on the server; set the path to the ImageMagick and GhostScript executables.
|
||||
# http://www.imagemagick.org/
|
||||
|
Reference in New Issue
Block a user