From 8ef37c2cee829e6aa5280cee5310e1e4af93f77e Mon Sep 17 00:00:00 2001
From: "Mark H. Wood"
Date: Wed, 11 Jun 2025 15:43:10 -0400
Subject: [PATCH] Also apply max-array when using temp files; cleanups.

---
Review note: DEFAULT_MAX_ARRAY is kept at 100_000_000, the same default that
the removed inline getIntProperty("textextractor.max-array", ...) call used,
so extracting the constant does not lower the buffer limit for installations
that leave textextractor.max-array unset.

 .../app/mediafilter/TikaTextExtractionFilter.java | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/dspace-api/src/main/java/org/dspace/app/mediafilter/TikaTextExtractionFilter.java b/dspace-api/src/main/java/org/dspace/app/mediafilter/TikaTextExtractionFilter.java
index 0e3a930e32..fd53fc3870 100644
--- a/dspace-api/src/main/java/org/dspace/app/mediafilter/TikaTextExtractionFilter.java
+++ b/dspace-api/src/main/java/org/dspace/app/mediafilter/TikaTextExtractionFilter.java
@@ -38,6 +38,8 @@ import org.xml.sax.SAXException;
 public class TikaTextExtractionFilter
     extends MediaFilter {
     private final static Logger log = LogManager.getLogger();
+    private static final int DEFAULT_MAX_CHARS = 100_000;
+    private static final int DEFAULT_MAX_ARRAY = 100_000_000;
 
     @Override
     public String getFilteredName(String oldFilename) {
@@ -71,15 +73,16 @@ public class TikaTextExtractionFilter
         }
 
         // Not using temporary file. We'll use Tika's default in-memory parsing.
-        // Get maximum characters to extract. Default is 100,000 chars, which is also Tika's default setting.
         String extractedText;
-        int maxChars = configurationService.getIntProperty("textextractor.max-chars", 100_000);
+        // Get maximum characters to extract. Default is 100,000 chars, which is also Tika's default setting.
+        int maxChars = configurationService.getIntProperty("textextractor.max-chars", DEFAULT_MAX_CHARS);
+        // Get maximum size of structure that Tika will try to buffer.
+        int maxArray = configurationService.getIntProperty("textextractor.max-array", DEFAULT_MAX_ARRAY);
+        IOUtils.setByteArrayMaxOverride(maxArray);
         try {
             // Use Tika to extract text from input. Tika will automatically detect the file type.
             Tika tika = new Tika();
             tika.setMaxStringLength(maxChars); // Tell Tika the maximum number of characters to extract
-            IOUtils.setByteArrayMaxOverride(
-                    configurationService.getIntProperty("textextractor.max-array", 100_000_000));
             extractedText = tika.parseToString(source);
         } catch (IOException e) {
             System.err.format("Unable to extract text from bitstream in Item %s%n", currentItem.getID().toString());
@@ -170,6 +173,10 @@ public class TikaTextExtractionFilter
             }
         });
 
+        ConfigurationService configurationService = DSpaceServicesFactory.getInstance().getConfigurationService();
+        int maxArray = configurationService.getIntProperty("textextractor.max-array", DEFAULT_MAX_ARRAY);
+        IOUtils.setByteArrayMaxOverride(maxArray);
+
         AutoDetectParser parser = new AutoDetectParser();
         Metadata metadata = new Metadata();
         // parse our source InputStream using the above custom handler