Also apply the max-array setting when using temporary files; assorted cleanups.

This commit is contained in:
Mark H. Wood
2025-06-11 15:43:10 -04:00
parent 414ee7813a
commit 8ef37c2cee

View File

@@ -38,6 +38,8 @@ import org.xml.sax.SAXException;
public class TikaTextExtractionFilter public class TikaTextExtractionFilter
extends MediaFilter { extends MediaFilter {
private final static Logger log = LogManager.getLogger(); private final static Logger log = LogManager.getLogger();
private static final int DEFAULT_MAX_CHARS = 100_000;
private static final int DEFAULT_MAX_ARRAY = 1_000_000;
@Override @Override
public String getFilteredName(String oldFilename) { public String getFilteredName(String oldFilename) {
@@ -71,15 +73,16 @@ public class TikaTextExtractionFilter
} }
// Not using temporary file. We'll use Tika's default in-memory parsing. // Not using temporary file. We'll use Tika's default in-memory parsing.
// Get maximum characters to extract. Default is 100,000 chars, which is also Tika's default setting.
String extractedText; String extractedText;
int maxChars = configurationService.getIntProperty("textextractor.max-chars", 100_000); // Get maximum characters to extract. Default is 100,000 chars, which is also Tika's default setting.
int maxChars = configurationService.getIntProperty("textextractor.max-chars", DEFAULT_MAX_CHARS);
// Get maximum size of structure that Tika will try to buffer.
int maxArray = configurationService.getIntProperty("textextractor.max-array", DEFAULT_MAX_ARRAY);
IOUtils.setByteArrayMaxOverride(maxArray);
try { try {
// Use Tika to extract text from input. Tika will automatically detect the file type. // Use Tika to extract text from input. Tika will automatically detect the file type.
Tika tika = new Tika(); Tika tika = new Tika();
tika.setMaxStringLength(maxChars); // Tell Tika the maximum number of characters to extract tika.setMaxStringLength(maxChars); // Tell Tika the maximum number of characters to extract
IOUtils.setByteArrayMaxOverride(
configurationService.getIntProperty("textextractor.max-array", 100_000_000));
extractedText = tika.parseToString(source); extractedText = tika.parseToString(source);
} catch (IOException e) { } catch (IOException e) {
System.err.format("Unable to extract text from bitstream in Item %s%n", currentItem.getID().toString()); System.err.format("Unable to extract text from bitstream in Item %s%n", currentItem.getID().toString());
@@ -170,6 +173,10 @@ public class TikaTextExtractionFilter
} }
}); });
ConfigurationService configurationService = DSpaceServicesFactory.getInstance().getConfigurationService();
int maxArray = configurationService.getIntProperty("textextractor.max-array", DEFAULT_MAX_ARRAY);
IOUtils.setByteArrayMaxOverride(maxArray);
AutoDetectParser parser = new AutoDetectParser(); AutoDetectParser parser = new AutoDetectParser();
Metadata metadata = new Metadata(); Metadata metadata = new Metadata();
// parse our source InputStream using the above custom handler // parse our source InputStream using the above custom handler