Merge pull request #10152 from DSpace/backport-9893-to-main

[Port main] Fix full-text indexing for files over the character limit
This commit is contained in:
Tim Donohue
2024-12-19 14:29:13 -06:00
committed by GitHub

View File

@@ -118,20 +118,10 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
             ParseContext tikaContext = new ParseContext();
             // Use Apache Tika to parse the full text stream(s)
+            boolean extractionSucceeded = false;
             try (InputStream fullTextStreams = streams.getStream()) {
                 tikaParser.parse(fullTextStreams, tikaHandler, tikaMetadata, tikaContext);
+                extractionSucceeded = true;
-                // Write Tika metadata to "tika_meta_*" fields.
-                // This metadata is not very useful right now,
-                // but we'll keep it just in case it becomes more useful.
-                for (String name : tikaMetadata.names()) {
-                    for (String value : tikaMetadata.getValues(name)) {
-                        doc.addField("tika_meta_" + name, value);
-                    }
-                }
-                // Save (parsed) full text to "fulltext" field
-                doc.addField("fulltext", tikaHandler.toString());
             } catch (SAXException saxe) {
                 // Check if this SAXException is just a notice that this file was longer than the character limit.
                 // Unfortunately there is not a unique, public exception type to catch here. This error is thrown
@@ -141,6 +131,7 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
                     // log that we only indexed up to that configured limit
                     log.info("Full text is larger than the configured limit (discovery.solr.fulltext.charLimit)."
                                  + " Only the first {} characters were indexed.", charLimit);
+                    extractionSucceeded = true;
                 } else {
                     log.error("Tika parsing error. Could not index full text.", saxe);
                     throw new IOException("Tika parsing error. Could not index full text.", saxe);
@@ -148,11 +139,19 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
             } catch (TikaException | IOException ex) {
                 log.error("Tika parsing error. Could not index full text.", ex);
                 throw new IOException("Tika parsing error. Could not index full text.", ex);
-            } finally {
-                // Add document to index
-                solr.add(doc);
             }
-            return;
+            if (extractionSucceeded) {
+                // Write Tika metadata to "tika_meta_*" fields.
+                // This metadata is not very useful right now,
+                // but we'll keep it just in case it becomes more useful.
+                for (String name : tikaMetadata.names()) {
+                    for (String value : tikaMetadata.getValues(name)) {
+                        doc.addField("tika_meta_" + name, value);
+                    }
+                }
+                // Save (parsed) full text to "fulltext" field
+                doc.addField("fulltext", tikaHandler.toString());
+            }
         }
         // Add document to index
         solr.add(doc);