Fix full-text indexing for files over the character limit

The error handler for files over the limit logged the correct message, but never actually added the full text to the index doc.

(cherry picked from commit 4a4a8bcb22)
This commit is contained in:
Brian Keese
2024-10-15 11:38:54 -05:00
committed by github-actions[bot]
parent 1abf89f9e7
commit fc4cf8f91f

View File

@@ -118,20 +118,10 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
ParseContext tikaContext = new ParseContext();
// Use Apache Tika to parse the full text stream(s)
boolean extractionSucceeded = false;
try (InputStream fullTextStreams = streams.getStream()) {
tikaParser.parse(fullTextStreams, tikaHandler, tikaMetadata, tikaContext);
// Write Tika metadata to "tika_meta_*" fields.
// This metadata is not very useful right now,
// but we'll keep it just in case it becomes more useful.
for (String name : tikaMetadata.names()) {
for (String value : tikaMetadata.getValues(name)) {
doc.addField("tika_meta_" + name, value);
}
}
// Save (parsed) full text to "fulltext" field
doc.addField("fulltext", tikaHandler.toString());
extractionSucceeded = true;
} catch (SAXException saxe) {
// Check if this SAXException is just a notice that this file was longer than the character limit.
// Unfortunately there is not a unique, public exception type to catch here. This error is thrown
@@ -141,6 +131,7 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
// log that we only indexed up to that configured limit
log.info("Full text is larger than the configured limit (discovery.solr.fulltext.charLimit)."
+ " Only the first {} characters were indexed.", charLimit);
extractionSucceeded = true;
} else {
log.error("Tika parsing error. Could not index full text.", saxe);
throw new IOException("Tika parsing error. Could not index full text.", saxe);
@@ -148,11 +139,19 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
} catch (TikaException | IOException ex) {
log.error("Tika parsing error. Could not index full text.", ex);
throw new IOException("Tika parsing error. Could not index full text.", ex);
} finally {
// Add document to index
solr.add(doc);
}
return;
if (extractionSucceeded) {
// Write Tika metadata to "tika_meta_*" fields.
// This metadata is not very useful right now,
// but we'll keep it just in case it becomes more useful.
for (String name : tikaMetadata.names()) {
for (String value : tikaMetadata.getValues(name)) {
doc.addField("tika_meta_" + name, value);
}
}
// Save (parsed) full text to "fulltext" field
doc.addField("fulltext", tikaHandler.toString());
}
}
// Add document to index
solr.add(doc);