mirror of
https://github.com/DSpace/DSpace.git
synced 2025-10-07 01:54:22 +00:00
Fix full-text indexing for files over the character limit
The error handler for files over the limit logged the correct message, but never actually added the full text to the index doc.
(cherry picked from commit 4a4a8bcb22
)
This commit is contained in:

committed by
github-actions[bot]
![github-actions[bot]](/assets/img/avatar_default.png)
parent
1abf89f9e7
commit
fc4cf8f91f
@@ -118,20 +118,10 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
|
||||
ParseContext tikaContext = new ParseContext();
|
||||
|
||||
// Use Apache Tika to parse the full text stream(s)
|
||||
boolean extractionSucceeded = false;
|
||||
try (InputStream fullTextStreams = streams.getStream()) {
|
||||
tikaParser.parse(fullTextStreams, tikaHandler, tikaMetadata, tikaContext);
|
||||
|
||||
// Write Tika metadata to "tika_meta_*" fields.
|
||||
// This metadata is not very useful right now,
|
||||
// but we'll keep it just in case it becomes more useful.
|
||||
for (String name : tikaMetadata.names()) {
|
||||
for (String value : tikaMetadata.getValues(name)) {
|
||||
doc.addField("tika_meta_" + name, value);
|
||||
}
|
||||
}
|
||||
|
||||
// Save (parsed) full text to "fulltext" field
|
||||
doc.addField("fulltext", tikaHandler.toString());
|
||||
extractionSucceeded = true;
|
||||
} catch (SAXException saxe) {
|
||||
// Check if this SAXException is just a notice that this file was longer than the character limit.
|
||||
// Unfortunately there is not a unique, public exception type to catch here. This error is thrown
|
||||
@@ -141,6 +131,7 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
|
||||
// log that we only indexed up to that configured limit
|
||||
log.info("Full text is larger than the configured limit (discovery.solr.fulltext.charLimit)."
|
||||
+ " Only the first {} characters were indexed.", charLimit);
|
||||
extractionSucceeded = true;
|
||||
} else {
|
||||
log.error("Tika parsing error. Could not index full text.", saxe);
|
||||
throw new IOException("Tika parsing error. Could not index full text.", saxe);
|
||||
@@ -148,11 +139,19 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
|
||||
} catch (TikaException | IOException ex) {
|
||||
log.error("Tika parsing error. Could not index full text.", ex);
|
||||
throw new IOException("Tika parsing error. Could not index full text.", ex);
|
||||
} finally {
|
||||
// Add document to index
|
||||
solr.add(doc);
|
||||
}
|
||||
return;
|
||||
if (extractionSucceeded) {
|
||||
// Write Tika metadata to "tika_meta_*" fields.
|
||||
// This metadata is not very useful right now,
|
||||
// but we'll keep it just in case it becomes more useful.
|
||||
for (String name : tikaMetadata.names()) {
|
||||
for (String value : tikaMetadata.getValues(name)) {
|
||||
doc.addField("tika_meta_" + name, value);
|
||||
}
|
||||
}
|
||||
// Save (parsed) full text to "fulltext" field
|
||||
doc.addField("fulltext", tikaHandler.toString());
|
||||
}
|
||||
}
|
||||
// Add document to index
|
||||
solr.add(doc);
|
||||
|
Reference in New Issue
Block a user