mirror of
https://github.com/DSpace/DSpace.git
synced 2025-10-07 01:54:22 +00:00
Merge pull request #9893 from bkeese/patch-1
Fix full-text indexing for files over the character limit
This commit is contained in:
@@ -118,20 +118,10 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
|
||||
ParseContext tikaContext = new ParseContext();
|
||||
|
||||
// Use Apache Tika to parse the full text stream(s)
|
||||
boolean extractionSucceeded = false;
|
||||
try (InputStream fullTextStreams = streams.getStream()) {
|
||||
tikaParser.parse(fullTextStreams, tikaHandler, tikaMetadata, tikaContext);
|
||||
|
||||
// Write Tika metadata to "tika_meta_*" fields.
|
||||
// This metadata is not very useful right now,
|
||||
// but we'll keep it just in case it becomes more useful.
|
||||
for (String name : tikaMetadata.names()) {
|
||||
for (String value : tikaMetadata.getValues(name)) {
|
||||
doc.addField("tika_meta_" + name, value);
|
||||
}
|
||||
}
|
||||
|
||||
// Save (parsed) full text to "fulltext" field
|
||||
doc.addField("fulltext", tikaHandler.toString());
|
||||
extractionSucceeded = true;
|
||||
} catch (SAXException saxe) {
|
||||
// Check if this SAXException is just a notice that this file was longer than the character limit.
|
||||
// Unfortunately there is not a unique, public exception type to catch here. This error is thrown
|
||||
@@ -141,6 +131,7 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
|
||||
// log that we only indexed up to that configured limit
|
||||
log.info("Full text is larger than the configured limit (discovery.solr.fulltext.charLimit)."
|
||||
+ " Only the first {} characters were indexed.", charLimit);
|
||||
extractionSucceeded = true;
|
||||
} else {
|
||||
log.error("Tika parsing error. Could not index full text.", saxe);
|
||||
throw new IOException("Tika parsing error. Could not index full text.", saxe);
|
||||
@@ -148,11 +139,19 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
|
||||
} catch (TikaException | IOException ex) {
|
||||
log.error("Tika parsing error. Could not index full text.", ex);
|
||||
throw new IOException("Tika parsing error. Could not index full text.", ex);
|
||||
} finally {
|
||||
// Add document to index
|
||||
solr.add(doc);
|
||||
}
|
||||
return;
|
||||
if (extractionSucceeded) {
|
||||
// Write Tika metadata to "tika_meta_*" fields.
|
||||
// This metadata is not very useful right now,
|
||||
// but we'll keep it just in case it becomes more useful.
|
||||
for (String name : tikaMetadata.names()) {
|
||||
for (String value : tikaMetadata.getValues(name)) {
|
||||
doc.addField("tika_meta_" + name, value);
|
||||
}
|
||||
}
|
||||
// Save (parsed) full text to "fulltext" field
|
||||
doc.addField("fulltext", tikaHandler.toString());
|
||||
}
|
||||
}
|
||||
// Add document to index
|
||||
solr.add(doc);
|
||||
|
Reference in New Issue
Block a user