Mirror of https://github.com/DSpace/DSpace.git (synced 2025-10-07 10:04:21 +00:00)
Merge pull request #10152 from DSpace/backport-9893-to-main
[Port main] Fix full-text indexing for files over the character limit
This commit is contained in:
@@ -118,20 +118,10 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
|
|||||||
ParseContext tikaContext = new ParseContext();
|
ParseContext tikaContext = new ParseContext();
|
||||||
|
|
||||||
// Use Apache Tika to parse the full text stream(s)
|
// Use Apache Tika to parse the full text stream(s)
|
||||||
|
boolean extractionSucceeded = false;
|
||||||
try (InputStream fullTextStreams = streams.getStream()) {
|
try (InputStream fullTextStreams = streams.getStream()) {
|
||||||
tikaParser.parse(fullTextStreams, tikaHandler, tikaMetadata, tikaContext);
|
tikaParser.parse(fullTextStreams, tikaHandler, tikaMetadata, tikaContext);
|
||||||
|
extractionSucceeded = true;
|
||||||
// Write Tika metadata to "tika_meta_*" fields.
|
|
||||||
// This metadata is not very useful right now,
|
|
||||||
// but we'll keep it just in case it becomes more useful.
|
|
||||||
for (String name : tikaMetadata.names()) {
|
|
||||||
for (String value : tikaMetadata.getValues(name)) {
|
|
||||||
doc.addField("tika_meta_" + name, value);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Save (parsed) full text to "fulltext" field
|
|
||||||
doc.addField("fulltext", tikaHandler.toString());
|
|
||||||
} catch (SAXException saxe) {
|
} catch (SAXException saxe) {
|
||||||
// Check if this SAXException is just a notice that this file was longer than the character limit.
|
// Check if this SAXException is just a notice that this file was longer than the character limit.
|
||||||
// Unfortunately there is not a unique, public exception type to catch here. This error is thrown
|
// Unfortunately there is not a unique, public exception type to catch here. This error is thrown
|
||||||
@@ -141,6 +131,7 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
|
|||||||
// log that we only indexed up to that configured limit
|
// log that we only indexed up to that configured limit
|
||||||
log.info("Full text is larger than the configured limit (discovery.solr.fulltext.charLimit)."
|
log.info("Full text is larger than the configured limit (discovery.solr.fulltext.charLimit)."
|
||||||
+ " Only the first {} characters were indexed.", charLimit);
|
+ " Only the first {} characters were indexed.", charLimit);
|
||||||
|
extractionSucceeded = true;
|
||||||
} else {
|
} else {
|
||||||
log.error("Tika parsing error. Could not index full text.", saxe);
|
log.error("Tika parsing error. Could not index full text.", saxe);
|
||||||
throw new IOException("Tika parsing error. Could not index full text.", saxe);
|
throw new IOException("Tika parsing error. Could not index full text.", saxe);
|
||||||
@@ -148,11 +139,19 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
|
|||||||
} catch (TikaException | IOException ex) {
|
} catch (TikaException | IOException ex) {
|
||||||
log.error("Tika parsing error. Could not index full text.", ex);
|
log.error("Tika parsing error. Could not index full text.", ex);
|
||||||
throw new IOException("Tika parsing error. Could not index full text.", ex);
|
throw new IOException("Tika parsing error. Could not index full text.", ex);
|
||||||
} finally {
|
|
||||||
// Add document to index
|
|
||||||
solr.add(doc);
|
|
||||||
}
|
}
|
||||||
return;
|
if (extractionSucceeded) {
|
||||||
|
// Write Tika metadata to "tika_meta_*" fields.
|
||||||
|
// This metadata is not very useful right now,
|
||||||
|
// but we'll keep it just in case it becomes more useful.
|
||||||
|
for (String name : tikaMetadata.names()) {
|
||||||
|
for (String value : tikaMetadata.getValues(name)) {
|
||||||
|
doc.addField("tika_meta_" + name, value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Save (parsed) full text to "fulltext" field
|
||||||
|
doc.addField("fulltext", tikaHandler.toString());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// Add document to index
|
// Add document to index
|
||||||
solr.add(doc);
|
solr.add(doc);
|
||||||
|
Reference in New Issue
Block a user