Merge pull request #10152 from DSpace/backport-9893-to-main

[Port main] Fix full-text indexing for files over the character limit
This commit is contained in:
Tim Donohue
2024-12-19 14:29:13 -06:00
committed by GitHub

View File

@@ -118,20 +118,10 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
             ParseContext tikaContext = new ParseContext();
             // Use Apache Tika to parse the full text stream(s)
+            boolean extractionSucceeded = false;
             try (InputStream fullTextStreams = streams.getStream()) {
                 tikaParser.parse(fullTextStreams, tikaHandler, tikaMetadata, tikaContext);
+                extractionSucceeded = true;
-                // Write Tika metadata to "tika_meta_*" fields.
-                // This metadata is not very useful right now,
-                // but we'll keep it just in case it becomes more useful.
-                for (String name : tikaMetadata.names()) {
-                    for (String value : tikaMetadata.getValues(name)) {
-                        doc.addField("tika_meta_" + name, value);
-                    }
-                }
-                // Save (parsed) full text to "fulltext" field
-                doc.addField("fulltext", tikaHandler.toString());
             } catch (SAXException saxe) {
                 // Check if this SAXException is just a notice that this file was longer than the character limit.
                 // Unfortunately there is not a unique, public exception type to catch here. This error is thrown
@@ -141,6 +131,7 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
                     // log that we only indexed up to that configured limit
                     log.info("Full text is larger than the configured limit (discovery.solr.fulltext.charLimit)."
                                  + " Only the first {} characters were indexed.", charLimit);
+                    extractionSucceeded = true;
                 } else {
                     log.error("Tika parsing error. Could not index full text.", saxe);
                     throw new IOException("Tika parsing error. Could not index full text.", saxe);
@@ -148,11 +139,19 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
             } catch (TikaException | IOException ex) {
                 log.error("Tika parsing error. Could not index full text.", ex);
                 throw new IOException("Tika parsing error. Could not index full text.", ex);
-            } finally {
-                // Add document to index
-                solr.add(doc);
             }
-            return;
+            if (extractionSucceeded) {
+                // Write Tika metadata to "tika_meta_*" fields.
+                // This metadata is not very useful right now,
+                // but we'll keep it just in case it becomes more useful.
+                for (String name : tikaMetadata.names()) {
+                    for (String value : tikaMetadata.getValues(name)) {
+                        doc.addField("tika_meta_" + name, value);
+                    }
+                }
+                // Save (parsed) full text to "fulltext" field
+                doc.addField("fulltext", tikaHandler.toString());
+            }
         }
         // Add document to index
         solr.add(doc);