Fix full-text indexing for files over the character limit

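The error handler for files over the character limit logged the correct message but never added the extracted text to the index document, so those files were indexed without any full text. Adding the full text is now done after the try/catch, so it also happens when the limit is reached. (cherry picked from commit 4a4a8bcb22)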
2025-10-07 01:54:22 +00:00 · 2024-10-15 11:38:54 -05:00
parent 1abf89f9e7
commit fc4cf8f91f
1 changed file with 15 additions and 16 deletions
--- a/dspace-api/src/main/java/org/dspace/discovery/indexobject/IndexFactoryImpl.java
+++ b/dspace-api/src/main/java/org/dspace/discovery/indexobject/IndexFactoryImpl.java
@@ -118,20 +118,10 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
                ParseContext tikaContext = new ParseContext();

                // Use Apache Tika to parse the full text stream(s)
+                boolean extractionSucceeded = false;
                try (InputStream fullTextStreams = streams.getStream()) {
                    tikaParser.parse(fullTextStreams, tikaHandler, tikaMetadata, tikaContext);
-
-                    // Write Tika metadata to "tika_meta_*" fields.
-                    // This metadata is not very useful right now,
-                    // but we'll keep it just in case it becomes more useful.
-                    for (String name : tikaMetadata.names()) {
-                        for (String value : tikaMetadata.getValues(name)) {
-                            doc.addField("tika_meta_" + name, value);
-                        }
-                    }
-
-                    // Save (parsed) full text to "fulltext" field
-                    doc.addField("fulltext", tikaHandler.toString());
+                    extractionSucceeded = true;
                } catch (SAXException saxe) {
                    // Check if this SAXException is just a notice that this file was longer than the character limit.
                    // Unfortunately there is not a unique, public exception type to catch here. This error is thrown
@@ -141,6 +131,7 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
                        // log that we only indexed up to that configured limit
                        log.info("Full text is larger than the configured limit (discovery.solr.fulltext.charLimit)."
                                + " Only the first {} characters were indexed.", charLimit);
+                        extractionSucceeded = true;
                    } else {
                        log.error("Tika parsing error. Could not index full text.", saxe);
                        throw new IOException("Tika parsing error. Could not index full text.", saxe);
@@ -148,11 +139,19 @@ public abstract class IndexFactoryImpl<T extends IndexableObject, S> implements
                } catch (TikaException | IOException ex) {
                    log.error("Tika parsing error. Could not index full text.", ex);
                    throw new IOException("Tika parsing error. Could not index full text.", ex);
-                } finally {
-                    // Add document to index
-                    solr.add(doc);
                }
-                return;
+                if (extractionSucceeded) {
+                    // Write Tika metadata to "tika_meta_*" fields.
+                    // This metadata is not very useful right now,
+                    // but we'll keep it just in case it becomes more useful.
+                    for (String name : tikaMetadata.names()) {
+                        for (String value : tikaMetadata.getValues(name)) {
+                            doc.addField("tika_meta_" + name, value);
+                        }
+                    }
+                    // Save (parsed) full text to "fulltext" field
+                    doc.addField("fulltext", tikaHandler.toString());
+                }
            }
            // Add document to index
            solr.add(doc);