diff --git a/dspace-api/src/main/java/org/dspace/app/mediafilter/FormatFilter.java b/dspace-api/src/main/java/org/dspace/app/mediafilter/FormatFilter.java index 702f65a8a6..852a0c1070 100644 --- a/dspace-api/src/main/java/org/dspace/app/mediafilter/FormatFilter.java +++ b/dspace-api/src/main/java/org/dspace/app/mediafilter/FormatFilter.java @@ -44,18 +44,20 @@ public interface FormatFilter public String getFormatString(); /** - * @return string to describe the newly-generated Bitstream's - how it was + * @return string to describe the newly-generated Bitstream - how it was * produced is a good idea */ public String getDescription(); /** + * Read the source stream and produce the filtered content. + * * @param item Item * @param source * input stream * @param verbose verbosity flag * - * @return result of filter's transformation, written out to a bitstream + * @return result of filter's transformation as a byte stream. * @throws Exception if error */ public InputStream getDestinationStream(Item item, InputStream source, boolean verbose) diff --git a/dspace-api/src/main/java/org/dspace/app/mediafilter/PoiWordFilter.java b/dspace-api/src/main/java/org/dspace/app/mediafilter/PoiWordFilter.java new file mode 100644 index 0000000000..69a4ea651b --- /dev/null +++ b/dspace-api/src/main/java/org/dspace/app/mediafilter/PoiWordFilter.java @@ -0,0 +1,89 @@ +/** + * The contents of this file are subject to the license and copyright + * detailed in the LICENSE and NOTICE files at the root of the source + * tree and available online at + * + * http://www.dspace.org/license/ + */ +package org.dspace.app.mediafilter; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.io.IOException; + +import org.apache.poi.POITextExtractor; +import org.apache.poi.extractor.ExtractorFactory; +import org.apache.poi.hwpf.extractor.WordExtractor; +import org.apache.poi.xwpf.extractor.XWPFWordExtractor; +import org.dspace.content.Item; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Extract flat text from Microsoft Word documents (.doc, .docx). + */ +public class PoiWordFilter + extends MediaFilter +{ + private static final Logger LOG = LoggerFactory.getLogger(PoiWordFilter.class); + + @Override + public String getFilteredName(String oldFilename) + { + return oldFilename + ".txt"; + } + + @Override + public String getBundleName() + { + return "TEXT"; + } + + @Override + public String getFormatString() + { + return "Text"; + } + + @Override + public String getDescription() + { + return "Extracted text"; + } + + @Override + public InputStream getDestinationStream(Item currentItem, InputStream source, boolean verbose) + throws Exception + { + try + { + // get input stream from bitstream, pass to filter, get string back + String text; + POITextExtractor extractor = ExtractorFactory.createExtractor(source); + if (extractor instanceof XWPFWordExtractor) + text = ((XWPFWordExtractor) extractor).getText(); + else if (extractor instanceof WordExtractor) + text = ((WordExtractor) extractor).getText(); + else + throw new IllegalArgumentException( + "Bitstream is neither .doc nor .docx format. Extractor returned a " + + extractor.getClass().getCanonicalName()); + + // if verbose flag is set, print out extracted text to STDOUT + if (verbose) + { + System.out.println(text); + } + + // return the extracted text as a stream. + return new ByteArrayInputStream(text.getBytes()); + } + catch (IOException ioe) + { + System.out.println("Invalid File Format"); + LOG.error("Error detected - Microsoft Word file format not recognized: " + + ioe.getMessage(), ioe); + throw ioe; + } + } +}