[DS-1140] New POI-based MS Word extractor and some comment cleanup

This commit is contained in:
Mark H. Wood
2017-02-28 17:12:23 -05:00
parent 3a03e7a9d3
commit 24c1f5367c
2 changed files with 93 additions and 2 deletions

View File

@@ -44,18 +44,20 @@ public interface FormatFilter
public String getFormatString();
/**
* @return string to describe the newly-generated Bitstream's - how it was
* @return string to describe the newly-generated Bitstream - how it was
* produced is a good idea
*/
public String getDescription();
/**
* Read the source stream and produce the filtered content.
*
* @param item Item
* @param source
* input stream
* @param verbose verbosity flag
*
* @return result of filter's transformation, written out to a bitstream
* @return result of filter's transformation as a byte stream.
* @throws Exception if error
*/
public InputStream getDestinationStream(Item item, InputStream source, boolean verbose)

View File

@@ -0,0 +1,89 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.app.mediafilter;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.dspace.content.Item;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Extract flat text from Microsoft Word documents (.doc, .docx) using
 * Apache POI.
 *
 * <p>The extracted text is returned as a UTF-8 encoded byte stream so that
 * the result does not depend on the platform default character set.
 */
public class PoiWordFilter
        extends MediaFilter
{
    private static final Logger LOG = LoggerFactory.getLogger(PoiWordFilter.class);

    /**
     * @param oldFilename name of the source bitstream.
     * @return the source name with ".txt" appended.
     */
    @Override
    public String getFilteredName(String oldFilename)
    {
        return oldFilename + ".txt";
    }

    /**
     * @return name of the bundle for the extracted text: "TEXT".
     */
    @Override
    public String getBundleName()
    {
        return "TEXT";
    }

    /**
     * @return format name of the extracted text: "Text".
     */
    @Override
    public String getFormatString()
    {
        return "Text";
    }

    /**
     * @return description of the generated bitstream.
     */
    @Override
    public String getDescription()
    {
        return "Extracted text";
    }

    /**
     * Read a Word document from the source stream and produce its plain text.
     *
     * @param currentItem the Item being filtered (not used by this filter).
     * @param source stream from which the Word document is read.
     * @param verbose if true, the extracted text is also printed to standard
     *                output.
     * @return the extracted text as a stream of UTF-8 encoded bytes.
     * @throws IllegalArgumentException if POI recognizes the content but it is
     *                                  neither a .doc nor a .docx document.
     * @throws IOException if the content cannot be read.
     * @throws Exception any other failure raised by the POI extractor factory.
     */
    @Override
    public InputStream getDestinationStream(Item currentItem, InputStream source, boolean verbose)
            throws Exception
    {
        String text;

        // The extractor holds the parsed document, so release it on every
        // path (success, unsupported type, or failure) once the text is out.
        try (POITextExtractor extractor = ExtractorFactory.createExtractor(source))
        {
            boolean isWord = (extractor instanceof XWPFWordExtractor)
                    || (extractor instanceof WordExtractor);
            if (!isWord)
            {
                throw new IllegalArgumentException(
                        "Bitstream is neither .doc nor .docx format. Extractor returned a "
                        + extractor.getClass().getCanonicalName());
            }

            // getText() is declared on POITextExtractor; no cast is required.
            text = extractor.getText();
        }
        catch (IOException ioe)
        {
            // NOTE(review): this also receives an IOException raised while
            // closing the extractor, not only format-detection failures.
            System.out.println("Invalid File Format");
            LOG.error("Error detected - Microsoft Word file format not recognized: "
                    + ioe.getMessage(), ioe);
            throw ioe;
        }

        // if verbose flag is set, print out extracted text to STDOUT
        if (verbose)
        {
            System.out.println(text);
        }

        // Encode explicitly:  String.getBytes() with no argument would use the
        // platform default charset and could corrupt non-ASCII text.
        return new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8));
    }
}