mirror of
https://github.com/DSpace/DSpace.git
synced 2025-10-07 10:04:21 +00:00
[DS-1140] New POI-based MS Word extractor and some comment cleanup
This commit is contained in:
@@ -44,18 +44,20 @@ public interface FormatFilter
|
|||||||
public String getFormatString();
|
public String getFormatString();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return string to describe the newly-generated Bitstream's - how it was
|
* @return string to describe the newly-generated Bitstream - how it was
|
||||||
* produced is a good idea
|
* produced is a good idea
|
||||||
*/
|
*/
|
||||||
public String getDescription();
|
public String getDescription();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Read the source stream and produce the filtered content.
|
||||||
|
*
|
||||||
* @param item Item
|
* @param item Item
|
||||||
* @param source
|
* @param source
|
||||||
* input stream
|
* input stream
|
||||||
* @param verbose verbosity flag
|
* @param verbose verbosity flag
|
||||||
*
|
*
|
||||||
* @return result of filter's transformation, written out to a bitstream
|
* @return result of filter's transformation as a byte stream.
|
||||||
* @throws Exception if error
|
* @throws Exception if error
|
||||||
*/
|
*/
|
||||||
public InputStream getDestinationStream(Item item, InputStream source, boolean verbose)
|
public InputStream getDestinationStream(Item item, InputStream source, boolean verbose)
|
||||||
|
@@ -0,0 +1,89 @@
|
|||||||
|
/**
|
||||||
|
* The contents of this file are subject to the license and copyright
|
||||||
|
* detailed in the LICENSE and NOTICE files at the root of the source
|
||||||
|
* tree and available online at
|
||||||
|
*
|
||||||
|
* http://www.dspace.org/license/
|
||||||
|
*/
|
||||||
|
package org.dspace.app.mediafilter;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;

import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.dspace.content.Item;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract flat text from Microsoft Word documents (.doc, .docx).
|
||||||
|
*/
|
||||||
|
public class PoiWordFilter
|
||||||
|
extends MediaFilter
|
||||||
|
{
|
||||||
|
private static final Logger LOG = LoggerFactory.getLogger(PoiWordFilter.class);
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getFilteredName(String oldFilename)
|
||||||
|
{
|
||||||
|
return oldFilename + ".txt";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getBundleName()
|
||||||
|
{
|
||||||
|
return "TEXT";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getFormatString()
|
||||||
|
{
|
||||||
|
return "Text";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getDescription()
|
||||||
|
{
|
||||||
|
return "Extracted text";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public InputStream getDestinationStream(Item currentItem, InputStream source, boolean verbose)
|
||||||
|
throws Exception
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
// get input stream from bitstream, pass to filter, get string back
|
||||||
|
String text;
|
||||||
|
POITextExtractor extractor = ExtractorFactory.createExtractor(source);
|
||||||
|
if (extractor instanceof XWPFWordExtractor)
|
||||||
|
text = ((XWPFWordExtractor) extractor).getText();
|
||||||
|
else if (extractor instanceof WordExtractor)
|
||||||
|
text = ((WordExtractor) extractor).getText();
|
||||||
|
else
|
||||||
|
throw new IllegalArgumentException(
|
||||||
|
"Bitstream is neither .doc nor .docx format. Extractor returned a "
|
||||||
|
+ extractor.getClass().getCanonicalName());
|
||||||
|
|
||||||
|
// if verbose flag is set, print out extracted text to STDOUT
|
||||||
|
if (verbose)
|
||||||
|
{
|
||||||
|
System.out.println(text);
|
||||||
|
}
|
||||||
|
|
||||||
|
// return the extracted text as a stream.
|
||||||
|
return new ByteArrayInputStream(text.getBytes());
|
||||||
|
}
|
||||||
|
catch (IOException ioe)
|
||||||
|
{
|
||||||
|
System.out.println("Invalid File Format");
|
||||||
|
LOG.error("Error detected - Microsoft Word file format not recognized: "
|
||||||
|
+ ioe.getMessage(), ioe);
|
||||||
|
throw ioe;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Reference in New Issue
Block a user