Merge pull request #1236 from KevinVdV/DS-2629-excel-media-filter

Ds 2629 excel media filter
This commit is contained in:
Kevin Van de Velde
2016-01-13 16:22:25 +01:00
2 changed files with 119 additions and 2 deletions

View File

@@ -0,0 +1,114 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.app.mediafilter;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.commons.io.IOUtils;
import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.log4j.Logger;
import org.dspace.content.Item;
/*
* ExcelFilter
*
* Entries you must add to dspace.cfg:
*
* filter.plugins = blah, \
* Excel Text Extractor
*
* plugin.named.org.dspace.app.mediafilter.FormatFilter = \
* blah = blah, \
* org.dspace.app.mediafilter.ExcelFilter = Excel Text Extractor
*
* #Configure each filter's input Formats
* filter.org.dspace.app.mediafilter.ExcelFilter.inputFormats = Microsoft Excel, Microsoft Excel XML
*
*/
public class ExcelFilter extends MediaFilter
{
private static Logger log = Logger.getLogger(ExcelFilter.class);
public String getFilteredName(String oldFilename)
{
return oldFilename + ".txt";
}
/**
* @return String bundle name
*
*/
public String getBundleName()
{
return "TEXT";
}
/**
* @return String bitstream format
*
*
*/
public String getFormatString()
{
return "Text";
}
/**
* @return String description
*/
public String getDescription()
{
return "Extracted text";
}
/**
* @param source
* source input stream
*
* @return InputStream the resulting input stream
*/
public InputStream getDestinationStream(Item item, InputStream source, boolean verbose)
throws Exception
{
String extractedText = null;
try
{
POITextExtractor theExtractor = ExtractorFactory.createExtractor(source);
if (theExtractor instanceof ExcelExtractor)
{
// for xls file
extractedText = (theExtractor).getText();
}
else if (theExtractor instanceof XSSFExcelExtractor)
{
// for xlsx file
extractedText = (theExtractor).getText();
}
}
catch (Exception e)
{
log.error("Error filtering bitstream: " + e.getMessage(), e);
throw e;
}
if (extractedText != null)
{
// generate an input stream with the extracted text
return IOUtils.toInputStream(extractedText, StandardCharsets.UTF_8);
}
return null;
}
}

View File

@@ -332,7 +332,8 @@ http.proxy.port = ${http.proxy.port}
#Names of the enabled MediaFilter or FormatFilter plugins
filter.plugins = PDF Text Extractor, HTML Text Extractor, \
PowerPoint Text Extractor, \
Word Text Extractor, JPEG Thumbnail
Word Text Extractor, JPEG Thumbnail, \
Excel Text Extractor
# [To enable Branded Preview]: uncomment and insert the following into the plugin list
# Branded Preview JPEG, \
@@ -351,7 +352,8 @@ plugin.named.org.dspace.app.mediafilter.FormatFilter = \
org.dspace.app.mediafilter.JPEGFilter = JPEG Thumbnail, \
org.dspace.app.mediafilter.BrandedPreviewJPEGFilter = Branded Preview JPEG, \
org.dspace.app.mediafilter.ImageMagickImageThumbnailFilter = ImageMagick Image Thumbnail, \
org.dspace.app.mediafilter.ImageMagickPdfThumbnailFilter = ImageMagick PDF Thumbnail
org.dspace.app.mediafilter.ImageMagickPdfThumbnailFilter = ImageMagick PDF Thumbnail, \
org.dspace.app.mediafilter.ExcelFilter = Excel Text Extractor
#Configure each filter's input format(s)
filter.org.dspace.app.mediafilter.PDFFilter.inputFormats = Adobe PDF
@@ -362,6 +364,7 @@ filter.org.dspace.app.mediafilter.JPEGFilter.inputFormats = BMP, GIF, JPEG, imag
filter.org.dspace.app.mediafilter.BrandedPreviewJPEGFilter.inputFormats = BMP, GIF, JPEG, image/png
filter.org.dspace.app.mediafilter.ImageMagickImageThumbnailFilter.inputFormats = BMP, GIF, image/png, JPG, TIFF, JPEG, JPEG 2000
filter.org.dspace.app.mediafilter.ImageMagickPdfThumbnailFilter.inputFormats = Adobe PDF
filter.org.dspace.app.mediafilter.ExcelFilter.inputFormats = Microsoft Excel, Microsoft Excel XML
#Publicly accessible thumbnails of restricted content.
#List the MediaFilter name's that would get publicly accessible permissions