Applying SF patch #1589429 - "Self-Named" Media Filters (i.e. MediaFilter Plugins!)

[This original patch was modified slightly to leave the existing MediaFilter abstract class intact, 
and instead create a new FormatFilter interface which MediaFilter now implements]

git-svn-id: http://scm.dspace.org/svn/repo/branches/dspace-1_5_x@2393 9c30dcfa-912a-0410-8fc2-9e0234be79fd
This commit is contained in:
Tim Donohue
2007-11-27 22:03:24 +00:00
parent 133de48176
commit 2afd7ffefa
10 changed files with 694 additions and 220 deletions

View File

@@ -182,7 +182,7 @@ public class BrandedPreviewJPEGFilter extends MediaFilter
Brand brand = new Brand((int) xsize, brandHeight, new Font(brandFont, Font.PLAIN, brandFontPoint), 5);
BufferedImage brandImage = brand.create(ConfigurationManager.getProperty("webui.preview.brand"),
ConfigurationManager.getProperty("webui.preview.brand.abbrev"),
item == null ? "" : "hdl:" + item.getHandle());
MediaFilterManager.getCurrentItem() == null ? "" : "hdl:" + MediaFilterManager.getCurrentItem().getHandle());
g2d.drawImage(brandImage, (int)0, (int)ysize, (int) xsize, (int) 20, null);

View File

@@ -0,0 +1,135 @@
/*
* FormatFilter.java
*
* Version: $Revision: 1491 $
*
* Date: $Date: 2006-03-29 21:46:42 -0500 (Wed, 29 Mar 2006) $
*
* Copyright (c) 2002-2005, Hewlett-Packard Company and Massachusetts
* Institute of Technology. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* - Neither the name of the Hewlett-Packard Company nor the name of the
* Massachusetts Institute of Technology nor the names of their
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
package org.dspace.app.mediafilter;
import java.io.InputStream;
import org.dspace.content.Bitstream;
import org.dspace.content.Item;
import org.dspace.core.Context;
/**
 * Contract for any class which transforms or converts content/bitstreams
 * from one format to another. Every "filter" that is to be run by the
 * MediaFilterManager should implement this interface.
 */
public interface FormatFilter
{
    /**
     * Build the filename to give a newly created, filtered bitstream.
     *
     * @param sourceName
     *            name of the source bitstream
     * @return filename generated by the filter - for example,
     *         document.pdf becomes document.pdf.txt
     */
    String getFilteredName(String sourceName);

    /**
     * @return name of the bundle into which this filter places the
     *         Bitstreams it generates
     */
    String getBundleName();

    /**
     * @return name of the bitstream format (say "HTML" or "Microsoft Word")
     *         produced by this filter. Look in the bitstream format
     *         registry or mediafilter.cfg for valid format strings.
     */
    String getFormatString();

    /**
     * @return text describing the newly generated Bitstream - saying how
     *         it was produced is a good idea
     */
    String getDescription();

    /**
     * Run this filter's transformation over the given content.
     *
     * @param source
     *            input stream
     *
     * @return result of the filter's transformation, to be written out to
     *         a bitstream
     * @throws Exception
     *             declared so implementations may propagate any failure
     */
    InputStream getDestinationStream(InputStream source)
            throws Exception;

    /**
     * Hook invoked on the source bitstream *before* the actual filtering
     * takes place in MediaFilterManager.processBitstream().
     * <p>
     * Implementations return true when pre-processing succeeded (or none
     * was necessary) and false when the bitstream should be skipped for
     * any reason.
     *
     * @param c
     *            context
     * @param item
     *            item containing the bitstream to process
     * @param source
     *            source bitstream to be processed
     *
     * @return true if bitstream processing should continue,
     *         false if this bitstream should be skipped
     * @throws Exception
     *             declared so implementations may propagate any failure
     */
    boolean preProcessBitstream(Context c, Item item, Bitstream source)
            throws Exception;

    /**
     * Hook invoked on the generated bitstream *after* this filter has
     * already been run. Nothing is returned from this method.
     *
     * @param c
     *            context
     * @param item
     *            item containing the bitstream that was processed
     * @param generatedBitstream
     *            the bitstream which was generated by this filter
     * @throws Exception
     *             declared so implementations may propagate any failure
     */
    void postProcessBitstream(Context c, Item item, Bitstream generatedBitstream)
            throws Exception;
}

View File

@@ -39,156 +39,64 @@
*/
package org.dspace.app.mediafilter;
import java.io.InputStream;
import org.dspace.content.Bitstream;
import org.dspace.content.BitstreamFormat;
import org.dspace.content.Bundle;
import org.dspace.content.Item;
import org.dspace.core.Context;
public abstract class MediaFilter
/**
* Abstract class which defines the default settings for a *simple* Media or Format Filter.
* This class may be extended by any class which wishes to define a simple filter to be run
* by the MediaFilterManager. More complex filters should likely implement the FormatFilter
* interface directly, so that they can define their own pre/postProcessing methods.
*/
public abstract class MediaFilter implements FormatFilter
{
protected Item item = null;
/* To create your own filter, implement the following virtual methods */
/**
* Get a filename for a newly created filtered bitstream
/**
* Perform any pre-processing of the source bitstream *before* the actual
* filtering takes place in MediaFilterManager.processBitstream().
* <p>
* Return true if pre-processing is successful (or no pre-processing
* is necessary). Return false if bitstream should be skipped
* for any reason.
*
* @param sourceName
* name of source bitstream
* @return filename generated by the filter - for example, document.pdf
* becomes document.pdf.txt
*/
public abstract String getFilteredName(String sourceName);
/**
* @return name of the bundle this filter will stick its generated
* Bitstreams
*/
public abstract String getBundleName();
/**
* @return name of the bitstream format (say "HTML" or "Microsoft Word")
* returned by this filter look in the bitstream format registry or
* mediafilter.cfg for valid format strings.
*/
public abstract String getFormatString();
/**
* @return string to describe the newly-generated Bitstream's - how it was
* produced is a good idea
*/
public abstract String getDescription();
/**
* @param source
* input stream
*
* @return result of filter's transformation, written out to a bitstream
*/
public abstract InputStream getDestinationStream(InputStream source)
throws Exception;
/* end of methods you need to implement! */
/**
* processBitstream is a utility class that calls the above virtual methods -
* it is unlikely that you will need to override it. It scans the bitstreams
* in an item, and decides if a bitstream has already been filtered, and if
* not or if overWrite is set, invokes the filter.
*
* @param c
* context
* @param item
* item containing bitstream to process
* @param source
* source bitstream to process
* source bitstream to be processed
*
* @return true if new rendition is created, false if rendition already
* exists and overWrite is not set
* @return true if bitstream processing should continue,
* false if this bitstream should be skipped
*/
public boolean processBitstream(Context c, Item item, Bitstream source)
public boolean preProcessBitstream(Context c, Item item, Bitstream source)
throws Exception
{
boolean overWrite = MediaFilterManager.isForce;
this.item = item;
// get bitstream filename, calculate destination filename
String newName = getFilteredName(source.getName());
Bitstream existingBitstream = null; // is there an existing rendition?
Bundle targetBundle = null; // bundle we're modifying
Bundle[] bundles = item.getBundles(getBundleName());
// check if destination bitstream exists
if (bundles.length > 0)
{
// only finds the last match (FIXME?)
for (int i = 0; i < bundles.length; i++)
{
Bitstream[] bitstreams = bundles[i].getBitstreams();
for (int j = 0; j < bitstreams.length; j++)
{
if (bitstreams[j].getName().equals(newName))
{
targetBundle = bundles[i];
existingBitstream = bitstreams[j];
}
}
}
}
// if exists and overwrite = false, exit
if (!overWrite && (existingBitstream != null))
{
System.out.println("SKIPPED: bitstream " + source.getID()
+ " because '" + newName + "' already exists");
return false;
}
InputStream destStream = getDestinationStream(source.retrieve());
// create new bundle if needed
if (bundles.length < 1)
{
targetBundle = item.createBundle(getBundleName());
}
else
{
// take the first match
targetBundle = bundles[0];
}
Bitstream b = targetBundle.createBitstream(destStream);
// Now set the format and name of the bitstream
b.setName(newName);
b.setSource("Written by MediaFilter " + this.getClass().getName()); // or
// obj.getClass().getName();
b.setDescription(getDescription());
// Find the proper format
BitstreamFormat bf = BitstreamFormat.findByShortDescription(c,
getFormatString());
b.setFormat(bf);
b.update();
// fixme - set date?
// we are overwriting, so remove old bitstream
if (existingBitstream != null)
{
targetBundle.removeBitstream(existingBitstream);
}
System.out.println("FILTERED: bitstream " + source.getID()
+ " and created '" + newName + "'");
return true;
return true; //default to no pre-processing
}
/**
* Perform any post-processing of the generated bitstream *after* this
* filter has already been run.
* <p>
* This default implementation does nothing and returns no value.
* Subclasses may override it when the generated bitstream needs
* additional handling.
*
* @param c
* context
* @param item
* item containing bitstream to process
* @param generatedBitstream
* the bitstream which was generated by
* this filter.
* @throws Exception
* never thrown by this default implementation; declared so that
* overriding implementations may propagate failures
*/
public void postProcessBitstream(Context c, Item item, Bitstream generatedBitstream)
throws Exception
{
//default to no post-processing necessary
}
}

View File

@@ -40,22 +40,30 @@
package org.dspace.app.mediafilter;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Map;
import java.util.List;
import java.util.Arrays;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.MissingArgumentException;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.dspace.authorize.AuthorizeManager;
import org.dspace.content.Bitstream;
import org.dspace.content.BitstreamFormat;
import org.dspace.content.Bundle;
import org.dspace.content.Collection;
import org.dspace.content.Community;
import org.dspace.content.DCDate;
import org.dspace.content.DSpaceObject;
import org.dspace.content.Item;
import org.dspace.content.ItemIterator;
@@ -63,11 +71,12 @@ import org.dspace.core.ConfigurationManager;
import org.dspace.core.Constants;
import org.dspace.core.Context;
import org.dspace.core.PluginManager;
import org.dspace.core.SelfNamedPlugin;
import org.dspace.handle.HandleManager;
import org.dspace.search.DSIndexer;
/**
* MediaFilterManager is the class that invokes the media filters over the
* MediaFilterManager is the class that invokes the media/format filters over the
* repository's content. a few command line flags affect the operation of the
* MFM: -v verbose outputs all extracted text to STDOUT; -f force forces all
* bitstreams to be processed, even if they have been before; -n noindex does not
@@ -77,6 +86,15 @@ import org.dspace.search.DSIndexer;
*/
public class MediaFilterManager
{
//key (in dspace.cfg) which lists all enabled filters by name
public static String MEDIA_FILTER_PLUGINS_KEY = "filter.plugins";
//prefix (in dspace.cfg) for all filter properties
public static String FILTER_PREFIX = "filter";
//suffix (in dspace.cfg) for input formats supported by each filter
public static String INPUT_FORMATS_SUFFIX = "inputFormats";
public static boolean updateIndex = true; // default to updating index
public static boolean isVerbose = false; // default to not verbose
@@ -85,14 +103,20 @@ public class MediaFilterManager
public static String identifier = null; // object scope limiter
public static int max2Process = Integer.MAX_VALUE; // maximum number to process
public static int max2Process = Integer.MAX_VALUE; // maximum number items to process
public static int processed = 0; // number processed
public static int processed = 0; // number items processed
private static MediaFilter[] filterClasses = null;
private static Item currentItem = null; // current item being processed
private static FormatFilter[] filterClasses = null;
private static Map filterFormats = new HashMap();
//separator in filterFormats Map between a filter class name and a plugin name,
//for MediaFilters which extend SelfNamedPlugin (\034 is "file separator" char)
public static String FILTER_PLUGIN_SEPARATOR = "\034";
public static void main(String[] argv) throws Exception
{
// set headless for non-gui workstations
@@ -117,12 +141,35 @@ public class MediaFilterManager
"process no more than maximum items");
options.addOption("h", "help", false, "help");
CommandLine line = parser.parse(options, argv);
//create a "plugin" option (to specify specific MediaFilter plugins to run)
OptionBuilder.withLongOpt("plugins");
OptionBuilder.withValueSeparator(',');
OptionBuilder.withDescription(
"ONLY run the specified Media Filter plugin(s)\n" +
"listed from '" + MEDIA_FILTER_PLUGINS_KEY + "' in dspace.cfg.\n" +
"Separate multiple with a comma (,)\n" +
"(e.g. MediaFilterManager -p \n\"Word Text Extraction\",\"PDF Text Extraction\")");
Option pluginOption = OptionBuilder.create('p');
pluginOption.setArgs(Option.UNLIMITED_VALUES); //unlimited number of args
options.addOption(pluginOption);
CommandLine line = null;
try
{
line = parser.parse(options, argv);
}
catch(MissingArgumentException e)
{
System.out.println("ERROR: " + e.getMessage());
HelpFormatter myhelp = new HelpFormatter();
myhelp.printHelp("MediaFilterManager\n", options);
System.exit(1);
}
if (line.hasOption('h'))
{
HelpFormatter myhelp = new HelpFormatter();
myhelp.printHelp("MediaFilter\n", options);
myhelp.printHelp("MediaFilterManager\n", options);
System.exit(0);
}
@@ -158,20 +205,110 @@ public class MediaFilterManager
}
}
// set up filters
filterClasses =
(MediaFilter[])PluginManager.getPluginSequence(MediaFilter.class);
for (int i = 0; i < filterClasses.length; i++)
String filterNames[] = null;
if(line.hasOption('p'))
{
String filterName = filterClasses[i].getClass().getName();
String formats = ConfigurationManager.getProperty(
"filter." + filterName + ".inputFormats");
if (formats != null)
{
filterFormats.put(filterName, Arrays.asList(formats.split(",[\\s]*")));
}
}
//specified which media filter plugins we are using
filterNames = line.getOptionValues('p');
if(filterNames==null || filterNames.length==0)
{ //display error, since no plugins specified
System.err.println("\nERROR: -p (-plugin) option requires at least one plugin to be specified.\n" +
"(e.g. MediaFilterManager -p \"Word Text Extractor\",\"PDF Text Extractor\")\n");
HelpFormatter myhelp = new HelpFormatter();
myhelp.printHelp("MediaFilterManager\n", options);
System.exit(1);
}
}
else
{
//retrieve list of all enabled media filter plugins!
String enabledPlugins = ConfigurationManager.getProperty(MEDIA_FILTER_PLUGINS_KEY);
filterNames = enabledPlugins.split(",\\s*");
}
//initialize an array of our enabled filters
List filterList = new ArrayList();
//set up each filter
for(int i=0; i< filterNames.length; i++)
{
//get filter of this name & add to list of filters
FormatFilter filter = (FormatFilter) PluginManager.getNamedPlugin(FormatFilter.class, filterNames[i]);
if(filter==null)
{
System.err.println("\nERROR: Unknown MediaFilter specified (either from command-line or in dspace.cfg): '" + filterNames[i] + "'");
System.exit(1);
}
else
{
filterList.add(filter);
String filterClassName = filter.getClass().getName();
String pluginName = null;
//If this filter is a SelfNamedPlugin,
//then the input formats it accepts may differ for
//each "named" plugin that it defines.
//So, we have to look for every key that fits the
//following format: filter.<class-name>.<plugin-name>.inputFormats
if( SelfNamedPlugin.class.isAssignableFrom(filter.getClass()) )
{
//Get the plugin instance name for this class
pluginName = ((SelfNamedPlugin) filter).getPluginInstanceName();
}
//Retrieve our list of supported formats from dspace.cfg
//For SelfNamedPlugins, format of key is:
// filter.<class-name>.<plugin-name>.inputFormats
//For other MediaFilters, format of key is:
// filter.<class-name>.inputFormats
String formats = ConfigurationManager.getProperty(
FILTER_PREFIX + "." + filterClassName +
(pluginName!=null ? "." + pluginName : "") +
"." + INPUT_FORMATS_SUFFIX);
//add to internal map of filters to supported formats
if (formats != null)
{
//For SelfNamedPlugins, map key is:
// <class-name><separator><plugin-name>
//For other MediaFilters, map key is just:
// <class-name>
filterFormats.put(filterClassName +
(pluginName!=null ? FILTER_PLUGIN_SEPARATOR + pluginName : ""),
Arrays.asList(formats.split(",[\\s]*")));
}
}//end if filter!=null
}//end for
//If verbose, print out loaded mediafilter info
if(isVerbose)
{
System.out.println("The following MediaFilters are enabled: ");
java.util.Iterator i = filterFormats.keySet().iterator();
while(i.hasNext())
{
String filterName = (String) i.next();
System.out.println("Full Filter Name: " + filterName);
String pluginName = null;
if(filterName.contains(FILTER_PLUGIN_SEPARATOR))
{
String[] fields = filterName.split(FILTER_PLUGIN_SEPARATOR);
filterName=fields[0];
pluginName=fields[1];
}
System.out.println(filterName +
(pluginName!=null? " (Plugin: " + pluginName + ")": ""));
}
}
//store our filter list into an internal array
//(the array must be typed FormatFilter[], not MediaFilter[]: the list may hold
// filters which implement FormatFilter directly without extending MediaFilter,
// and storing those into a MediaFilter[] would throw an ArrayStoreException)
filterClasses = (FormatFilter[]) filterList.toArray(new FormatFilter[filterList.size()]);
Context c = null;
try
@@ -270,6 +407,11 @@ public class MediaFilterManager
public static void applyFiltersItem(Context c, Item item) throws Exception
{
//cache this item in MediaFilterManager
//so it can be accessed by MediaFilters as necessary
currentItem = item;
if (filterItem(c, item))
{
// commit changes after each filtered item
@@ -277,8 +419,9 @@ public class MediaFilterManager
// increment processed count
++processed;
}
// clear item objects from context cache
// clear item objects from context cache and internal cache
item.decache();
currentItem = null;
}
/**
@@ -325,14 +468,33 @@ public class MediaFilterManager
// by more than one filter
for (int i = 0; i < filterClasses.length; i++)
{
List fmts = (List)filterFormats.get(filterClasses[i].getClass().getName());
if (fmts.contains(myBitstream.getFormat().getShortDescription()))
//List fmts = (List)filterFormats.get(filterClasses[i].getClass().getName());
String pluginName = null;
//if this filter class is a SelfNamedPlugin,
//its list of supported formats is different for
//differently named "plugin"
if( SelfNamedPlugin.class.isAssignableFrom(filterClasses[i].getClass()) )
{
//get plugin instance name for this media filter
pluginName = ((SelfNamedPlugin)filterClasses[i]).getPluginInstanceName();
}
//Get list of supported formats for the filter (and possibly named plugin)
//For SelfNamedPlugins, map key is:
// <class-name><separator><plugin-name>
//For other MediaFilters, map key is just:
// <class-name>
List fmts = (List)filterFormats.get(filterClasses[i].getClass().getName() +
(pluginName!=null ? FILTER_PLUGIN_SEPARATOR + pluginName : ""));
if (fmts.contains(myBitstream.getFormat().getShortDescription()))
{
try
{
// only update item if bitstream not skipped
if (filterClasses[i].processBitstream(c, myItem, myBitstream))
{
if (processBitstream(c, myItem, myBitstream, filterClasses[i]))
{
myItem.update(); // Make sure new bitstream has a sequence
// number
filtered = true;
@@ -348,4 +510,135 @@ public class MediaFilterManager
}
return filtered;
}
/**
 * processBitstream is a utility method that runs a single FormatFilter
 * against a single source bitstream.
 * <p>
 * It first calls the filter's preProcessBitstream() hook and stops if that
 * returns false. It then looks through the item's bundles named by the
 * filter's getBundleName() for a bitstream whose name equals the filtered
 * name. If one exists and MediaFilterManager.isForce is not set, the source
 * bitstream is skipped. Otherwise the filter is invoked, the result is
 * stored as a new bitstream (in the first matching bundle, or in a newly
 * created bundle when none exists), the source bitstream's policies are
 * inherited, any previous rendition is removed, and the filter's
 * postProcessBitstream() hook is called.
 *
 * @param c
 *            context
 * @param item
 *            item containing bitstream to process
 * @param source
 *            source bitstream to process
 * @param formatFilter
 *            FormatFilter to perform filtering
 *
 * @return true if a new rendition is created; false if pre-processing
 *         asked for the bitstream to be skipped, or if the rendition
 *         already exists and overWrite is not set
 * @throws Exception
 *             propagated from the filter or from the content API calls
 */
public static boolean processBitstream(Context c, Item item, Bitstream source, FormatFilter formatFilter)
        throws Exception
{
    //do pre-processing of this bitstream, and if it fails, skip this bitstream!
    if (!formatFilter.preProcessBitstream(c, item, source))
    {
        return false;
    }

    boolean overWrite = MediaFilterManager.isForce;

    // get bitstream filename, calculate destination filename
    String newName = formatFilter.getFilteredName(source.getName());

    Bitstream existingBitstream = null; // is there an existing rendition?
    Bundle existingBundle = null; // bundle which holds that existing rendition
    Bundle[] bundles = item.getBundles(formatFilter.getBundleName());

    // check if destination bitstream exists
    // (only the last match is remembered, as before)
    for (int i = 0; i < bundles.length; i++)
    {
        Bitstream[] bitstreams = bundles[i].getBitstreams();

        for (int j = 0; j < bitstreams.length; j++)
        {
            // a bitstream without a name can never match; guard against NPE
            String candidateName = bitstreams[j].getName();

            if (candidateName != null && candidateName.equals(newName))
            {
                existingBundle = bundles[i];
                existingBitstream = bitstreams[j];
            }
        }
    }

    // if exists and overwrite = false, exit
    if (!overWrite && (existingBitstream != null))
    {
        System.out.println("SKIPPED: bitstream " + source.getID()
                + " because '" + newName + "' already exists");

        return false;
    }

    Bitstream b;
    InputStream sourceStream = source.retrieve();

    try
    {
        InputStream destStream = formatFilter.getDestinationStream(sourceStream);

        try
        {
            // create new bundle if needed, otherwise take the first match
            Bundle targetBundle = (bundles.length < 1)
                    ? item.createBundle(formatFilter.getBundleName())
                    : bundles[0];

            b = targetBundle.createBitstream(destStream);
        }
        finally
        {
            // release the filter's output stream even if storing it failed
            if (destStream != null)
            {
                destStream.close();
            }
        }
    }
    finally
    {
        // release the source content stream even if the filter failed
        if (sourceStream != null)
        {
            sourceStream.close();
        }
    }

    // Now set the format and name of the bitstream
    b.setName(newName);
    b.setSource("Written by FormatFilter " + formatFilter.getClass().getName() +
            " on " + DCDate.getCurrent() + " (GMT).");
    b.setDescription(formatFilter.getDescription());

    // Find the proper format
    BitstreamFormat bf = BitstreamFormat.findByShortDescription(c,
            formatFilter.getFormatString());
    b.setFormat(bf);
    b.update();

    //UIUC change - inherit policies from the source bitstream
    //(first remove any existing policies)
    AuthorizeManager.removeAllPolicies(c, b);
    AuthorizeManager.inheritPolicies(c, source, b);

    // fixme - set date?
    // we are overwriting, so remove old bitstream from the bundle that
    // actually contains it (which is not necessarily the target bundle)
    if (existingBitstream != null)
    {
        existingBundle.removeBitstream(existingBitstream);
    }

    System.out.println("FILTERED: bitstream " + source.getID()
            + " and created '" + newName + "'");

    //do post-processing of the generated bitstream
    formatFilter.postProcessBitstream(c, item, b);

    return true;
}
/**
* Return the item that is currently being processed/filtered
* by the MediaFilterManager
* <p>
* This allows FormatFilters to retrieve the Item object
* in case they need access to item-level information for their format
* transformations/conversions.
* <p>
* The value is set at the start of applyFiltersItem() and reset to null
* once that item has been handled, so callers must be prepared for a
* null result when no item is being processed.
*
* @return current Item being processed by MediaFilterManager,
* or null if no item is currently being processed
*/
public static Item getCurrentItem()
{
return currentItem;
}
/** END UIUC Change **/
}

View File

@@ -61,6 +61,7 @@ SF Patch 1794700 Bug fix for stat-monthly and stat-report-monthly
(Tim Donohue)
- Added Configurable Submission to JSP-UI and XML-UI (updated version of SF Patch #1691345)
- SF Patch #1589429 "Self-Named" Media Filters (i.e. MediaFilter Plugins) (updated version of this patch)
(Stuart Lewis)
- SF Patch #1737792 Patch for bug 1552760 - Submit interface looks bad in Safari

View File

@@ -784,15 +784,25 @@ ldap.enable = false
# will be left blank in the new eperson object.
#ldap.phone_field = telephoneNumber
#### Media Filter plugins (through PluginManager) ####
plugin.sequence.org.dspace.app.mediafilter.MediaFilter = \
org.dspace.app.mediafilter.PDFFilter, org.dspace.app.mediafilter.HTMLFilter, \
org.dspace.app.mediafilter.WordFilter, org.dspace.app.mediafilter.JPEGFilter
# to enable branded preview: remove last line above, and uncomment 2 lines below
# org.dspace.app.mediafilter.WordFilter, org.dspace.app.mediafilter.JPEGFilter, \
# org.dspace.app.mediafilter.BrandedPreviewJPEGFilter
#### Media Filter / Format Filter plugins (through PluginManager) ####
#Names of the enabled MediaFilter or FormatFilter plugins
filter.plugins = PDF Text Extractor, HTML Text Extractor, \
Word Text Extractor, JPEG Thumbnail
# [To enable Branded Preview]: remove last line above, and uncomment 2 lines below
# Word Text Extractor, JPEG Thumbnail, \
# Branded Preview JPEG
#Assign 'human-understandable' names to each filter
plugin.named.org.dspace.app.mediafilter.FormatFilter = \
org.dspace.app.mediafilter.PDFFilter = PDF Text Extractor, \
org.dspace.app.mediafilter.HTMLFilter = HTML Text Extractor, \
org.dspace.app.mediafilter.WordFilter = Word Text Extractor, \
org.dspace.app.mediafilter.JPEGFilter = JPEG Thumbnail, \
org.dspace.app.mediafilter.BrandedPreviewJPEGFilter = Branded Preview JPEG
#Configure each filter's input format(s)
filter.org.dspace.app.mediafilter.PDFFilter.inputFormats = Adobe PDF
filter.org.dspace.app.mediafilter.HTMLFilter.inputFormats = HTML, Text
filter.org.dspace.app.mediafilter.WordFilter.inputFormats = Microsoft Word

View File

@@ -949,41 +949,64 @@ dsrun org.dspace.app.itemimport.ItemImport -a -e joe@user.com -c collectionID -
<h2><a name="mediafilters" id="mediafilters">MediaFilters: Transforming DSpace Content</a></h2>
<p>DSpace can apply filters to content/bitstreams, creating new content. Filters are included that extract text for <strong>full-text searching</strong>, and create <strong>thumbnails</strong> for items that contain images. The media filters are controlled by the <code>MediaFilterManager</code> which traverses the asset store, invoking the <code>MediaFilter</code> subclasses on bitstreams. The MediaFilter plugin config item <code>plugin.named.org.dspace.app.mediafilter.MediaFilter</code> in <code>dspace.cfg</code> contains a list of bitstream format types and the filters that operate on bitstreams of that type. The media filter system is intended to be run from the command line (or regularly as a cron task):</p>
<p>DSpace can apply filters to content/bitstreams, creating new content. Filters are included that extract text for <strong>full-text searching</strong>, and create <strong>thumbnails</strong> for items that contain images.
The media filters are controlled by the <code>MediaFilterManager</code> which traverses the asset store, invoking the <code>MediaFilter</code> or <code>FormatFilter</code> classes on bitstreams.
The media filter plugin configuration <code>filter.plugins</code> in <code>dspace.cfg</code> contains a list of all enabled media/format filter plugins (see <a href="configure.html#mediafilters">Configuring Media Filters</a> for more information).
The media filter system is intended to be run from the command line (or regularly as a cron task):</p>
<pre>
dspace/bin/filter-media
[dspace]/bin/filter-media
</pre>
<p>Traverse the asset store, applying media filters to bitstreams, skipping bitstreams that have already been filtered.</p>
<pre>
dspace/bin/filter-media -f
</pre>
<p>Apply filters to ALL bitstreams, even if they've already been filtered.</p>
<pre>
dspace/bin/filter-media -v
</pre>
<p>Verbose mode - print all extracted text and other filter details to STDOUT.</p>
<pre>
dspace/bin/filter-media -n
</pre>
<p>Suppress index creation - by default, a new search index is created for full-text searching. This option suppresses index creation if you intend to run <code>index-all</code> elsewhere.</p>
<p>With no options, this traverses the asset store, applying media filters to bitstreams, and skipping bitstreams that have already been filtered.</p>
<pre>
dspace/bin/filter-media -i 123456789/2
</pre>
<p><strong>Available Command-Line Options:</strong></p>
<p>Restrict processing to the community, collection, or item named by the identifier - by default, all bitstreams of all items in the repository are processed. The identifier must be a handle, not a DB key. This option may be combined with any other option.</p>
<ul>
<li><code>[dspace]/bin/filter-media -h</code>
<ul>
<li>Display help message describing all command-line options.</li>
</ul>
</li>
<li><code>[dspace]/bin/filter-media -f</code>
<ul>
<li>"Force" mode - Apply filters to ALL bitstreams, even if they've already been filtered. If they've already been filtered, the previously filtered content is overwritten.</li>
</ul>
</li>
<li><code>[dspace]/bin/filter-media -i 123456789/2</code>
<ul>
<li>Restrict processing to the community, collection, or item named by the identifier - by default, all bitstreams of all items in the repository are processed. The identifier must be a Handle, not a DB key. This option may be combined with any other option.</li>
</ul>
</li>
<li><code>[dspace]/bin/filter-media -m 1000</code>
<ul>
<li>Suspend operation after the specified maximum number of items have been processed - by default, no limit exists. This option may be combined with any other option.</li>
</ul>
</li>
<li><code>[dspace]/bin/filter-media -n</code>
<ul>
<li>Suppress index creation - by default, a new search index is created for full-text searching. This option suppresses index creation if you intend to run <code>index-all</code> elsewhere.</li>
</ul>
</li>
<li><code>[dspace]/bin/filter-media -p "PDF Text Extractor","Word Text Extractor"</code>
<ul>
<li>Apply ONLY the filter plugin(s) listed (separated by commas). By default all named filters listed in the <code>filter.plugins</code> field of <code>dspace.cfg</code> are applied. This option may be combined with any other option. <em>WARNING:</em> multiple plugin names must be separated by a comma (i.e. ',') and NOT a comma followed by a space (i.e. ', ').</li>
</ul>
</li>
<pre>
dspace/bin/filter-media -m 1000
</pre>
<li><code>[dspace]/bin/filter-media -v</code>
<ul>
<li>Verbose mode - print all extracted text and other filter details to STDOUT.</li>
</ul>
</li>
<p>Suspend operation after the specified maximum number of items have been processed - by default, no limit exists. This option may be combined with any other option.</p>
</ul>
<p>Adding your own filters is done by creating a sub-class of the <code>MediaFilter</code> class. See the comments in the source file MediaFilter.java for more information. In theory filters could be implemented in any language (C, Perl, etc.) They only need to be invoked by the Java code in the <code>MediaFilter</code> class that you create.</p>
<p>Adding your own filters is done by creating a class which <code>implements</code> the <code>org.dspace.app.mediafilter.FormatFilter</code> interface. See the <a href="configure.html#newfilter">Creating a new Media Filter</a> topic and comments in the source file FormatFilter.java for more information. In theory, filters could be implemented in any programming language (C, Perl, etc.). However, they need to be invoked by the Java code in the Media Filter class that you create.</p>
<h2><a name="filiator" id="filiator">Sub-Community Management</a></h2>

View File

@@ -425,7 +425,7 @@ packager plugin's implementation.</p>
<p>See the <a href="#pluginmethods">getNamedPlugin()</a> method and the getPluginNames() methods.</p></li>
</ol></p>
<h4>Self-Named Plugins</h4>
<h4><a name="selfnamedplugin">Self-Named Plugins</a></h4>
<p>Named plugins can get their names either from the configuration or, for a variant called self-named plugins, from within the plugin itself.</p>
@@ -513,7 +513,7 @@ packager plugin's implementation.</p>
<h4>SelfNamedPlugin Class</h4>
<p>A named plugin implementation must extend this class if it wants to supply its own Plugin Name(s). See Self-Named Plugins for why this is sometimes necessary.</p>
<p>A named plugin implementation must extend this class if it wants to supply its own Plugin Name(s). See <a href="#selfnamedplugin">Self-Named Plugins</a> for why this is sometimes necessary.</p>
<pre>abstract class SelfNamedPlugin
{
@@ -544,11 +544,11 @@ packager plugin's implementation.</p>
<p>This is a <code>RuntimeException</code> so it doesn't have to be declared, and can be passed all the way up to a generalized fatal exception handler.</p>
<h3>Configuring Plugins</h3>
<h3><a name="pluginconfig">Configuring Plugins</a></h3>
<p>All of the Plugin Manager's configuration comes from the DSpace Configuration Manager, which is a Java Properties map. You can configure these characteristics of each plugin:
<ol>
<li><strong>Interface</strong>: Classname of the Java interface which defines the plugin, including package name. e.g. <code>org.dspace.app.mediafilter.MediaFilter</code></li>
<li><strong>Interface</strong>: Classname of the Java interface which defines the plugin, including package name. e.g. <code>org.dspace.app.mediafilter.FormatFilter</code></li>
<li><strong>Implementation Class</strong>: Classname of the implementation class, including package. e.g. <code>org.dspace.app.mediafilter.PDFFilter</code></li>
<li><strong>Names</strong>: (Named plugins only) There are two ways to bind names to plugins: listing them in the value of a plugin.named.interface key, or configuring a class in <code>plugin.selfnamed.<i>interface</i></code> which extends the <code>SelfNamedPlugin</code> class.</li>
<li><strong>Reusable option</strong>: (Optional) This is declared in a <code>plugin.reusable</code> configuration line. Plugins are reusable by default, so you only need to configure the non-reusable ones.</li>

View File

@@ -1141,28 +1141,132 @@ search.index.11 = id:dc.identifier.*
<p><strong>NOTE:</strong> While the indexes are created, this only affects the search results and has no effect on the search components of the user interface. To add new search capability (e.g. to add a new search category to the Advanced Search) requires local customisation to the user interface.</p>
<h2><a name="mediafilters" id="mediafilters">Media Filters</a></h2>
<h2><a name="mediafilters" id="mediafilters">Configuring Media Filters</a></h2>
<p>Media Filters are classes used to generate derivative or alternative versions of master bitstreams. For example, the PDF Media Filter will extract textual content from PDF bitstreams, the JPEG Media Filter can create thumbnails from image bitstreams.</p>
<p>Media or Format Filters are classes used to generate derivative or alternative versions of content or bitstreams within DSpace. For example, the PDF Media Filter will extract textual content from PDF bitstreams, the JPEG Media Filter can create thumbnails from image bitstreams.</p>
<p>Media Filters are configured as a <a href="business.html#plugin">Sequence Plugin</a>, with each filter also having a separate config item indicating which formats it can process. The default configuration is shown below.</p>
<p>Media Filters are configured as <a href="business.html#plugin">Named Plugins</a>, with each filter also having a separate configuration setting (in <code>dspace.cfg</code>) indicating which formats it can process. The default configuration is shown below.</p>
<p><pre>#### Media Filter plugins (through PluginManager) ####
<p><pre>
#### Media Filter / Format Filter plugins (through PluginManager) ####
plugin.sequence.org.dspace.app.mediafilter.MediaFilter = \
org.dspace.app.mediafilter.PDFFilter, org.dspace.app.mediafilter.HTMLFilter, \
org.dspace.app.mediafilter.WordFilter, org.dspace.app.mediafilter.JPEGFilter
# to enable branded preview: remove last line above, and uncomment 2 lines below
# org.dspace.app.mediafilter.WordFilter, org.dspace.app.mediafilter.JPEGFilter, \
# org.dspace.app.mediafilter.BrandedPreviewJPEGFilter
#Names of the enabled MediaFilter or FormatFilter plugins
filter.plugins = PDF Text Extractor, HTML Text Extractor, \
Word Text Extractor, JPEG Thumbnail
# to enable branded preview: remove last line above, and uncomment 2 lines below
# Word Text Extractor, JPEG Thumbnail, \
# Branded Preview JPEG
filter.org.dspace.app.mediafilter.PDFFilter.inputFormats = Adobe PDF
filter.org.dspace.app.mediafilter.HTMLFilter.inputFormats = HTML, Text
filter.org.dspace.app.mediafilter.WordFilter.inputFormats = Microsoft Word
filter.org.dspace.app.mediafilter.JPEGFilter.inputFormats = GIF, JPEG, image/png
filter.org.dspace.app.mediafilter.BrandedPreviewJPEGFilter.inputFormats = GIF, JPEG, image/png</pre></p>
#Assign 'human-understandable' names to each filter
plugin.named.org.dspace.app.mediafilter.FormatFilter = \
org.dspace.app.mediafilter.PDFFilter = PDF Text Extractor, \
org.dspace.app.mediafilter.HTMLFilter = HTML Text Extractor, \
org.dspace.app.mediafilter.WordFilter = Word Text Extractor, \
org.dspace.app.mediafilter.JPEGFilter = JPEG Thumbnail, \
org.dspace.app.mediafilter.BrandedPreviewJPEGFilter = Branded Preview JPEG
<p>To add a new Media Filter, add the new filter class to the <code>plugin.sequence.org.dspace.app.mediafilter.MediaFilter</code> config item and add a corresponding <code>filter.<i>&lt;class path&gt;</i>.inputFormats</code> config item. Note the input formats must match the <code>short_description</code> field in the <code>bitstreamformatregistry</code> table.</p>
#Configure each filter's input format(s)
filter.org.dspace.app.mediafilter.PDFFilter.inputFormats = Adobe PDF
filter.org.dspace.app.mediafilter.HTMLFilter.inputFormats = HTML, Text
filter.org.dspace.app.mediafilter.WordFilter.inputFormats = Microsoft Word
filter.org.dspace.app.mediafilter.JPEGFilter.inputFormats = GIF, JPEG, image/png
filter.org.dspace.app.mediafilter.BrandedPreviewJPEGFilter.inputFormats = GIF, JPEG, image/png</pre></p>
<p>The enabled Media/Format Filters are named in the <code>filter.plugins</code> field above.</p>
<p>Names are assigned to each filter using the <code>plugin.named.org.dspace.app.mediafilter.FormatFilter</code> field
(e.g. by default the <code>PDFFilter</code> is named "PDF Text Extractor").</p>
<p>Finally the appropriate <code>filter.<em>&lt;class path&gt;</em>.inputFormats</code> defines the valid input formats which each filter can be applied to. These
format names <strong>must match</strong> the <code>short description</code> field of the <a href="appendix.html#bitstreamformatregistry">Bitstream Format Registry</a>.</p>
<p>You can also implement more dynamic or configurable Media/Format Filters which extend <a href="business.html#selfnamedplugin"><code>SelfNamedPlugin</code></a>.
More information is provided below in <a href="#newfilter">Creating a new Media/Format Filter</a>.</p>
<h3><a name="newfilter">Creating a new Media/Format Filter</a></h3>
<h4>Creating a simple Media Filter</h4>
<p>New Media Filters <strong>must implement</strong> the <code>org.dspace.app.mediafilter.FormatFilter</code> interface. More information on the methods you need to implement is provided in the <code>FormatFilter.java</code> source file. For example:
<code><pre>
public class MySimpleMediaFilter implements FormatFilter</pre></code>
</p>
<p>Alternatively, you could extend the <code>org.dspace.app.mediafilter.MediaFilter</code> class, which just defaults to performing no pre/post-processing of bitstreams before or after filtering.
<code><pre>
public class MySimpleMediaFilter extends MediaFilter</pre></code>
</p>
<p>You must give your new filter a "name", by adding it and its name to the <code>plugin.named.org.dspace.app.mediafilter.FormatFilter</code> field in <code>dspace.cfg</code>.
In addition to naming your filter, make sure to specify its input formats in the
<code>filter.<i>&lt;class path&gt;</i>.inputFormats</code> config item. Note the input formats must match the <code>short description</code> field in the <a href="appendix.html#bitstreamformatregistry">Bitstream Format Registry</a> (i.e. <code>bitstreamformatregistry</code> table).
<code><pre>
plugin.named.org.dspace.app.mediafilter.FormatFilter = \
org.dspace.app.mediafilter.MySimpleMediaFilter = My Simple Text Filter, \
...
filter.org.dspace.app.mediafilter.MySimpleMediaFilter.inputFormats = Text</pre></code>
</p>
<em>WARNING: If you neglect to define the <code>inputFormats</code> for
a particular filter, the <code>MediaFilterManager</code> will never call that filter, since it will never find a bitstream which has a format
matching that filter's input format(s).</em>
<p>If you have a complex Media Filter class, which actually performs different filtering for different formats (e.g. conversion from Word to PDF <strong>and</strong> conversion from Excel to CSV), you should define this as a <a href="#selfnamedfilter">Dynamic / Self-Named Format Filter</a>.</p>
<h4><a name="selfnamedfilter">Creating a Dynamic or "Self-Named" Format Filter</a></h4>
<p>If you have a more complex Media/Format Filter, which actually performs <strong>multiple</strong> filtering or conversions for different formats (e.g. conversion from Word to PDF <strong>and</strong> conversion from Excel to CSV), you should define a class which implements the <code>FormatFilter</code> interface,
while also extending the <a href="business.html#selfnamedplugin"><code>SelfNamedPlugin</code></a> class. For example:
<code><pre>
public class MyComplexMediaFilter extends SelfNamedPlugin implements FormatFilter</pre></code>
</p>
<p>Since <code>SelfNamedPlugins</code> are self-named (as stated), they must provide the various names the plugin uses by defining a <a href="business.html#pluginmethods">getPluginNames()</a> method.
Generally speaking, each "name" the plugin uses should correspond to a different type of filter it implements (e.g. "Word2PDF" and "Excel2CSV" are two good names for a complex media filter which performs both Word to PDF and Excel to CSV conversions).
</p>
<p>Self-Named Media/Format Filters are also configured differently in <code>dspace.cfg</code>. Below is a general template for a Self Named Filter (defined by an imaginary <code>MyComplexMediaFilter</code> class, which
can perform both Word to PDF and Excel to CSV conversions):</p>
<p><code><pre>
#Add to a list of all Self Named filters
plugin.selfnamed.org.dspace.app.mediafilter.FormatFilter = \
org.dspace.app.mediafilter.MyComplexMediaFilter
#Define input formats for each "named" plugin this filter implements
filter.org.dspace.app.mediafilter.MyComplexMediaFilter.Word2PDF.inputFormats = Microsoft Word
filter.org.dspace.app.mediafilter.MyComplexMediaFilter.Excel2CSV.inputFormats = Microsoft Excel</pre>
</code></p>
<p>As shown above, each Self-Named Filter class must be listed in the <code>plugin.selfnamed.org.dspace.app.mediafilter.FormatFilter</code> item in <code>dspace.cfg</code>.
In addition, each Self-Named Filter <strong>must</strong> define the input formats for <em>each named plugin</em> defined by that filter.
In the above example the <code>MyComplexMediaFilter</code> class is assumed to have defined two named plugins, <code>Word2PDF</code> and <code>Excel2CSV</code>.
So, these two valid plugin names ("Word2PDF" and "Excel2CSV") <strong>must</strong> be returned by the <code>getPluginNames()</code> method of the <code>MyComplexMediaFilter</code> class.</p>
<p>These named plugins take different input formats as defined above (see the corresponding <code>inputFormats</code> setting). <em>WARNING: If you neglect to define the <code>inputFormats</code> for
a particular named plugin, the <code>MediaFilterManager</code> will never call that plugin, since it will never find a bitstream which has a format
matching that plugin's input format(s).</em>
</p>
<p>For a particular Self-Named Filter, you are also welcome to define additional configuration settings in <code>dspace.cfg</code>.
To continue with our current example, each of our imaginary plugins actually results in a different output format (Word2PDF creates "Adobe PDF", while Excel2CSV creates "Comma Separated Values").
To allow this complex Media Filter to be even more configurable (especially across institutions, with potentially different "Bitstream Format Registries"), you may
wish to allow for the output format to be customizable for each named plugin. For example:</p>
<p><code><pre>
#Define output formats for each named plugin
filter.org.dspace.app.mediafilter.MyComplexMediaFilter.Word2PDF.outputFormat = Adobe PDF
filter.org.dspace.app.mediafilter.MyComplexMediaFilter.Excel2CSV.outputFormat = Comma Separated Values</pre>
</code></p>
<p>Any custom configuration fields in <code>dspace.cfg</code> defined by your filter are ignored by the <code>MediaFilterManager</code>, so it is up to your custom media filter class to read those configurations and apply them as necessary.
For example, you could use the following sample Java code in your <code>MyComplexMediaFilter</code> class to read these custom <code>outputFormat</code> configurations from <code>dspace.cfg</code> :
<code><pre>
//get "outputFormat" configuration from dspace.cfg
String outputFormat = ConfigurationManager.getProperty(MediaFilterManager.FILTER_PREFIX + "." +
MyComplexMediaFilter.class.getName() + "." + this.getPluginInstanceName() + ".outputFormat");</pre></code>
</p>
<h3><a name="runningfilter">Running Media/Format Filters</a></h3>
<p>Information on scheduling Media/Format Filters to run is available in the section:
<a href="application.html#mediafilters">MediaFilters: Transforming DSpace Content</a></p>
<h2><a name="statistics" id="statistics">Configuring System Statistical Reports</a></h2>
@@ -1260,8 +1364,8 @@ stat-report-monthly
<p>To activate this feature and display a preview image on the item page (all properties mentioned below
are found in <code>dspace.cfg</code>):</p>
<ol>
<li>Uncomment the lines defining the list of configured mediafilters at
<code>plugin.sequence.org.dspace.app.mediafilter.MediaFilter</code> to include <code>org.dspace.app.mediafilter.BrandedPreviewJPEGFilter</code>.</li>
<li>Uncomment the lines defining the list of enabled mediafilters at
<code>filter.plugins</code> to include <code>Branded Preview JPEG</code>.</li>
<li>Set the maximum pixel dimensions for the preview image by altering the <code>webui.preview.maxwidth</code> and <code>webui.preview.maxheight</code> (default is 600) config items.</li>
<li>Set the <code>webui.preview.brand</code> to the text you want to brand the image with. The brand will appear as white text on a black background strip across the base of the image. For example you might set the text to the owning organisation. The handle is also displayed as part of the branding.</li>
<li>Set the <code>webui.preview.brand.abbrev</code>. This is an abbreviated form of the <code>webui.preview.brand</code> text and will be shown where the brand text is longer than the image width (e.g. for narrow images).</li>
@@ -1282,7 +1386,7 @@ stat-report-monthly
<p><code>webui.browse.thumbnail.show = true</code></p>
<p>If set to <code>false</code> or this configuration item is missing then thumbnails will not be shown. Additionally, appropriate <A HREF="application.html#mediafilters">media filters</a> must be configured and the media filter configured to run periodically (for example, via a 'cron' job) </p>
<p>If set to <code>false</code> or this configuration item is missing then thumbnails will not be shown. Additionally, the appropriate <A HREF="#mediafilters">"JPEG Thumbnail" media filter</a> must be configured and the <a href="application.html#mediafilters">filter-media</a> script configured to run periodically (for example, via a 'cron' job).</p>
<p>The size of the browse/search thumbnails can also be configured to a smaller size than that generated by the mediafilter. To do this set the following configuration items:</p>
@@ -1586,7 +1690,7 @@ bin/checker -L # Loops continuously through the repository</pre>
<li>passing in a properties file containing retention policies when using the -p option. </li>
</ol>
<p>Pruning is controlled by a number of properties, each of which describes a checksum result code, and the length of time for which results with that code should be retained. The format is <code>checker.retention.[RESULT CODE]=[duration]. For example: -</p>
<p>Pruning is controlled by a number of properties, each of which describes a checksum result code, and the length of time for which results with that code should be retained. The format is <code>checker.retention.[RESULT CODE]=[duration]</code>. For example: -</p>
<pre>checker.retention.CHECKSUM_MATCH=8w</pre>

View File

@@ -86,7 +86,7 @@
<LI><A HREF="configure.html#ldap">Configuring LDAP Authentication</A></LI>
<LI><A HREF="configure.html#search-index">Configuring Lucene Search Indexes</A></LI>
<LI><A HREF="configure.html#statistics">Configuring System Statistical Reports</A></LI>
<LI><A HREF="configure.html#mediafilters">MediaFilters</A></LI>
<LI><A HREF="configure.html#mediafilters">Configuring MediaFilters</A></LI>
<LI><A HREF="configure.html#preview">Displaying Image Item Preview</A></LI>
<LI><A HREF="configure.html#webuithumbs">Displaying Image Thumbnails</A></LI>
<LI><A HREF="configure.html#strengths">Displaying Community and Collection Item Counts</A></LI>