diff --git a/dspace-api/src/main/java/org/dspace/app/mediafilter/BrandedPreviewJPEGFilter.java b/dspace-api/src/main/java/org/dspace/app/mediafilter/BrandedPreviewJPEGFilter.java index fbbd799585..027e629372 100644 --- a/dspace-api/src/main/java/org/dspace/app/mediafilter/BrandedPreviewJPEGFilter.java +++ b/dspace-api/src/main/java/org/dspace/app/mediafilter/BrandedPreviewJPEGFilter.java @@ -182,7 +182,7 @@ public class BrandedPreviewJPEGFilter extends MediaFilter Brand brand = new Brand((int) xsize, brandHeight, new Font(brandFont, Font.PLAIN, brandFontPoint), 5); BufferedImage brandImage = brand.create(ConfigurationManager.getProperty("webui.preview.brand"), ConfigurationManager.getProperty("webui.preview.brand.abbrev"), - item == null ? "" : "hdl:" + item.getHandle()); + MediaFilterManager.getCurrentItem() == null ? "" : "hdl:" + MediaFilterManager.getCurrentItem().getHandle()); g2d.drawImage(brandImage, (int)0, (int)ysize, (int) xsize, (int) 20, null); diff --git a/dspace-api/src/main/java/org/dspace/app/mediafilter/FormatFilter.java b/dspace-api/src/main/java/org/dspace/app/mediafilter/FormatFilter.java new file mode 100644 index 0000000000..30fb6f476d --- /dev/null +++ b/dspace-api/src/main/java/org/dspace/app/mediafilter/FormatFilter.java @@ -0,0 +1,135 @@ +/* + * FormatFilter.java + * + * Version: $Revision: 1491 $ + * + * Date: $Date: 2006-03-29 21:46:42 -0500 (Wed, 29 Mar 2006) $ + * + * Copyright (c) 2002-2005, Hewlett-Packard Company and Massachusetts + * Institute of Technology. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * - Neither the name of the Hewlett-Packard Company nor the name of the + * Massachusetts Institute of Technology nor the names of their + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR + * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ +package org.dspace.app.mediafilter; + +import java.io.InputStream; + +import org.dspace.content.Bitstream; +import org.dspace.content.Item; +import org.dspace.core.Context; + +/** + * Public interface for any class which transforms or converts content/bitstreams + * from one format to another. This interface should be implemented by any class + * which defines a "filter" to be run by the MediaFilterManager. 
+ */ +public interface FormatFilter +{ + /** + * Get a filename for a newly created filtered bitstream + * + * @param sourceName + * name of source bitstream + * @return filename generated by the filter - for example, document.pdf + * becomes document.pdf.txt + */ + public String getFilteredName(String sourceName); + + /** + * @return name of the bundle this filter will stick its generated + * Bitstreams + */ + public String getBundleName(); + + /** + * @return name of the bitstream format (say "HTML" or "Microsoft Word") + * returned by this filter look in the bitstream format registry or + * mediafilter.cfg for valid format strings. + */ + public String getFormatString(); + + /** + * @return string to describe the newly-generated Bitstream's - how it was + * produced is a good idea + */ + public String getDescription(); + + /** + * @param source + * input stream + * + * @return result of filter's transformation, written out to a bitstream + */ + public InputStream getDestinationStream(InputStream source) + throws Exception; + + /** + * Perform any pre-processing of the source bitstream *before* the actual + * filtering takes place in MediaFilterManager.processBitstream(). + *

+ * Return true if pre-processing is successful (or no pre-processing + * is necessary). Return false if bitstream should be skipped + * for any reason. + * + * + * @param c + * context + * @param item + * item containing bitstream to process + * @param source + * source bitstream to be processed + * + * @return true if bitstream processing should continue, + * false if this bitstream should be skipped + */ + public boolean preProcessBitstream(Context c, Item item, Bitstream source) + throws Exception; + + /** + * Perform any post-processing of the generated bitstream *after* this + * filter has already been run. + *

+ * Unlike preProcessBitstream(), this method returns no value, so there is + * no true/false result to report; it is only declared to throw + * Exception. + * + * + * @param c + * context + * @param item + * item containing bitstream to process + * @param generatedBitstream + * the bitstream which was generated by + * this filter. + */ + public void postProcessBitstream(Context c, Item item, Bitstream generatedBitstream) + throws Exception; +} + diff --git a/dspace-api/src/main/java/org/dspace/app/mediafilter/MediaFilter.java b/dspace-api/src/main/java/org/dspace/app/mediafilter/MediaFilter.java index 5b6da03a27..e2b13c5c4b 100644 --- a/dspace-api/src/main/java/org/dspace/app/mediafilter/MediaFilter.java +++ b/dspace-api/src/main/java/org/dspace/app/mediafilter/MediaFilter.java @@ -39,156 +39,64 @@ */ package org.dspace.app.mediafilter; -import java.io.InputStream; - import org.dspace.content.Bitstream; -import org.dspace.content.BitstreamFormat; -import org.dspace.content.Bundle; import org.dspace.content.Item; import org.dspace.core.Context; -public abstract class MediaFilter + +/** + * Abstract class which defines the default settings for a *simple* Media or Format Filter. + * This class may be extended by any class which wishes to define a simple filter to be run + * by the MediaFilterManager. More complex filters should likely implement the FormatFilter + * interface directly, so that they can define their own pre/postProcessing methods. + */ +public abstract class MediaFilter implements FormatFilter { - protected Item item = null; - - /* To create your own filter, implement the following virtual methods */ - - /** - * Get a filename for a newly created filtered bitstream + /** + * Perform any pre-processing of the source bitstream *before* the actual + * filtering takes place in MediaFilterManager.processBitstream(). + *

+ * Return true if pre-processing is successful (or no pre-processing + * is necessary). Return false if bitstream should be skipped + * for any reason. * - * @param sourceName - * name of source bitstream - * @return filename generated by the filter - for example, document.pdf - * becomes document.pdf.txt - */ - public abstract String getFilteredName(String sourceName); - - /** - * @return name of the bundle this filter will stick its generated - * Bitstreams - */ - public abstract String getBundleName(); - - /** - * @return name of the bitstream format (say "HTML" or "Microsoft Word") - * returned by this filter look in the bitstream format registry or - * mediafilter.cfg for valid format strings. - */ - public abstract String getFormatString(); - - /** - * @return string to describe the newly-generated Bitstream's - how it was - * produced is a good idea - */ - public abstract String getDescription(); - - /** - * @param source - * input stream - * - * @return result of filter's transformation, written out to a bitstream - */ - public abstract InputStream getDestinationStream(InputStream source) - throws Exception; - - /* end of methods you need to implement! */ - - /** - * processBitstream is a utility class that calls the above virtual methods - - * it is unlikely that you will need to override it. It scans the bitstreams - * in an item, and decides if a bitstream has already been filtered, and if - * not or if overWrite is set, invokes the filter. 
* * @param c * context * @param item * item containing bitstream to process * @param source - * source bitstream to process + * source bitstream to be processed * - * @return true if new rendition is created, false if rendition already - * exists and overWrite is not set + * @return true if bitstream processing should continue, + * false if this bitstream should be skipped */ - public boolean processBitstream(Context c, Item item, Bitstream source) + public boolean preProcessBitstream(Context c, Item item, Bitstream source) throws Exception { - boolean overWrite = MediaFilterManager.isForce; - - this.item = item; - - // get bitstream filename, calculate destination filename - String newName = getFilteredName(source.getName()); - - Bitstream existingBitstream = null; // is there an existing rendition? - Bundle targetBundle = null; // bundle we're modifying - - Bundle[] bundles = item.getBundles(getBundleName()); - - // check if destination bitstream exists - if (bundles.length > 0) - { - // only finds the last match (FIXME?) 
- for (int i = 0; i < bundles.length; i++) - { - Bitstream[] bitstreams = bundles[i].getBitstreams(); - - for (int j = 0; j < bitstreams.length; j++) - { - if (bitstreams[j].getName().equals(newName)) - { - targetBundle = bundles[i]; - existingBitstream = bitstreams[j]; - } - } - } - } - - // if exists and overwrite = false, exit - if (!overWrite && (existingBitstream != null)) - { - System.out.println("SKIPPED: bitstream " + source.getID() - + " because '" + newName + "' already exists"); - - return false; - } - - InputStream destStream = getDestinationStream(source.retrieve()); - - // create new bundle if needed - if (bundles.length < 1) - { - targetBundle = item.createBundle(getBundleName()); - } - else - { - // take the first match - targetBundle = bundles[0]; - } - - Bitstream b = targetBundle.createBitstream(destStream); - - // Now set the format and name of the bitstream - b.setName(newName); - b.setSource("Written by MediaFilter " + this.getClass().getName()); // or - // obj.getClass().getName(); - b.setDescription(getDescription()); - - // Find the proper format - BitstreamFormat bf = BitstreamFormat.findByShortDescription(c, - getFormatString()); - b.setFormat(bf); - b.update(); - - // fixme - set date? - // we are overwriting, so remove old bitstream - if (existingBitstream != null) - { - targetBundle.removeBitstream(existingBitstream); - } - - System.out.println("FILTERED: bitstream " + source.getID() - + " and created '" + newName + "'"); - - return true; + return true; //default to no pre-processing + } + + /** + * Perform any post-processing of the generated bitstream *after* this + * filter has already been run. + *

+ * Unlike preProcessBitstream(), this method returns no value, so there is + * no true/false result to report; this default implementation + * does nothing. + * + * + * @param c + * context + * @param item + * item containing bitstream to process + * @param generatedBitstream + * the bitstream which was generated by + * this filter. + */ + public void postProcessBitstream(Context c, Item item, Bitstream generatedBitstream) + throws Exception + { + //default to no post-processing necessary } } diff --git a/dspace-api/src/main/java/org/dspace/app/mediafilter/MediaFilterManager.java b/dspace-api/src/main/java/org/dspace/app/mediafilter/MediaFilterManager.java index b3b4c1b6e7..9f2a1601ee 100644 --- a/dspace-api/src/main/java/org/dspace/app/mediafilter/MediaFilterManager.java +++ b/dspace-api/src/main/java/org/dspace/app/mediafilter/MediaFilterManager.java @@ -40,22 +40,30 @@ package org.dspace.app.mediafilter; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Enumeration; import java.util.HashMap; import java.util.Map; import java.util.List; -import java.util.Arrays; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.MissingArgumentException; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.PosixParser; +import org.dspace.authorize.AuthorizeManager; import org.dspace.content.Bitstream; import org.dspace.content.BitstreamFormat; import org.dspace.content.Bundle; import org.dspace.content.Collection; import org.dspace.content.Community; +import org.dspace.content.DCDate; import org.dspace.content.DSpaceObject; import org.dspace.content.Item; import org.dspace.content.ItemIterator; @@ -63,11 +71,12 @@ import org.dspace.core.ConfigurationManager; import org.dspace.core.Constants; import 
org.dspace.core.Context; import org.dspace.core.PluginManager; +import org.dspace.core.SelfNamedPlugin; import org.dspace.handle.HandleManager; import org.dspace.search.DSIndexer; /** - * MediaFilterManager is the class that invokes the media filters over the + * MediaFilterManager is the class that invokes the media/format filters over the * repository's content. a few command line flags affect the operation of the * MFM: -v verbose outputs all extracted text to STDOUT; -f force forces all * bitstreams to be processed, even if they have been before; -n noindex does not @@ -77,6 +86,15 @@ import org.dspace.search.DSIndexer; */ public class MediaFilterManager { + //key (in dspace.cfg) which lists all enabled filters by name + public static String MEDIA_FILTER_PLUGINS_KEY = "filter.plugins"; + + //prefix (in dspace.cfg) for all filter properties + public static String FILTER_PREFIX = "filter"; + + //suffix (in dspace.cfg) for input formats supported by each filter + public static String INPUT_FORMATS_SUFFIX = "inputFormats"; + public static boolean updateIndex = true; // default to updating index public static boolean isVerbose = false; // default to not verbose @@ -85,14 +103,20 @@ public class MediaFilterManager public static String identifier = null; // object scope limiter - public static int max2Process = Integer.MAX_VALUE; // maximum number to process + public static int max2Process = Integer.MAX_VALUE; // maximum number items to process - public static int processed = 0; // number processed + public static int processed = 0; // number items processed - private static MediaFilter[] filterClasses = null; + private static Item currentItem = null; // current item being processed + + private static FormatFilter[] filterClasses = null; private static Map filterFormats = new HashMap(); + //separator in filterFormats Map between a filter class name and a plugin name, + //for MediaFilters which extend SelfNamedPlugin (\034 is "file separator" char) + public static 
String FILTER_PLUGIN_SEPARATOR = "\034"; + public static void main(String[] argv) throws Exception { // set headless for non-gui workstations @@ -117,12 +141,35 @@ public class MediaFilterManager "process no more than maximum items"); options.addOption("h", "help", false, "help"); - CommandLine line = parser.parse(options, argv); + //create a "plugins" option (-p/--plugins: run only the named MediaFilter plugins) + OptionBuilder.withLongOpt("plugins"); + OptionBuilder.withValueSeparator(','); + OptionBuilder.withDescription( + "ONLY run the specified Media Filter plugin(s)\n" + + "listed from '" + MEDIA_FILTER_PLUGINS_KEY + "' in dspace.cfg.\n" + + "Separate multiple with a comma (,)\n" + + "(e.g. MediaFilterManager -p \n\"Word Text Extractor\",\"PDF Text Extractor\")"); + Option pluginOption = OptionBuilder.create('p'); + pluginOption.setArgs(Option.UNLIMITED_VALUES); //unlimited number of args + options.addOption(pluginOption); + + CommandLine line = null; + try + { + line = parser.parse(options, argv); + } + catch(MissingArgumentException e) + { + System.out.println("ERROR: " + e.getMessage()); + HelpFormatter myhelp = new HelpFormatter(); + myhelp.printHelp("MediaFilterManager\n", options); + System.exit(1); + } if (line.hasOption('h')) { HelpFormatter myhelp = new HelpFormatter(); - myhelp.printHelp("MediaFilter\n", options); + myhelp.printHelp("MediaFilterManager\n", options); System.exit(0); } @@ -158,20 +205,110 @@ public class MediaFilterManager } } - // set up filters - filterClasses = - (MediaFilter[])PluginManager.getPluginSequence(MediaFilter.class); - for (int i = 0; i < filterClasses.length; i++) + String filterNames[] = null; + if(line.hasOption('p')) { - String filterName = filterClasses[i].getClass().getName(); - String formats = ConfigurationManager.getProperty( - "filter." 
+ filterName + ".inputFormats"); - if (formats != null) - { - filterFormats.put(filterName, Arrays.asList(formats.split(",[\\s]*"))); - } - } + //specified which media filter plugins we are using + filterNames = line.getOptionValues('p'); + if(filterNames==null || filterNames.length==0) + { //display error, since no plugins specified + System.err.println("\nERROR: -p (--plugins) option requires at least one plugin to be specified.\n" + + "(e.g. MediaFilterManager -p \"Word Text Extractor\",\"PDF Text Extractor\")\n"); + HelpFormatter myhelp = new HelpFormatter(); + myhelp.printHelp("MediaFilterManager\n", options); + System.exit(1); + } + } + else + { + //retrieve list of all enabled media filter plugins (no filters if the key is not set in dspace.cfg) + String enabledPlugins = ConfigurationManager.getProperty(MEDIA_FILTER_PLUGINS_KEY); + filterNames = (enabledPlugins == null) ? new String[0] : enabledPlugins.split(",\\s*"); + } + + //initialize an array of our enabled filters + List filterList = new ArrayList(); + + //set up each filter + for(int i=0; i< filterNames.length; i++) + { + //get filter of this name & add to list of filters + FormatFilter filter = (FormatFilter) PluginManager.getNamedPlugin(FormatFilter.class, filterNames[i]); + if(filter==null) + { + System.err.println("\nERROR: Unknown MediaFilter specified (either from command-line or in dspace.cfg): '" + filterNames[i] + "'"); + System.exit(1); + } + else + { + filterList.add(filter); + + String filterClassName = filter.getClass().getName(); + + String pluginName = null; + + //If this filter is a SelfNamedPlugin, + //then the input formats it accepts may differ for + //each "named" plugin that it defines. 
+ //So, we have to look for every key that fits the + //following format: filter.CLASS-NAME.PLUGIN-NAME.inputFormats + if( SelfNamedPlugin.class.isAssignableFrom(filter.getClass()) ) + { + //Get the plugin instance name for this class + pluginName = ((SelfNamedPlugin) filter).getPluginInstanceName(); + } + + + //Retrieve our list of supported formats from dspace.cfg + //For SelfNamedPlugins, format of key is: + // filter.CLASS-NAME.PLUGIN-NAME.inputFormats + //For other MediaFilters, format of key is: + // filter.CLASS-NAME.inputFormats + String formats = ConfigurationManager.getProperty( + FILTER_PREFIX + "." + filterClassName + + (pluginName!=null ? "." + pluginName : "") + + "." + INPUT_FORMATS_SUFFIX); + + //add to internal map of filters to supported formats + if (formats != null) + { + //For SelfNamedPlugins, map key is: + // CLASS-NAME + FILTER_PLUGIN_SEPARATOR + PLUGIN-NAME + //For other MediaFilters, map key is just: + // CLASS-NAME + filterFormats.put(filterClassName + + (pluginName!=null ? FILTER_PLUGIN_SEPARATOR + pluginName : ""), + Arrays.asList(formats.split(",[\\s]*"))); + } + }//end if filter!=null + }//end for + + //If verbose, print out loaded mediafilter info + if(isVerbose) + { + System.out.println("The following MediaFilters are enabled: "); + java.util.Iterator i = filterFormats.keySet().iterator(); + while(i.hasNext()) + { + String filterName = (String) i.next(); + System.out.println("Full Filter Name: " + filterName); + String pluginName = null; + if(filterName.contains(FILTER_PLUGIN_SEPARATOR)) + { + String[] fields = filterName.split(FILTER_PLUGIN_SEPARATOR); + filterName=fields[0]; + pluginName=fields[1]; + } + + System.out.println(filterName + + (pluginName!=null? 
" (Plugin: " + pluginName + ")": "")); + } + } + + //store our filter list into an internal array (typed FormatFilter[]: filters need not extend MediaFilter) + filterClasses = (FormatFilter[]) filterList.toArray(new FormatFilter[filterList.size()]); + Context c = null; try @@ -270,6 +407,11 @@ public class MediaFilterManager public static void applyFiltersItem(Context c, Item item) throws Exception { + //cache this item in MediaFilterManager + //so it can be accessed by MediaFilters as necessary + currentItem = item; + + if (filterItem(c, item)) { // commit changes after each filtered item @@ -277,8 +419,9 @@ public class MediaFilterManager // increment processed count ++processed; } - // clear item objects from context cache + // clear item objects from context cache and internal cache item.decache(); + currentItem = null; } /** @@ -325,14 +468,33 @@ public class MediaFilterManager // by more than one filter for (int i = 0; i < filterClasses.length; i++) { - List fmts = (List)filterFormats.get(filterClasses[i].getClass().getName()); - if (fmts.contains(myBitstream.getFormat().getShortDescription())) + //List fmts = (List)filterFormats.get(filterClasses[i].getClass().getName()); + String pluginName = null; + + //if this filter class is a SelfNamedPlugin, + //its list of supported formats is different for + //differently named "plugin" + if( SelfNamedPlugin.class.isAssignableFrom(filterClasses[i].getClass()) ) + { + //get plugin instance name for this media filter + pluginName = ((SelfNamedPlugin)filterClasses[i]).getPluginInstanceName(); + } + + //Get list of supported formats for the filter (and possibly named plugin) + //For SelfNamedPlugins, map key is: + // CLASS-NAME + FILTER_PLUGIN_SEPARATOR + PLUGIN-NAME + //For other MediaFilters, map key is just: + // CLASS-NAME + List fmts = (List)filterFormats.get(filterClasses[i].getClass().getName() + + (pluginName!=null ? 
FILTER_PLUGIN_SEPARATOR + pluginName : "")); + //fmts is null when no inputFormats key was configured for this filter + if (fmts != null && fmts.contains(myBitstream.getFormat().getShortDescription())) { try { // only update item if bitstream not skipped - if (filterClasses[i].processBitstream(c, myItem, myBitstream)) - { + if (processBitstream(c, myItem, myBitstream, filterClasses[i])) + { myItem.update(); // Make sure new bitstream has a sequence // number filtered = true; @@ -348,4 +510,135 @@ public class MediaFilterManager } return filtered; } + + /** + * processBitstream is a utility method that calls the methods + * of the given FormatFilter. + * It scans the bitstreams in an item, and decides if a bitstream has + * already been filtered, and if not or if overWrite is set, invokes the + * filter. + * + * @param c + * context + * @param item + * item containing bitstream to process + * @param source + * source bitstream to process + * @param formatFilter + * FormatFilter to perform filtering + * + * @return true if new rendition is created, false if pre-processing fails or + * rendition already exists and overWrite is not set + */ + public static boolean processBitstream(Context c, Item item, Bitstream source, FormatFilter formatFilter) + throws Exception + { + //do pre-processing of this bitstream, and if it fails, skip this bitstream! + if(!formatFilter.preProcessBitstream(c, item, source)) + return false; + + boolean overWrite = MediaFilterManager.isForce; + + // get bitstream filename, calculate destination filename + String newName = formatFilter.getFilteredName(source.getName()); + + Bitstream existingBitstream = null; // is there an existing rendition? + Bundle targetBundle = null; // bundle we're modifying + + Bundle[] bundles = item.getBundles(formatFilter.getBundleName()); + + // check if destination bitstream exists + if (bundles.length > 0) + { + // only finds the last match (FIXME?) 
+ for (int i = 0; i < bundles.length; i++) + { + Bitstream[] bitstreams = bundles[i].getBitstreams(); + + for (int j = 0; j < bitstreams.length; j++) + { + if (bitstreams[j].getName().equals(newName)) + { + targetBundle = bundles[i]; + existingBitstream = bitstreams[j]; + } + } + } + } + + // if exists and overwrite = false, exit + if (!overWrite && (existingBitstream != null)) + { + System.out.println("SKIPPED: bitstream " + source.getID() + + " because '" + newName + "' already exists"); + + return false; + } + + InputStream destStream = formatFilter.getDestinationStream(source.retrieve()); + + // create new bundle if needed + if (bundles.length < 1) + { + targetBundle = item.createBundle(formatFilter.getBundleName()); + } + else + { + // take the first match + targetBundle = bundles[0]; + } + + Bitstream b = targetBundle.createBitstream(destStream); + + // Now set the format and name of the bitstream + b.setName(newName); + b.setSource("Written by FormatFilter " + formatFilter.getClass().getName() + + " on " + DCDate.getCurrent() + " (GMT)."); + b.setDescription(formatFilter.getDescription()); + + // Find the proper format + BitstreamFormat bf = BitstreamFormat.findByShortDescription(c, + formatFilter.getFormatString()); + b.setFormat(bf); + b.update(); + + //UIUC change - inherit policies from the source bitstream + //(first remove any existing policies) + AuthorizeManager.removeAllPolicies(c, b); + AuthorizeManager.inheritPolicies(c, source, b); + + // fixme - set date? + // we are overwriting, so remove old bitstream + if (existingBitstream != null) + { + targetBundle.removeBitstream(existingBitstream); + } + + System.out.println("FILTERED: bitstream " + source.getID() + + " and created '" + newName + "'"); + + //do post-processing of the generated bitstream + formatFilter.postProcessBitstream(c, item, b); + + return true; + } + + /** + * Return the item that is currently being processed/filtered + * by the MediaFilterManager + *

+ * This allows FormatFilters to retrieve the Item object + * in case they need access to item-level information for their format + * transformations/conversions. + * + * @return current Item being processed by MediaFilterManager + */ + public static Item getCurrentItem() + { + return currentItem; + } + + + /** END UIUC Change **/ + } \ No newline at end of file diff --git a/dspace/CHANGES b/dspace/CHANGES index 4a309f834e..abcdd1cb8c 100644 --- a/dspace/CHANGES +++ b/dspace/CHANGES @@ -61,6 +61,7 @@ SF Patch 1794700 Bug fix for stat-monthly and stat-report-monthly (Tim Donohue) - Added Configurable Submission to JSP-UI and XML-UI (updated version of SF Patch #1691345) +- SF Patch #1589429 "Self-Named" Media Filters (i.e. MediaFilter Plugins) (updated version of this patch) (Stuart Lewis) - SF Patch #1737792 Patch for bug 1552760 - Submit interface looks bad in Safari diff --git a/dspace/config/dspace.cfg b/dspace/config/dspace.cfg index 5a26f7395f..9ed1355c55 100644 --- a/dspace/config/dspace.cfg +++ b/dspace/config/dspace.cfg @@ -784,15 +784,25 @@ ldap.enable = false # will be left blank in the new eperson object. 
#ldap.phone_field = telephoneNumber -#### Media Filter plugins (through PluginManager) #### -plugin.sequence.org.dspace.app.mediafilter.MediaFilter = \ - org.dspace.app.mediafilter.PDFFilter, org.dspace.app.mediafilter.HTMLFilter, \ - org.dspace.app.mediafilter.WordFilter, org.dspace.app.mediafilter.JPEGFilter -# to enable branded preview: remove last line above, and uncomment 2 lines below -# org.dspace.app.mediafilter.WordFilter, org.dspace.app.mediafilter.JPEGFilter, \ -# org.dspace.app.mediafilter.BrandedPreviewJPEGFilter - +#### Media Filter / Format Filter plugins (through PluginManager) #### + +#Names of the enabled MediaFilter or FormatFilter plugins +filter.plugins = PDF Text Extractor, HTML Text Extractor, \ + Word Text Extractor, JPEG Thumbnail +# [To enable Branded Preview]: remove last line above, and uncomment 2 lines below +# Word Text Extractor, JPEG Thumbnail, \ +# Branded Preview JPEG + +#Assign 'human-understandable' names to each filter +plugin.named.org.dspace.app.mediafilter.FormatFilter = \ + org.dspace.app.mediafilter.PDFFilter = PDF Text Extractor, \ + org.dspace.app.mediafilter.HTMLFilter = HTML Text Extractor, \ + org.dspace.app.mediafilter.WordFilter = Word Text Extractor, \ + org.dspace.app.mediafilter.JPEGFilter = JPEG Thumbnail, \ + org.dspace.app.mediafilter.BrandedPreviewJPEGFilter = Branded Preview JPEG + +#Configure each filter's input format(s) filter.org.dspace.app.mediafilter.PDFFilter.inputFormats = Adobe PDF filter.org.dspace.app.mediafilter.HTMLFilter.inputFormats = HTML, Text filter.org.dspace.app.mediafilter.WordFilter.inputFormats = Microsoft Word diff --git a/dspace/docs/application.html b/dspace/docs/application.html index 62108cb2a9..88cfd27df8 100644 --- a/dspace/docs/application.html +++ b/dspace/docs/application.html @@ -949,41 +949,64 @@ dsrun org.dspace.app.itemimport.ItemImport -a -e joe@user.com -c collectionID -

MediaFilters: Transforming DSpace Content

-

DSpace can apply filters to content/bitstreams, creating new content. Filters are included that extract text for full-text searching, and create thumbnails for items that contain images. The media filters are controlled by the MediaFilterManager which traverses the asset store, invoking the MediaFilter subclasses on bitstreams. The MediaFilter plugin config item plugin.named.org.dspace.app.mediafilter.MediaFilter in dspace.cfg contains a list of bitstream format types and the filters that operate on bitstreams of that type. The media filter system is intended to be run from the command line (or regularly as a cron task):

+

DSpace can apply filters to content/bitstreams, creating new content. Filters are included that extract text for full-text searching, and create thumbnails for items that contain images. + The media filters are controlled by the MediaFilterManager which traverses the asset store, invoking the MediaFilter or FormatFilter classes on bitstreams. + The media filter plugin configuration filter.plugins in dspace.cfg contains a list of all enabled media/format filter plugins (see Configuring Media Filters for more information). + The media filter system is intended to be run from the command line (or regularly as a cron task):

-dspace/bin/filter-media
+[dspace]/bin/filter-media
 
-

Traverse the asset store, applying media filters to bitstreams, skipping bitstreams that have already been filtered.

-
-dspace/bin/filter-media -f
-
- -

Apply filters to ALL bitstreams, even if they've already been filtered.

-
-dspace/bin/filter-media -v
-
- -

Verbose mode - print all extracted text and other filter details to STDOUT.

-
-dspace/bin/filter-media -n
-
- -

Suppress index creation - by default, a new search index is created for full-text searching. This option suppresses index creation if you intend to run index-all elsewhere.

+

With no options, this traverses the asset store, applying media filters to bitstreams, and skipping bitstreams that have already been filtered.

-
-dspace/bin/filter-media -i 123456789/2
-
+

Available Command-Line Options:

-

Restrict processing to the community, collection, or item named by the identifier - by default, all bitstreams of all items in the repository are processed. The identifier must be a handle, not a DB key. This option may be combined with any other option.

+ -

Adding your own filters is done by creating a sub-class of the MediaFilter class. See the comments in the source file MediaFilter.java for more information. In theory filters could be implemented in any language (C, Perl, etc.) They only need to be invoked by the Java code in the MediaFilter class that you create.

+

Adding your own filters is done by creating a class which implements the org.dspace.app.mediafilter.FormatFilter interface. See the Creating a new Media Filter topic and comments in the source file FormatFilter.java for more information. In theory filters could be implemented in any programming language (C, Perl, etc.). However, they need to be invoked by the Java code in the Media Filter class that you create.

Sub-Community Management

diff --git a/dspace/docs/business.html b/dspace/docs/business.html index 4342a6aa55..4d22dd6ae0 100644 --- a/dspace/docs/business.html +++ b/dspace/docs/business.html @@ -425,7 +425,7 @@ packager plugin's implementation.

See the getNamedPlugin() method and the getPluginNames() methods.

-

Self-Named Plugins

+

Self-Named Plugins

Named plugins can get their names either from the configuration or, for a variant called self-named plugins, from within the plugin itself.

@@ -513,7 +513,7 @@ packager plugin's implementation.

SelfNamedPlugin Class

-

A named plugin implementation must extend this class if it wants to supply its own Plugin Name(s). See Self-Named Plugins for why this is sometimes necessary.

+

A named plugin implementation must extend this class if it wants to supply its own Plugin Name(s). See Self-Named Plugins for why this is sometimes necessary.

abstract class SelfNamedPlugin
 {
@@ -544,11 +544,11 @@ packager plugin's implementation.

This is a RuntimeException so it doesn't have to be declared, and can be passed all the way up to a generalized fatal exception handler.

-

Configuring Plugins

+

Configuring Plugins

All of the Plugin Manager's configuration comes from the DSpace Configuration Manager, which is a Java Properties map. You can configure these characteristics of each plugin:

    -
  1. Interface: Classname of the Java interface which defines the plugin, including package name. e.g. org.dspace.app.mediafilter.MediaFilter
  2. +
  3. Interface: Classname of the Java interface which defines the plugin, including package name. e.g. org.dspace.app.mediafilter.FormatFilter
  4. Implementation Class: Classname of the implementation class, including package. e.g. org.dspace.app.mediafilter.PDFFilter
  5. Names: (Named plugins only) There are two ways to bind names to plugins: listing them in the value of a plugin.named.interface key, or configuring a class in plugin.selfnamed.interface which extends the SelfNamedPlugin class.
  6. Reusable option: (Optional) This is declared in a plugin.reusable configuration line. Plugins are reusable by default, so you only need to configure the non-reusable ones.
  7. diff --git a/dspace/docs/configure.html b/dspace/docs/configure.html index 24274846e9..d796cb2c6b 100644 --- a/dspace/docs/configure.html +++ b/dspace/docs/configure.html @@ -1141,28 +1141,132 @@ search.index.11 = id:dc.identifier.*

    NOTE: While the indexes are created, this only affects the search results and has no effect on the search components of the user interface. To add new search capability (e.g. to add a new search category to the Advanced Search) requires local customisation to the user interface.

    -

    Media Filters

    +

    Configuring Media Filters

    -

    Media Filters are classes used to generate derivative or alternative versions of master bitstreams. For example, the PDF Media Filter will extract textual content from PDF bitstreams, the JPEG Media Filter can create thumbnails from image bitstreams.

    +

    Media or Format Filters are classes used to generate derivative or alternative versions of content or bitstreams within DSpace. For example, the PDF Media Filter will extract textual content from PDF bitstreams, the JPEG Media Filter can create thumbnails from image bitstreams.

    -

    Media Filters are configured as a Sequence Plugin, with each filter also having a separate config item indicating which formats it can process. The default configuration is shown below.

    +

    Media Filters are configured as Named Plugins, with each filter also having a separate configuration setting (in dspace.cfg) indicating which formats it can process. The default configuration is shown below.

    -

    #### Media Filter plugins (through PluginManager) ####
    +

    +	#### Media Filter / Format Filter plugins (through PluginManager) ####
     
    -plugin.sequence.org.dspace.app.mediafilter.MediaFilter = \
    -    org.dspace.app.mediafilter.PDFFilter, org.dspace.app.mediafilter.HTMLFilter, \
    -    org.dspace.app.mediafilter.WordFilter, org.dspace.app.mediafilter.JPEGFilter
    -# to enable branded preview: remove last line above, and uncomment 2 lines below
    -#   org.dspace.app.mediafilter.WordFilter, org.dspace.app.mediafilter.JPEGFilter, \
    -#   org.dspace.app.mediafilter.BrandedPreviewJPEGFilter
    +	#Names of the enabled MediaFilter or FormatFilter plugins
    +	filter.plugins = PDF Text Extractor, HTML Text Extractor, \
    +						Word Text Extractor, JPEG Thumbnail
    +	# to enable branded preview: remove last line above, and uncomment 2 lines below
    +	#   					Word Text Extractor, JPEG Thumbnail, \
    +	#   					Branded Preview JPEG
     
    -filter.org.dspace.app.mediafilter.PDFFilter.inputFormats = Adobe PDF
    -filter.org.dspace.app.mediafilter.HTMLFilter.inputFormats = HTML, Text
    -filter.org.dspace.app.mediafilter.WordFilter.inputFormats = Microsoft Word
    -filter.org.dspace.app.mediafilter.JPEGFilter.inputFormats = GIF, JPEG, image/png
    -filter.org.dspace.app.mediafilter.BrandedPreviewJPEGFilter.inputFormats = GIF, JPEG, image/png

    + #Assign 'human-understandable' names to each filter + plugin.named.org.dspace.app.mediafilter.FormatFilter = \ + org.dspace.app.mediafilter.PDFFilter = PDF Text Extractor, \ + org.dspace.app.mediafilter.HTMLFilter = HTML Text Extractor, \ + org.dspace.app.mediafilter.WordFilter = Word Text Extractor, \ + org.dspace.app.mediafilter.JPEGFilter = JPEG Thumbnail, \ + org.dspace.app.mediafilter.BrandedPreviewJPEGFilter = Branded Preview JPEG -

    To add a new Media Filter, add the new filter class to the plugin.sequence.org.dspace.app.mediafilter.MediaFilter config item and add a corresponding filter.<class path>.inputFormats config item. Note the input formats must match the short_description field in the bitstreamformatregistry table.

    + #Configure each filter's input format(s) + filter.org.dspace.app.mediafilter.PDFFilter.inputFormats = Adobe PDF + filter.org.dspace.app.mediafilter.HTMLFilter.inputFormats = HTML, Text + filter.org.dspace.app.mediafilter.WordFilter.inputFormats = Microsoft Word + filter.org.dspace.app.mediafilter.JPEGFilter.inputFormats = GIF, JPEG, image/png + filter.org.dspace.app.mediafilter.BrandedPreviewJPEGFilter.inputFormats = GIF, JPEG, image/png

    + +

    The enabled Media/Format Filters are named in the filter.plugins field above.

    +

    Names are assigned to each filter using the plugin.named.org.dspace.app.mediafilter.FormatFilter field +(e.g. by default the PDFFilter is named "PDF Text Extractor").

    +

    Finally the appropriate filter.<class path>.inputFormats defines the valid input formats which each filter can be applied to. These +format names must match the short description field of the Bitstream Format Registry.

    + +

    You can also implement more dynamic or configurable Media/Format Filters which extend SelfNamedPlugin. +More information is provided below in Creating a new Media/Format Filter

    + +

    Creating a new Media/Format Filter

    + +

    Creating a simple Media Filter

    +

    New Media Filters must implement the org.dspace.app.mediafilter.FormatFilter interface. More information on the methods you need to implement is provided in the FormatFilter.java source file. For example: +

    +	public class MySimpleMediaFilter implements FormatFilter
    +

    + +

    Alternatively, you could extend the org.dspace.app.mediafilter.MediaFilter class, which just defaults to performing no pre/post-processing of bitstreams before or after filtering. +

    +	public class MySimpleMediaFilter extends MediaFilter
    +

    + +

    You must give your new filter a "name", by adding it and its name to the plugin.named.org.dspace.app.mediafilter.FormatFilter field in dspace.cfg. + In addition to naming your filter, make sure to specify its input formats in the + filter.<class path>.inputFormats config item. Note the input formats must match the short description field in the Bitstream Format Registry (i.e. bitstreamformatregistry table). +

    +	plugin.named.org.dspace.app.mediafilter.FormatFilter = \
    +  		org.dspace.app.mediafilter.MySimpleMediaFilter = My Simple Text Filter, \
    +  		...
    +  	
    +  	filter.org.dspace.app.mediafilter.MySimpleMediaFilter.inputFormats = Text
    +

    + + WARNING: If you neglect to define the inputFormats for + a particular filter, the MediaFilterManager will never call that filter, since it will never find a bitstream which has a format + matching that filter's input format(s). + +

    If you have a complex Media Filter class, which actually performs different filtering for different formats (e.g. conversion from Word to PDF and conversion from Excel to CSV), you should define this as a Dynamic / Self-Named Format Filter. + +

    Creating a Dynamic or "Self-Named" Format Filter

    +

    If you have a more complex Media/Format Filter, which actually performs multiple filtering or conversions for different formats (e.g. conversion from Word to PDF and conversion from Excel to CSV), you should define a class which implements the FormatFilter interface, + while also extending the SelfNamedPlugin class. For example: +

    +	public class MyComplexMediaFilter extends SelfNamedPlugin implements FormatFilter
    +

    + +

    Since SelfNamedPlugins are self-named (as stated), they must provide the various names the plugin uses by defining a getPluginNames() method. + Generally speaking, each "name" the plugin uses should correspond to a different type of filter it implements (e.g. "Word2PDF" and "Excel2CSV" are two good names for a complex media filter which performs both Word to PDF and Excel to CSV conversions). +

    + +

    Self-Named Media/Format Filters are also configured differently in dspace.cfg. Below is a general template for a Self Named Filter (defined by an imaginary MyComplexMediaFilter class, which + can perform both Word to PDF and Excel to CSV conversions):

    + +

    +	#Add to a list of all Self Named filters
    +	plugin.selfnamed.org.dspace.app.mediafilter.FormatFilter = \
    +  		org.dspace.app.mediafilter.MyComplexMediaFilter
    +  		
    +	#Define input formats for each "named" plugin this filter implements
    +	filter.org.dspace.app.mediafilter.MyComplexMediaFilter.Word2PDF.inputFormats = Microsoft Word
    +	filter.org.dspace.app.mediafilter.MyComplexMediaFilter.Excel2CSV.inputFormats = Microsoft Excel
    +

    + +

    As shown above, each Self-Named Filter class must be listed in the plugin.selfnamed.org.dspace.app.mediafilter.FormatFilter item in dspace.cfg. + In addition, each Self-Named Filter must define the input formats for each named plugin defined by that filter. + In the above example the MyComplexMediaFilter class is assumed to have defined two named plugins, Word2PDF and Excel2CSV. + So, these two valid plugin names ("Word2PDF" and "Excel2CSV") must be returned by the getPluginNames() method of the MyComplexMediaFilter class.

    +

    These named plugins take different input formats as defined above (see the corresponding inputFormats setting). WARNING: If you neglect to define the inputFormats for + a particular named plugin, the MediaFilterManager will never call that plugin, since it will never find a bitstream which has a format + matching that plugin's input format(s). +

    + +

    For a particular Self-Named Filter, you are also welcome to define additional configuration settings in dspace.cfg. + To continue with our current example, each of our imaginary plugins actually results in a different output format (Word2PDF creates "Adobe PDF", while Excel2CSV creates "Comma Separated Values"). + To allow this complex Media Filter to be even more configurable (especially across institutions, with potentially different "Bitstream Format Registries"), you may + wish to allow for the output format to be customizable for each named plugin. For example:

    + +

    +	#Define output formats for each named plugin
    +	filter.org.dspace.app.mediafilter.MyComplexMediaFilter.Word2PDF.outputFormat = Adobe PDF
    +	filter.org.dspace.app.mediafilter.MyComplexMediaFilter.Excel2CSV.outputFormat = Comma Separated Values
    +

    + +

    Any custom configuration fields in dspace.cfg defined by your filter are ignored by the MediaFilterManager, so it is up to your custom media filter class to read those configurations and apply them as necessary. + For example, you could use the following sample Java code in your MyComplexMediaFilter class to read these custom outputFormat configurations from dspace.cfg : +

    +	//get "outputFormat" configuration from dspace.cfg
    +	String outputFormat = ConfigurationManager.getProperty(MediaFilterManager.FILTER_PREFIX + "." + 
    +		MyComplexMediaFilter.class.getName() + "." + this.getPluginInstanceName() + ".outputFormat");
    +

    + +

    Running Media/Format Filter

    + +

    Information on scheduling Media/Format Filters to run is available in the section: + MediaFilters: Transforming DSpace Content

    Configuring System Statistical Reports

    @@ -1260,8 +1364,8 @@ stat-report-monthly

    To activate this feature and display a preview image on the item page (all properties mentioned below are found in dspace.cfg):

      -
    1. Uncomment the lines defining the list of configured mediafilters at -plugin.sequence.org.dspace.app.mediafilter.MediaFilter to include org.dspace.app.mediafilter.BrandedPreviewJPEGFilter.
    2. +
    3. Uncomment the lines defining the list of enabled mediafilters at +filter.plugins to include Branded Preview JPEG.
    4. Set the maximum pixel dimensions for the preview image by altering the webui.preview.maxwidth and webui.preview.maxheight (default is 600) config items.
    5. Set the webui.preview.brand to the text you want to brand the image with. The brand will appear as white text on a black background strip across the base of the image. For example you might set the text to the owning organisation. The handle is also displayed as part of the branding.
    6. Set the webui.preview.brand.abbrev. This is an abbreviated form of the webui.preview.brand text and will be shown where the brand text is longer than the image width (e.g. for narrow images).
    7. @@ -1282,7 +1386,7 @@ stat-report-monthly

      webui.browse.thumbnail.show = true

      -

      If set to false or this configuration item is missing then thumbnails will not be shown. Additionally, appropriate media filters must be configured and the media filter configured to run periodically (for example, via a 'cron' job)

      +

      If set to false or this configuration item is missing then thumbnails will not be shown. Additionally, the appropriate "JPEG Thumbnail" media filter must be configured and the filter-media script configured to run periodically (for example, via a 'cron' job)

      The size of the browse/search thumbnails can also be configured to a smaller size than that generated by the mediafilter. To do this set the following configuration items:

      @@ -1586,7 +1690,7 @@ bin/checker -L # Loops continuously through the repository
  • passing in a properties file containing retention policies when using the -p option.
  • -

    Pruning is controlled by a number of properties, each of which describes a checksum result code, and the length of time for which results with that code should be retained. The format is checker.retention.[RESULT CODE]=[duration]. For example: -

    +

    Pruning is controlled by a number of properties, each of which describes a checksum result code, and the length of time for which results with that code should be retained. The format is checker.retention.[RESULT CODE]=[duration]. For example: -

    checker.retention.CHECKSUM_MATCH=8w
    diff --git a/dspace/docs/index.html b/dspace/docs/index.html index e1dca65bf9..42db0e565d 100644 --- a/dspace/docs/index.html +++ b/dspace/docs/index.html @@ -86,7 +86,7 @@
  • Configuring LDAP Authentication
  • Configuring Lucene Search Indexes
  • Configuring System Statistical Reports
  • -
  • MediaFilters
  • +
  • Configuring MediaFilters
  • Displaying Image Item Preview
  • Displaying Image Thumbnails
  • Displaying Community and Collection Item Counts