From e891212fdf76a1a6802bedb35c12fa1b224a8791 Mon Sep 17 00:00:00 2001
From: Tim Donohue
Date: Fri, 30 Nov 2007 18:14:41 +0000
Subject: [PATCH] Small addition of a "Skip" flag to
MediaFilterManager/filter-media. Allows one to specify a list of identifiers
(communities/collections/items) to SKIP during the filtering process. Updated
documentation to explain this new flag.
git-svn-id: http://scm.dspace.org/svn/repo/branches/dspace-1_5_x@2410 9c30dcfa-912a-0410-8fc2-9e0234be79fd
---
.../app/mediafilter/MediaFilterManager.java | 128 ++++++++++++++----
dspace/docs/application.html | 25 ++--
2 files changed, 120 insertions(+), 33 deletions(-)
diff --git a/dspace-api/src/main/java/org/dspace/app/mediafilter/MediaFilterManager.java b/dspace-api/src/main/java/org/dspace/app/mediafilter/MediaFilterManager.java
index f85b92d465..a579738e77 100644
--- a/dspace-api/src/main/java/org/dspace/app/mediafilter/MediaFilterManager.java
+++ b/dspace-api/src/main/java/org/dspace/app/mediafilter/MediaFilterManager.java
@@ -113,6 +113,8 @@ public class MediaFilterManager
private static Map filterFormats = new HashMap();
+ private static List skipList = null; //list of identifiers to skip during processing
+
//separator in filterFormats Map between a filter class name and a plugin name,
//for MediaFilters which extend SelfNamedPlugin (\034 is "file separator" char)
public static String FILTER_PLUGIN_SEPARATOR = "\034";
@@ -151,8 +153,19 @@ public class MediaFilterManager
"(e.g. MediaFilterManager -p \n\"Word Text Extractor\",\"PDF Text Extractor\")");
Option pluginOption = OptionBuilder.create('p');
pluginOption.setArgs(Option.UNLIMITED_VALUES); //unlimited number of args
- options.addOption(pluginOption);
-
+ options.addOption(pluginOption);
+
+ //create a "skip" option (to specify communities/collections/items to skip)
+ OptionBuilder.withLongOpt("skip");
+ OptionBuilder.withValueSeparator(',');
+ OptionBuilder.withDescription(
+ "SKIP the bitstreams belonging to identifier\n" +
+ "Separate multiple identifiers with a comma (,)\n" +
+ "(e.g. MediaFilterManager -s \n 123456789/34,123456789/323)");
+ Option skipOption = OptionBuilder.create('s');
+ skipOption.setArgs(Option.UNLIMITED_VALUES); //unlimited number of args
+ options.addOption(skipOption);
+
CommandLine line = null;
try
{
@@ -308,7 +321,29 @@ public class MediaFilterManager
//store our filter list into an internal array
filterClasses = (FormatFilter[]) filterList.toArray(new FormatFilter[filterList.size()]);
+
+
+ //Retrieve list of identifiers to skip (if any)
+ String skipIds[] = null;
+ if(line.hasOption('s'))
+ {
+ //specified which identifiers to skip when processing
+ skipIds = line.getOptionValues('s');
+ if(skipIds==null || skipIds.length==0)
+ { //display error, since no identifiers specified to skip
+ System.err.println("\nERROR: -s (-skip) option requires at least one identifier to SKIP.\n" +
+ "Make sure to separate multiple identifiers with a comma!\n" +
+ "(e.g. MediaFilterManager -s 123456789/34,123456789/323)\n");
+ HelpFormatter myhelp = new HelpFormatter();
+ myhelp.printHelp("MediaFilterManager\n", options);
+ System.exit(0);
+ }
+
+ //save to a global skip list
+ skipList = Arrays.asList(skipIds);
+ }
+
Context c = null;
try
@@ -372,46 +407,68 @@ public class MediaFilterManager
public static void applyFiltersAllItems(Context c) throws Exception
{
- ItemIterator i = Item.findAll(c);
- while (i.hasNext() && processed < max2Process)
+ if(skipList!=null)
+ {
+ //if a skip-list exists, we need to filter community-by-community
+ //so we can respect what is in the skip-list
+ Community[] topLevelCommunities = Community.findAllTop(c);
+
+ for(int i=0; iAvailable Command-Line Options:
-[dspace]/bin/filter-media -h
+- Help:
[dspace]/bin/filter-media -h
- Display help message describing all command-line options.
-[dspace]/bin/filter-media -f
+- Force mode:
[dspace]/bin/filter-media -f
- - "Force" mode - Apply filters to ALL bitstreams, even if they've already been filtered. If they've already been filtered, the previously filtered content is overwritten.
+ - Apply filters to ALL bitstreams, even if they've already been filtered. If they've already been filtered, the previously filtered content is overwritten.
-[dspace]/bin/filter-media -i 123456789/2
+- Identifier mode:
[dspace]/bin/filter-media -i 123456789/2
- Restrict processing to the community, collection, or item named by the identifier - by default, all bitstreams of all items in the repository are processed. The identifier must be a Handle, not a DB key. This option may be combined with any other option.
-[dspace]/bin/filter-media -m 1000
+- Maximum mode:
[dspace]/bin/filter-media -m 1000
- Suspend operation after the specified maximum number of items have been processed - by default, no limit exists. This option may be combined with any other option.
-[dspace]/bin/filter-media -n
+- No-Index mode:
[dspace]/bin/filter-media -n
- Suppress index creation - by default, a new search index is created for full-text searching. This option suppresses index creation if you intend to run
index-all
elsewhere.
-[dspace]/bin/filter-media -p "PDF Text Extractor","Word Text Extractor"
+- Plugin mode:
[dspace]/bin/filter-media -p "PDF Text Extractor","Word Text Extractor"
- Apply ONLY the filter plugin(s) listed (separated by commas). By default all named filters listed in the
filter.plugins
field of dspace.cfg
are applied. This option may be combined with any other option. WARNING: multiple plugin names must be separated by a comma (i.e. ',') and NOT a comma followed by a space (i.e. ', ').
+
+- Skip mode:
[dspace]/bin/filter-media -s 123456789/9,123456789/100
+
+ - SKIP the listed identifiers (separated by commas) during processing. The identifiers must be Handles (not DB keys). They may refer to items, collections or communities which should be skipped. This option may be combined with any other option. WARNING: multiple identifiers must be separated by a comma (i.e. ',') and NOT a comma followed by a space (i.e. ', ').
+ - NOTE: If you have a large number of identifiers to skip, you may maintain this comma-separated list within a separate file (e.g.
filter-skiplist.txt
), and pass its contents to the -s option as follows:
+ [dspace]/bin/filter-media -s `less filter-skiplist.txt`
+
+
+
-[dspace]/bin/filter-media -v
+- Verbose mode:
[dspace]/bin/filter-media -v
- Verbose mode - print all extracted text and other filter details to STDOUT.