Small addition of a "Skip" flag to MediaFilterManager/filter-media. Allows one to specify a list of identifiers (communities/collections/items) to SKIP during the filtering process. Updated documentation to explain this new flag.

git-svn-id: http://scm.dspace.org/svn/repo/branches/dspace-1_5_x@2410 9c30dcfa-912a-0410-8fc2-9e0234be79fd
This commit is contained in:
Tim Donohue
2007-11-30 18:14:41 +00:00
parent 6e377ffd94
commit e891212fdf
2 changed files with 120 additions and 33 deletions

View File

@@ -113,6 +113,8 @@ public class MediaFilterManager
private static Map filterFormats = new HashMap();
private static List skipList = null; //list of identifiers to skip during processing
//separator in filterFormats Map between a filter class name and a plugin name,
//for MediaFilters which extend SelfNamedPlugin (\034 is "file separator" char)
public static String FILTER_PLUGIN_SEPARATOR = "\034";
@@ -153,6 +155,17 @@ public class MediaFilterManager
pluginOption.setArgs(Option.UNLIMITED_VALUES); //unlimited number of args
options.addOption(pluginOption);
//create a "skip" option (to specify communities/collections/items to skip)
OptionBuilder.withLongOpt("skip");
OptionBuilder.withValueSeparator(',');
OptionBuilder.withDescription(
"SKIP the bitstreams belonging to identifier\n" +
"Separate multiple identifiers with a comma (,)\n" +
"(e.g. MediaFilterManager -s \n 123456789/34,123456789/323)");
Option skipOption = OptionBuilder.create('s');
skipOption.setArgs(Option.UNLIMITED_VALUES); //unlimited number of args
options.addOption(skipOption);
CommandLine line = null;
try
{
@@ -309,6 +322,28 @@ public class MediaFilterManager
//store our filter list into an internal array
filterClasses = (FormatFilter[]) filterList.toArray(new FormatFilter[filterList.size()]);
//Retrieve list of identifiers to skip (if any)
String skipIds[] = null;
if(line.hasOption('s'))
{
//specified which identifiers to skip when processing
skipIds = line.getOptionValues('s');
if(skipIds==null || skipIds.length==0)
{ //display error, since no identifiers specified to skip
System.err.println("\nERROR: -s (-skip) option requires at least one identifier to SKIP.\n" +
"Make sure to separate multiple identifiers with a comma!\n" +
"(e.g. MediaFilterManager -s 123456789/34,123456789/323)\n");
HelpFormatter myhelp = new HelpFormatter();
myhelp.printHelp("MediaFilterManager\n", options);
System.exit(0);
}
//save to a global skip list
skipList = Arrays.asList(skipIds);
}
Context c = null;
try
@@ -372,15 +407,30 @@ public class MediaFilterManager
public static void applyFiltersAllItems(Context c) throws Exception
{
    // Apply the filters to every item reachable from the given context.
    // Two traversal strategies are used, depending on whether a skip-list
    // was supplied. Both stop once 'processed' reaches 'max2Process'
    // (presumably the limit set by the -m option -- confirm against main()).
    if (skipList != null)
    {
        // A skip-list exists, so walk the hierarchy community-by-community:
        // that is the only way a skipped community or collection can
        // exclude everything beneath it.
        Community[] topLevelCommunities = Community.findAllTop(c);

        // Stop descending once the processing limit has been reached,
        // consistent with the item loops here and in applyFiltersCollection;
        // otherwise the remaining hierarchy would be traversed for nothing.
        for (int i = 0;
             i < topLevelCommunities.length && processed < max2Process;
             i++)
        {
            applyFiltersCommunity(c, topLevelCommunities[i]);
        }
    }
    else
    {
        // No skip-list: simply iterate over every item and process it.
        ItemIterator i = Item.findAll(c);
        while (i.hasNext() && processed < max2Process)
        {
            applyFiltersItem(c, i.next());
        }
    }
}
public static void applyFiltersCommunity(Context c, Community community)
throws Exception
{ //only apply filters if community not in skip-list
if(!inSkipList(community.getHandle()))
{
Community[] subcommunities = community.getSubcommunities();
for (int i = 0; i < subcommunities.length; i++)
@@ -394,9 +444,13 @@ public class MediaFilterManager
applyFiltersCollection(c, collections[j]);
}
}
}
public static void applyFiltersCollection(Context c, Collection collection)
throws Exception
{
//only apply filters if collection not in skip-list
if(!inSkipList(collection.getHandle()))
{
ItemIterator i = collection.getItems();
while (i.hasNext() && processed < max2Process)
@@ -404,14 +458,17 @@ public class MediaFilterManager
applyFiltersItem(c, i.next());
}
}
}
public static void applyFiltersItem(Context c, Item item) throws Exception
{
//only apply filters if item not in skip-list
if(!inSkipList(item.getHandle()))
{
//cache this item in MediaFilterManager
//so it can be accessed by MediaFilters as necessary
currentItem = item;
if (filterItem(c, item))
{
// commit changes after each filtered item
@@ -423,6 +480,7 @@ public class MediaFilterManager
item.decache();
currentItem = null;
}
}
/**
* iterate through the item's bitstreams in the ORIGINAL bundle, applying
@@ -638,4 +696,24 @@ public class MediaFilterManager
return currentItem;
}
/**
 * Decide whether the object with the given identifier is excluded from
 * processing because it appears in the skip-list.
 *
 * @param identifier
 *            identifier (handle) of a community, collection or item
 *
 * @return true if the identifier is in the skip-list (a notice is printed
 *         to standard output in that case); false otherwise, including
 *         when no skip-list has been set.
 */
public static boolean inSkipList(String identifier)
{
    // no skip-list at all, or this identifier is not on it: nothing to skip
    if (skipList == null || !skipList.contains(identifier))
    {
        return false;
    }

    System.out.println("SKIP-LIST: skipped bitstreams within identifier " + identifier);
    return true;
}
}

View File

@@ -966,43 +966,52 @@ dsrun org.dspace.app.itemimport.ItemImport -a -e joe@user.com -c collectionID -
<p><strong>Available Command-Line Options:</strong></p>
<ul>
<li><code>[dspace]/bin/filter-media -h</code>
<li><strong>Help</strong>: <code>[dspace]/bin/filter-media -h</code>
<ul>
<li>Display help message describing all command-line options.</li>
</ul>
</li>
<li><code>[dspace]/bin/filter-media -f</code>
<li><strong>Force mode</strong>: <code>[dspace]/bin/filter-media -f</code>
<ul>
<li>"Force" mode - Apply filters to ALL bitstreams, even if they've already been filtered. If they've already been filtered, the previously filtered content is overwritten.</li>
<li>Apply filters to ALL bitstreams, even if they've already been filtered. If they've already been filtered, the previously filtered content is overwritten.</li>
</ul>
</li>
<li><code>[dspace]/bin/filter-media -i 123456789/2</code>
<li><strong>Identifier mode</strong>: <code>[dspace]/bin/filter-media -i 123456789/2</code>
<ul>
<li>Restrict processing to the community, collection, or item named by the identifier - by default, all bitstreams of all items in the repository are processed. The identifier must be a Handle, not a DB key. This option may be combined with any other option.</li>
</ul>
</li>
<li><code>[dspace]/bin/filter-media -m 1000</code>
<li><strong>Maximum mode</strong>: <code>[dspace]/bin/filter-media -m 1000</code>
<ul>
<li>Suspend operation after the specified maximum number of items have been processed - by default, no limit exists. This option may be combined with any other option.</li>
</ul>
</li>
<li><code>[dspace]/bin/filter-media -n</code>
<li><strong>No-Index mode</strong>: <code>[dspace]/bin/filter-media -n</code>
<ul>
<li>Suppress index creation - by default, a new search index is created for full-text searching. This option suppresses index creation if you intend to run <code>index-all</code> elsewhere.</li>
</ul>
</li>
<li><code>[dspace]/bin/filter-media -p "PDF Text Extractor","Word Text Extractor"</code>
<li><strong>Plugin mode</strong>: <code>[dspace]/bin/filter-media -p "PDF Text Extractor","Word Text Extractor"</code>
<ul>
<li>Apply ONLY the filter plugin(s) listed (separated by commas). By default all named filters listed in the <code>filter.plugins</code> field of <code>dspace.cfg</code> are applied. This option may be combined with any other option. <em>WARNING:</em> multiple plugin names must be separated by a comma (i.e. ',') and NOT a comma followed by a space (i.e. ', ').</li>
</ul>
</li>
<li><code>[dspace]/bin/filter-media -v</code>
<li><strong>Skip mode</strong>: <code>[dspace]/bin/filter-media -s 123456789/9,123456789/100</code>
<ul>
<li>SKIP the listed identifiers (separated by commas) during processing. The identifiers must be Handles (not DB Keys). They may refer to items, collections or communities which should be skipped. This option may be combined with any other option. <em>WARNING:</em> multiple identifiers must be separated by a comma (i.e. ',') and NOT a comma followed by a space (i.e. ', ').</li>
<li>NOTE: If you have a large number of identifiers to skip, you may maintain this comma-separated list within a separate file (e.g. <code>filter-skiplist.txt</code>), and pass its contents to the command as follows:
<ul><li><code>[dspace]/bin/filter-media -s `cat filter-skiplist.txt`</code></li></ul>
</li>
</ul>
</li>
<li><strong>Verbose mode</strong>: <code>[dspace]/bin/filter-media -v</code>
<ul>
<li>Print all extracted text and other filter details to STDOUT.</li>
</ul>