mirror of
https://github.com/DSpace/DSpace.git
synced 2025-10-07 01:54:22 +00:00
Small addition of a "Skip" flag to MediaFilterManager/filter-media. Allows one to specify a list of identifiers (communities/collections/items) to SKIP during the filtering process. Updated documentation to explain this new flag.
git-svn-id: http://scm.dspace.org/svn/repo/branches/dspace-1_5_x@2410 9c30dcfa-912a-0410-8fc2-9e0234be79fd
This commit is contained in:
@@ -113,6 +113,8 @@ public class MediaFilterManager
|
||||
|
||||
private static Map filterFormats = new HashMap();
|
||||
|
||||
private static List skipList = null; //list of identifiers to skip during processing
|
||||
|
||||
//separator in filterFormats Map between a filter class name and a plugin name,
|
||||
//for MediaFilters which extend SelfNamedPlugin (\034 is "file separator" char)
|
||||
public static String FILTER_PLUGIN_SEPARATOR = "\034";
|
||||
@@ -151,8 +153,19 @@ public class MediaFilterManager
|
||||
"(e.g. MediaFilterManager -p \n\"Word Text Extractor\",\"PDF Text Extractor\")");
|
||||
Option pluginOption = OptionBuilder.create('p');
|
||||
pluginOption.setArgs(Option.UNLIMITED_VALUES); //unlimited number of args
|
||||
options.addOption(pluginOption);
|
||||
|
||||
options.addOption(pluginOption);
|
||||
|
||||
//create a "skip" option (to specify communities/collections/items to skip)
|
||||
OptionBuilder.withLongOpt("skip");
|
||||
OptionBuilder.withValueSeparator(',');
|
||||
OptionBuilder.withDescription(
|
||||
"SKIP the bitstreams belonging to identifier\n" +
|
||||
"Separate multiple identifiers with a comma (,)\n" +
|
||||
"(e.g. MediaFilterManager -s \n 123456789/34,123456789/323)");
|
||||
Option skipOption = OptionBuilder.create('s');
|
||||
skipOption.setArgs(Option.UNLIMITED_VALUES); //unlimited number of args
|
||||
options.addOption(skipOption);
|
||||
|
||||
CommandLine line = null;
|
||||
try
|
||||
{
|
||||
@@ -308,7 +321,29 @@ public class MediaFilterManager
|
||||
|
||||
//store our filter list into an internal array
|
||||
filterClasses = (FormatFilter[]) filterList.toArray(new FormatFilter[filterList.size()]);
|
||||
|
||||
|
||||
//Retrieve list of identifiers to skip (if any)
|
||||
String skipIds[] = null;
|
||||
if(line.hasOption('s'))
|
||||
{
|
||||
//specified which identifiers to skip when processing
|
||||
skipIds = line.getOptionValues('s');
|
||||
|
||||
if(skipIds==null || skipIds.length==0)
|
||||
{ //display error, since no identifiers specified to skip
|
||||
System.err.println("\nERROR: -s (-skip) option requires at least one identifier to SKIP.\n" +
|
||||
"Make sure to separate multiple identifiers with a comma!\n" +
|
||||
"(e.g. MediaFilterManager -s 123456789/34,123456789/323)\n");
|
||||
HelpFormatter myhelp = new HelpFormatter();
|
||||
myhelp.printHelp("MediaFilterManager\n", options);
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
//save to a global skip list
|
||||
skipList = Arrays.asList(skipIds);
|
||||
}
|
||||
|
||||
Context c = null;
|
||||
|
||||
try
|
||||
@@ -372,46 +407,68 @@ public class MediaFilterManager
|
||||
|
||||
public static void applyFiltersAllItems(Context c) throws Exception
|
||||
{
|
||||
ItemIterator i = Item.findAll(c);
|
||||
while (i.hasNext() && processed < max2Process)
|
||||
if(skipList!=null)
|
||||
{
|
||||
//if a skip-list exists, we need to filter community-by-community
|
||||
//so we can respect what is in the skip-list
|
||||
Community[] topLevelCommunities = Community.findAllTop(c);
|
||||
|
||||
for(int i=0; i<topLevelCommunities.length; i++)
|
||||
applyFiltersCommunity(c, topLevelCommunities[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
applyFiltersItem(c, i.next());
|
||||
//otherwise, just find every item and process
|
||||
ItemIterator i = Item.findAll(c);
|
||||
while (i.hasNext() && processed < max2Process)
|
||||
{
|
||||
applyFiltersItem(c, i.next());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static void applyFiltersCommunity(Context c, Community community)
|
||||
throws Exception
|
||||
{
|
||||
Community[] subcommunities = community.getSubcommunities();
|
||||
for (int i = 0; i < subcommunities.length; i++)
|
||||
{
|
||||
applyFiltersCommunity(c, subcommunities[i]);
|
||||
}
|
||||
|
||||
Collection[] collections = community.getCollections();
|
||||
for (int j = 0; j < collections.length; j++)
|
||||
{
|
||||
applyFiltersCollection(c, collections[j]);
|
||||
}
|
||||
{ //only apply filters if community not in skip-list
|
||||
if(!inSkipList(community.getHandle()))
|
||||
{
|
||||
Community[] subcommunities = community.getSubcommunities();
|
||||
for (int i = 0; i < subcommunities.length; i++)
|
||||
{
|
||||
applyFiltersCommunity(c, subcommunities[i]);
|
||||
}
|
||||
|
||||
Collection[] collections = community.getCollections();
|
||||
for (int j = 0; j < collections.length; j++)
|
||||
{
|
||||
applyFiltersCollection(c, collections[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static void applyFiltersCollection(Context c, Collection collection)
|
||||
throws Exception
|
||||
{
|
||||
ItemIterator i = collection.getItems();
|
||||
while (i.hasNext() && processed < max2Process)
|
||||
//only apply filters if collection not in skip-list
|
||||
if(!inSkipList(collection.getHandle()))
|
||||
{
|
||||
applyFiltersItem(c, i.next());
|
||||
ItemIterator i = collection.getItems();
|
||||
while (i.hasNext() && processed < max2Process)
|
||||
{
|
||||
applyFiltersItem(c, i.next());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static void applyFiltersItem(Context c, Item item) throws Exception
|
||||
{
|
||||
//cache this item in MediaFilterManager
|
||||
//so it can be accessed by MediaFilters as necessary
|
||||
currentItem = item;
|
||||
|
||||
|
||||
//only apply filters if item not in skip-list
|
||||
if(!inSkipList(item.getHandle()))
|
||||
{
|
||||
//cache this item in MediaFilterManager
|
||||
//so it can be accessed by MediaFilters as necessary
|
||||
currentItem = item;
|
||||
|
||||
if (filterItem(c, item))
|
||||
{
|
||||
// commit changes after each filtered item
|
||||
@@ -422,6 +479,7 @@ public class MediaFilterManager
|
||||
// clear item objects from context cache and internal cache
|
||||
item.decache();
|
||||
currentItem = null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -638,4 +696,24 @@ public class MediaFilterManager
|
||||
return currentItem;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check whether or not to skip processing the given identifier
|
||||
*
* The identifier is looked up in the static skipList, which is filled from
* the values of the -s command-line option. When skipList is null
* (no -s option was given) nothing is ever skipped.
*
* The lookup is an exact match via List.contains(), so only the
* identifiers literally present in the list match; callers check each
* community, collection and item handle separately.
*
* Side effect: every time an identifier is found in the list, a
* "SKIP-LIST: ..." message is printed to standard output.
|
||||
*
|
||||
* @param identifier
|
||||
* identifier (handle) of a community, collection or item
|
||||
*
|
||||
* @return true if this community, collection or item should be skipped
|
||||
* during processing. Otherwise, return false.
|
||||
*/
|
||||
public static boolean inSkipList(String identifier)
|
||||
{
|
||||
// null-check first: skipList stays null unless a skip option was supplied
if(skipList!=null && skipList.contains(identifier))
|
||||
{
|
||||
// report the skip on stdout so the operator can see what was left out
System.out.println("SKIP-LIST: skipped bitstreams within identifier " + identifier);
|
||||
return true;
|
||||
}
|
||||
else
|
||||
// no skip list, or identifier not listed: process normally
return false;
|
||||
}
|
||||
|
||||
}
|
@@ -966,43 +966,52 @@ dsrun org.dspace.app.itemimport.ItemImport -a -e joe@user.com -c collectionID -
|
||||
<p><strong>Available Command-Line Options:</strong></p>
|
||||
|
||||
<ul>
|
||||
<li><code>[dspace]/bin/filter-media -h</code>
|
||||
<li><strong>Help</strong>: <code>[dspace]/bin/filter-media -h</code>
|
||||
<ul>
|
||||
<li>Display help message describing all command-line options.</li>
|
||||
</ul>
|
||||
</li>
|
||||
|
||||
<li><code>[dspace]/bin/filter-media -f</code>
|
||||
<li><strong>Force mode</strong>: <code>[dspace]/bin/filter-media -f</code>
|
||||
<ul>
|
||||
<li>"Force" mode - Apply filters to ALL bitstreams, even if they've already been filtered. If they've already been filtered, the previously filtered content is overwritten.</li>
|
||||
<li>Apply filters to ALL bitstreams, even if they've already been filtered. If they've already been filtered, the previously filtered content is overwritten.</li>
|
||||
</ul>
|
||||
</li>
|
||||
|
||||
<li><code>[dspace]/bin/filter-media -i 123456789/2</code>
|
||||
<li><strong>Identifier mode</strong>: <code>[dspace]/bin/filter-media -i 123456789/2</code>
|
||||
<ul>
|
||||
<li>Restrict processing to the community, collection, or item named by the identifier - by default, all bitstreams of all items in the repository are processed. The identifier must be a Handle, not a DB key. This option may be combined with any other option.</li>
|
||||
</ul>
|
||||
</li>
|
||||
|
||||
<li><code>[dspace]/bin/filter-media -m 1000</code>
|
||||
<li><strong>Maximum mode</strong>: <code>[dspace]/bin/filter-media -m 1000</code>
|
||||
<ul>
|
||||
<li>Suspend operation after the specified maximum number of items have been processed - by default, no limit exists. This option may be combined with any other option.</li>
|
||||
</ul>
|
||||
</li>
|
||||
|
||||
<li><code>[dspace]/bin/filter-media -n</code>
|
||||
<li><strong>No-Index mode</strong>: <code>[dspace]/bin/filter-media -n</code>
|
||||
<ul>
|
||||
<li>Suppress index creation - by default, a new search index is created for full-text searching. This option suppresses index creation if you intend to run <code>index-all</code> elsewhere.</li>
|
||||
</ul>
|
||||
</li>
|
||||
|
||||
<li><code>[dspace]/bin/filter-media -p "PDF Text Extractor","Word Text Extractor"</code>
|
||||
<li><strong>Plugin mode</strong>: <code>[dspace]/bin/filter-media -p "PDF Text Extractor","Word Text Extractor"</code>
|
||||
<ul>
|
||||
<li>Apply ONLY the filter plugin(s) listed (separated by commas). By default all named filters listed in the <code>filter.plugins</code> field of <code>dspace.cfg</code> are applied. This option may be combined with any other option. <em>WARNING:</em> multiple plugin names must be separated by a comma (i.e. ',') and NOT a comma followed by a space (i.e. ', ').</li>
|
||||
</ul>
|
||||
</li>
|
||||
|
||||
<li><strong>Skip mode</strong>: <code>[dspace]/bin/filter-media -s 123456789/9,123456789/100</code>
|
||||
<ul>
|
||||
<li>SKIP the listed identifiers (separated by commas) during processing. The identifiers must be Handles (not DB Keys). They may refer to items, collections or communities which should be skipped. This option may be combined with any other option. <em>WARNING:</em> multiple identifiers must be separated by a comma (i.e. ',') and NOT a comma followed by a space (i.e. ', ').</li>
|
||||
<li>NOTE: If you have a large number of identifiers to skip, you may maintain this comma-separated list within a separate file (e.g. <code>filter-skiplist.txt</code>) and pass its contents to the command like this:
|
||||
<ul><li><code>[dspace]/bin/filter-media -s `less filter-skiplist.txt`</code></li></ul>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
|
||||
<li><code>[dspace]/bin/filter-media -v</code>
|
||||
<li><strong>Verbose mode</strong>: <code>[dspace]/bin/filter-media -v</code>
|
||||
<ul>
|
||||
<li>Print all extracted text and other filter details to STDOUT.</li>
|
||||
</ul>
|
||||
|
Reference in New Issue
Block a user