83661: Port harvest script to scripts and processes

This commit is contained in:
Yana De Pauw
2021-09-21 13:14:14 +02:00
parent ce0c211b57
commit ed29084ccb
4 changed files with 218 additions and 181 deletions

View File

@@ -1,7 +1,7 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
* tree and available online at
*
* http://www.dspace.org/license/
*/
@@ -13,11 +13,8 @@ import java.util.Iterator;
import java.util.List;
import java.util.UUID;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.dspace.authorize.AuthorizeException;
import org.dspace.content.Collection;
import org.dspace.content.DSpaceObject;
@@ -36,221 +33,187 @@ import org.dspace.harvest.HarvestingException;
import org.dspace.harvest.OAIHarvester;
import org.dspace.harvest.factory.HarvestServiceFactory;
import org.dspace.harvest.service.HarvestedCollectionService;
import org.dspace.scripts.DSpaceRunnable;
import org.dspace.utils.DSpace;
/**
* Test class for harvested collections.
*
* @author Alexey Maslov
*/
public class Harvest {
private static Context context;
public class Harvest extends DSpaceRunnable<HarvestScriptConfiguration> {
private static final HarvestedCollectionService harvestedCollectionService =
HarvestServiceFactory.getInstance().getHarvestedCollectionService();
private static final EPersonService ePersonService = EPersonServiceFactory.getInstance().getEPersonService();
private static final CollectionService collectionService =
ContentServiceFactory.getInstance().getCollectionService();
private HarvestedCollectionService harvestedCollectionService;
private EPersonService ePersonService;
private CollectionService collectionService;
public static void main(String[] argv) throws Exception {
// create an options object and populate it
CommandLineParser parser = new DefaultParser();
Options options = new Options();
options.addOption("p", "purge", false, "delete all items in the collection");
options.addOption("r", "run", false, "run the standard harvest procedure");
options.addOption("g", "ping", false, "test the OAI server and set");
options.addOption("s", "setup", false, "Set the collection up for harvesting");
options.addOption("S", "start", false, "start the harvest loop");
options.addOption("R", "reset", false, "reset harvest status on all collections");
options.addOption("P", "purge", false, "purge all harvestable collections");
private boolean help;
private String command = null;
private String eperson = null;
private String collection = null;
private String oaiSource = null;
private String oaiSetID = null;
private String metadataKey = null;
private int harvestType = 0;
options.addOption("e", "eperson", true,
"eperson");
options.addOption("c", "collection", true,
"harvesting collection (handle or id)");
options.addOption("t", "type", true,
"type of harvesting (0 for none)");
options.addOption("a", "address", true,
"address of the OAI-PMH server");
options.addOption("i", "oai_set_id", true,
"id of the PMH set representing the harvested collection");
options.addOption("m", "metadata_format", true,
"the name of the desired metadata format for harvesting, resolved to namespace and " +
"crosswalk in dspace.cfg");
public HarvestScriptConfiguration getScriptConfiguration() {
return new DSpace().getServiceManager()
.getServiceByName("harvest", HarvestScriptConfiguration.class);
}
options.addOption("h", "help", false, "help");
public void setup() throws ParseException {
harvestedCollectionService =
HarvestServiceFactory.getInstance().getHarvestedCollectionService();
ePersonService = EPersonServiceFactory.getInstance().getEPersonService();
collectionService =
ContentServiceFactory.getInstance().getCollectionService();
CommandLine line = parser.parse(options, argv);
String command = null;
String eperson = null;
String collection = null;
String oaiSource = null;
String oaiSetID = null;
String metadataKey = null;
int harvestType = 0;
if (line.hasOption('h')) {
HelpFormatter myhelp = new HelpFormatter();
myhelp.printHelp("Harvest\n", options);
System.out.println("\nPING OAI server: Harvest -g -a oai_source -i oai_set_id");
System.out.println(
"SETUP a collection for harvesting: Harvest -s -c collection -t harvest_type -a oai_source -i " +
"oai_set_id -m metadata_format");
System.out.println("RUN harvest once: Harvest -r -e eperson -c collection");
System.out.println("START harvest scheduler: Harvest -S");
System.out.println("RESET all harvest status: Harvest -R");
System.out.println("PURGE a collection of items and settings: Harvest -p -e eperson -c collection");
System.out.println("PURGE all harvestable collections: Harvest -P -e eperson");
help = commandLine.hasOption('h'); //TODO copy the old message?
System.exit(0);
}
if (line.hasOption('s')) {
if (commandLine.hasOption('s')) {
command = "config";
}
if (line.hasOption('p')) {
if (commandLine.hasOption('p')) {
command = "purge";
}
if (line.hasOption('r')) {
if (commandLine.hasOption('r')) {
command = "run";
}
if (line.hasOption('g')) {
if (commandLine.hasOption('g')) {
command = "ping";
}
if (line.hasOption('S')) {
if (commandLine.hasOption('S')) {
command = "start";
}
if (line.hasOption('R')) {
if (commandLine.hasOption('R')) {
command = "reset";
}
if (line.hasOption('P')) {
if (commandLine.hasOption('P')) {
command = "purgeAll";
}
if (line.hasOption('e')) {
eperson = line.getOptionValue('e');
if (commandLine.hasOption('e')) {
eperson = commandLine.getOptionValue('e');
}
if (line.hasOption('c')) {
collection = line.getOptionValue('c');
if (commandLine.hasOption('c')) {
collection = commandLine.getOptionValue('c');
}
if (line.hasOption('t')) {
harvestType = Integer.parseInt(line.getOptionValue('t'));
if (commandLine.hasOption('t')) {
harvestType = Integer.parseInt(commandLine.getOptionValue('t'));
} else {
harvestType = 0;
}
if (line.hasOption('a')) {
oaiSource = line.getOptionValue('a');
if (commandLine.hasOption('a')) {
oaiSource = commandLine.getOptionValue('a');
}
if (line.hasOption('i')) {
oaiSetID = line.getOptionValue('i');
if (commandLine.hasOption('i')) {
oaiSetID = commandLine.getOptionValue('i');
}
if (line.hasOption('m')) {
metadataKey = line.getOptionValue('m');
if (commandLine.hasOption('m')) {
metadataKey = commandLine.getOptionValue('m');
}
}
public void internalRun() throws Exception {
if (help) {
printHelp();
handler.logInfo("PING OAI server: Harvest -g -a oai_source -i oai_set_id");
handler.logInfo(
"SETUP a collection for harvesting: Harvest -s -c collection -t harvest_type -a oai_source -i " +
"oai_set_id -m metadata_format");
handler.logInfo("RUN harvest once: Harvest -r -e eperson -c collection");
handler.logInfo("START harvest scheduler: Harvest -S");
handler.logInfo("RESET all harvest status: Harvest -R");
handler.logInfo("PURGE a collection of items and settings: Harvest -p -e eperson -c collection");
handler.logInfo("PURGE all harvestable collections: Harvest -P -e eperson");
return;
}
Context context = new Context(Context.Mode.BATCH_EDIT);
// Instantiate our class
Harvest harvester = new Harvest();
harvester.context = new Context(Context.Mode.BATCH_EDIT);
// Check our options
if (command == null) {
System.out
.println("Error - no parameters specified (run with -h flag for details)");
System.exit(1);
if (StringUtils.isBlank(command)) {
handler.logError("No parameters specified (run with -h flag for details)");
throw new UnsupportedOperationException("No command specified");
} else if ("run".equals(command)) {
// Run a single harvest cycle on a collection using saved settings.
if (collection == null || eperson == null) {
System.out
.println("Error - a target collection and eperson must be provided");
System.out.println(" (run with -h flag for details)");
System.exit(1);
handler.logError("A target collection and eperson must be provided (run with -h flag for details)");
throw new UnsupportedOperationException("A target collection and eperson must be provided");
}
harvester.runHarvest(collection, eperson);
runHarvest(context, collection, eperson);
} else if ("start".equals(command)) {
// start the harvest loop
startHarvester();
} else if ("reset".equals(command)) {
// reset harvesting status
resetHarvesting();
resetHarvesting(context);
} else if ("purgeAll".equals(command)) {
// purge all collections that are set up for harvesting (obviously for testing purposes only)
if (eperson == null) {
System.out
.println("Error - an eperson must be provided");
System.out.println(" (run with -h flag for details)");
System.exit(1);
handler.logError("An eperson must be provided (run with -h flag for details)");
throw new UnsupportedOperationException("An eperson must be provided");
}
List<HarvestedCollection> harvestedCollections = harvestedCollectionService.findAll(context);
for (HarvestedCollection harvestedCollection : harvestedCollections) {
System.out.println(
"Purging the following collections (deleting items and resetting harvest status): " +
harvestedCollection
.getCollection().getID().toString());
harvester.purgeCollection(harvestedCollection.getCollection().getID().toString(), eperson);
handler.logInfo(
"Purging the following collections (deleting items and resetting harvest status): " +
harvestedCollection
.getCollection().getID().toString());
purgeCollection(context, harvestedCollection.getCollection().getID().toString(), eperson);
}
context.complete();
} else if ("purge".equals(command)) {
// Delete all items in a collection. Useful for testing fresh harvests.
if (collection == null || eperson == null) {
System.out
.println("Error - a target collection and eperson must be provided");
System.out.println(" (run with -h flag for details)");
System.exit(1);
handler.logError("A target collection and eperson must be provided (run with -h flag for details)");
throw new UnsupportedOperationException("A target collection and eperson must be provided");
}
harvester.purgeCollection(collection, eperson);
purgeCollection(context, collection, eperson);
context.complete();
//TODO: implement this... remove all items and remember to unset "last-harvested" settings
} else if ("config".equals(command)) {
// Configure a collection with the three main settings
if (collection == null) {
System.out.println("Error - a target collection must be provided");
System.out.println(" (run with -h flag for details)");
System.exit(1);
handler.logError("A target collection must be provided (run with -h flag for details)");
throw new UnsupportedOperationException("A target collection must be provided");
}
if (oaiSource == null || oaiSetID == null) {
System.out.println("Error - both the OAI server address and OAI set id must be specified");
System.out.println(" (run with -h flag for details)");
System.exit(1);
handler.logError("Both the OAI server address and OAI set id must be specified (run with -h flag for details)");
throw new UnsupportedOperationException("Both the OAI server address and OAI set id must be specified");
}
if (metadataKey == null) {
System.out
.println("Error - a metadata key (commonly the prefix) must be specified for this collection");
System.out.println(" (run with -h flag for details)");
System.exit(1);
handler.logError("A metadata key (commonly the prefix) must be specified for this collection (run with -h flag for details)");
throw new UnsupportedOperationException("A metadata key (commonly the prefix) must be specified for this collection");
}
harvester.configureCollection(collection, harvestType, oaiSource, oaiSetID, metadataKey);
configureCollection(context, collection, harvestType, oaiSource, oaiSetID, metadataKey);
} else if ("ping".equals(command)) {
if (oaiSource == null || oaiSetID == null) {
System.out.println("Error - both the OAI server address and OAI set id must be specified");
System.out.println(" (run with -h flag for details)");
System.exit(1);
handler.logError("Both the OAI server address and OAI set id must be specified (run with -h flag for details)");
throw new UnsupportedOperationException("Both the OAI server address and OAI set id must be specified");
}
pingResponder(oaiSource, oaiSetID, metadataKey);
} else {
System.out.println("Error - your command '" + command + "' was not recoginzed properly");
System.out.println(" (run with -h flag for details)");
System.exit(1);
// Report the unrecognized command both to the handler log and in the thrown exception;
// the exception message must concatenate the actual command value, not a quoted expression.
handler.logError("Your command '" + command + "' was not recognized properly (run with -h flag for details)");
throw new UnsupportedOperationException("Your command '" + command + "' was not recognized properly");
}
}
/*
* Resolve the ID into a collection and check to see if its harvesting options are set. If so, return
* the collection, if not, bail out.
*/
private Collection resolveCollection(String collectionID) {
private Collection resolveCollection(Context context, String collectionID) {
DSpaceObject dso;
Collection targetCollection = null;
@@ -270,14 +233,14 @@ public class Harvest {
}
} else {
// not a handle, try and treat it as an collection database UUID
System.out.println("Looking up by UUID: " + collectionID + ", " + "in context: " + context);
handler.logInfo("Looking up by UUID: " + collectionID + ", " + "in context: " + context);
targetCollection = collectionService.find(context, UUID.fromString(collectionID));
}
}
// was the collection valid?
if (targetCollection == null) {
System.out.println("Cannot resolve " + collectionID + " to collection");
System.exit(1);
handler.logError("Cannot resolve " + collectionID + " to collection");
throw new UnsupportedOperationException("Cannot resolve " + collectionID + " to collection");
}
} catch (SQLException se) {
se.printStackTrace();
@@ -287,12 +250,12 @@ public class Harvest {
}
private void configureCollection(String collectionID, int type, String oaiSource, String oaiSetId,
private void configureCollection(Context context, String collectionID, int type, String oaiSource, String oaiSetId,
String mdConfigId) {
System.out.println("Running: configure collection");
handler.logInfo("Running: configure collection");
Collection collection = resolveCollection(collectionID);
System.out.println(collection.getID());
Collection collection = resolveCollection(context, collectionID);
handler.logInfo(String.valueOf(collection.getID()));
try {
HarvestedCollection hc = harvestedCollectionService.find(context, collection);
@@ -307,9 +270,8 @@ public class Harvest {
context.restoreAuthSystemState();
context.complete();
} catch (Exception e) {
System.out.println("Changes could not be committed");
e.printStackTrace();
System.exit(1);
handler.logError("Changes could not be committed");
handler.handleException(e);
} finally {
if (context != null) {
context.restoreAuthSystemState();
@@ -324,10 +286,10 @@ public class Harvest {
* @param collectionID
* @param email
*/
private void purgeCollection(String collectionID, String email) {
System.out.println(
"Purging collection of all items and resetting last_harvested and harvest_message: " + collectionID);
Collection collection = resolveCollection(collectionID);
private void purgeCollection(Context context, String collectionID, String email) {
handler.logInfo(
"Purging collection of all items and resetting last_harvested and harvest_message: " + collectionID);
Collection collection = resolveCollection(context, collectionID);
try {
EPerson eperson = ePersonService.findByEmail(context, email);
@@ -340,7 +302,7 @@ public class Harvest {
while (it.hasNext()) {
i++;
Item item = it.next();
System.out.println("Deleting: " + item.getHandle());
handler.logInfo("Deleting: " + item.getHandle());
collectionService.removeItem(context, collection, item);
context.uncacheEntity(item);// Dispatch events every 50 items
if (i % 50 == 0) {
@@ -360,9 +322,8 @@ public class Harvest {
context.restoreAuthSystemState();
context.dispatchEvents();
} catch (Exception e) {
System.out.println("Changes could not be committed");
e.printStackTrace();
System.exit(1);
handler.logError("Changes could not be committed");
handler.handleException(e);
} finally {
context.restoreAuthSystemState();
}
@@ -372,30 +333,28 @@ public class Harvest {
/**
* Run a single harvest cycle on the specified collection under the authorization of the supplied EPerson
*/
private void runHarvest(String collectionID, String email) {
System.out.println("Running: a harvest cycle on " + collectionID);
private void runHarvest(Context context, String collectionID, String email) {
handler.logInfo("Running: a harvest cycle on " + collectionID);
System.out.print("Initializing the harvester... ");
handler.logInfo("Initializing the harvester... ");
OAIHarvester harvester = null;
try {
Collection collection = resolveCollection(collectionID);
Collection collection = resolveCollection(context, collectionID);
HarvestedCollection hc = harvestedCollectionService.find(context, collection);
harvester = new OAIHarvester(context, collection, hc);
System.out.println("success. ");
handler.logInfo("Initialized the harvester successfully");
} catch (HarvestingException hex) {
System.out.print("failed. ");
System.out.println(hex.getMessage());
handler.logError("Initializing the harvester failed.");
throw new IllegalStateException("Unable to harvest", hex);
} catch (SQLException se) {
System.out.print("failed. ");
System.out.println(se.getMessage());
handler.logError("Initializing the harvester failed.");
throw new IllegalStateException("Unable to access database", se);
}
try {
// Harvest will not work for an anonymous user
EPerson eperson = ePersonService.findByEmail(context, email);
System.out.println("Harvest started... ");
handler.logInfo("Harvest started... ");
context.setCurrentUser(eperson);
harvester.runHarvest();
context.complete();
@@ -403,15 +362,15 @@ public class Harvest {
throw new IllegalStateException("Failed to run harvester", e);
}
System.out.println("Harvest complete. ");
handler.logInfo("Harvest complete. ");
}
/**
* Resets harvest_status and harvest_start_time flags for all collections that have a row in the
* harvested_collections table
*/
private static void resetHarvesting() {
System.out.print("Resetting harvest status flag on all collections... ");
private void resetHarvesting(Context context) {
handler.logInfo("Resetting harvest status flag on all collections... ");
try {
List<HarvestedCollection> harvestedCollections = harvestedCollectionService.findAll(context);
@@ -421,21 +380,21 @@ public class Harvest {
harvestedCollection.setHarvestStatus(HarvestedCollection.STATUS_READY);
harvestedCollectionService.update(context, harvestedCollection);
}
System.out.println("success. ");
handler.logInfo("Reset harvest status flag successfully");
} catch (Exception ex) {
System.out.println("failed. ");
ex.printStackTrace();
handler.logError("Resetting harvest status flag failed");
handler.handleException(ex);
}
}
/**
* Starts up the harvest scheduler. Terminating this process will stop the scheduler.
*/
private static void startHarvester() {
private void startHarvester() {
try {
System.out.print("Starting harvest loop... ");
handler.logInfo("Starting harvest loop... ");
HarvestServiceFactory.getInstance().getHarvestSchedulingService().startNewScheduler();
System.out.println("running. ");
handler.logInfo("running. ");
} catch (Exception ex) {
ex.printStackTrace();
}
@@ -448,29 +407,31 @@ public class Harvest {
* @param set name of an item set.
* @param metadataFormat local prefix name, or null for "dc".
*/
private static void pingResponder(String server, String set, String metadataFormat) {
private void pingResponder(String server, String set, String metadataFormat) {
List<String> errors;
System.out.print("Testing basic PMH access: ");
handler.logInfo("Testing basic PMH access: ");
errors = harvestedCollectionService.verifyOAIharvester(server, set,
(null != metadataFormat) ? metadataFormat : "dc", false);
(null != metadataFormat) ? metadataFormat : "dc", false);
if (errors.isEmpty()) {
System.out.println("OK");
handler.logInfo("OK");
} else {
for (String error : errors) {
System.err.println(error);
handler.logError(error);
}
}
System.out.print("Testing ORE support: ");
handler.logInfo("Testing ORE support: ");
errors = harvestedCollectionService.verifyOAIharvester(server, set,
(null != metadataFormat) ? metadataFormat : "dc", true);
(null != metadataFormat) ? metadataFormat : "dc", true);
if (errors.isEmpty()) {
System.out.println("OK");
handler.logInfo("OK");
} else {
for (String error : errors) {
System.err.println(error);
handler.logError(error);
}
}
}
}

View File

@@ -0,0 +1,71 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.app.harvest;
import java.sql.SQLException;
import org.apache.commons.cli.Options;
import org.dspace.authorize.service.AuthorizeService;
import org.dspace.core.Context;
import org.dspace.scripts.configuration.ScriptConfiguration;
import org.springframework.beans.factory.annotation.Autowired;
/**
 * Script configuration for the {@link Harvest} script: declares the command line options the
 * script accepts and restricts its execution to administrators.
 *
 * @param <T> the concrete {@link Harvest} runnable this configuration is bound to
 */
public class HarvestScriptConfiguration<T extends Harvest> extends ScriptConfiguration<T> {

    @Autowired
    private AuthorizeService authorizeService;

    private Class<T> dspaceRunnableClass;

    @Override
    public Class<T> getDspaceRunnableClass() {
        return dspaceRunnableClass;
    }

    @Override
    public void setDspaceRunnableClass(Class<T> dspaceRunnableClass) {
        this.dspaceRunnableClass = dspaceRunnableClass;
    }

    /**
     * Checks whether the current user of the given context may run the harvest script.
     *
     * @param context the context whose current user is checked
     * @return the result of {@code authorizeService.isAdmin(context)}
     * @throws RuntimeException wrapping the {@link SQLException} if the admin check fails
     */
    @Override
    public boolean isAllowedToExecute(final Context context) {
        try {
            return authorizeService.isAdmin(context);
        } catch (SQLException e) {
            throw new RuntimeException("SQLException occurred when checking if the current user is an admin", e);
        }
    }

    /**
     * Builds the command line options understood by the harvest script.
     * A new {@link Options} instance is created on every call.
     *
     * @return the options for the harvest script
     */
    @Override
    public Options getOptions() {
        Options options = new Options();

        // Commands (flags without an argument).
        options.addOption("p", "purge", false, "delete all items in the collection");
        options.addOption("r", "run", false, "run the standard harvest procedure");
        options.addOption("g", "ping", false, "test the OAI server and set");
        options.addOption("s", "setup", false, "Set the collection up for harvesting");
        options.addOption("S", "start", false, "start the harvest loop");
        options.addOption("R", "reset", false, "reset harvest status on all collections");
        // The long name must differ from the one used by -p: long options are keyed by name, so
        // registering "purge" twice would make --purge resolve to this purge-everything command.
        options.addOption("P", "purgeAll", false, "purge all harvestable collections");

        // Parameters (options that take a value).
        options.addOption("e", "eperson", true,
                          "eperson");
        options.addOption("c", "collection", true,
                          "harvesting collection (handle or id)");
        options.addOption("t", "type", true,
                          "type of harvesting (0 for none)");
        options.addOption("a", "address", true,
                          "address of the OAI-PMH server");
        options.addOption("i", "oai_set_id", true,
                          "id of the PMH set representing the harvested collection");
        options.addOption("m", "metadata_format", true,
                          "the name of the desired metadata format for harvesting, resolved to namespace and " +
                              "crosswalk in dspace.cfg");

        options.addOption("h", "help", false, "help");

        return options;
    }
}

View File

@@ -207,8 +207,8 @@ public class RestDSpaceRunnableHandler implements DSpaceRunnableHandler {
HelpFormatter formatter = new HelpFormatter();
StringWriter out = new StringWriter();
PrintWriter pw = new PrintWriter(out);
formatter.printUsage(pw, 1000, name, options);
formatter.printHelp(pw, 1000, name, null, options, formatter.getLeftPadding(), formatter.getDescPadding(),
null, false);
pw.flush();
String helpString = out.toString();

View File

@@ -40,4 +40,9 @@
<property name="description" value="Delete all the values of the specified metadata field"/>
<property name="dspaceRunnableClass" value="org.dspace.app.bulkedit.MetadataDeletionCli"/>
</bean>
<bean id="harvest" class="org.dspace.app.harvest.HarvestScriptConfiguration">
<property name="description" value="Manage the OAI-PMH harvesting of external collections"/>
<property name="dspaceRunnableClass" value="org.dspace.app.harvest.Harvest"/>
</bean>
</beans>