[DS-1002] Additional simple curation tasks to ship with DSpace release (org.dspace.ctask.general.*)

git-svn-id: http://scm.dspace.org/svn/repo/dspace/trunk@6571 9c30dcfa-912a-0410-8fc2-9e0234be79fd
This commit is contained in:
Kim Shepherd
2011-08-19 04:26:02 +00:00
parent baf819e17b
commit 8b0fab90ff
8 changed files with 687 additions and 2 deletions

View File

@@ -0,0 +1,212 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.ctask.general;
import org.apache.log4j.Logger;
import org.dspace.content.DCValue;
import org.dspace.content.DSpaceObject;
import org.dspace.content.Item;
import org.dspace.core.ConfigurationManager;
import org.dspace.curate.AbstractCurationTask;
import org.dspace.curate.Curator;
import org.dspace.curate.Distributive;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Base class for curation tasks that translate configured metadata fields of
 * an Item into one or more target languages.
 *
 * Configuration is read in {@link #init(Curator, String)} from the
 * "translator" configuration module. Concrete translators override
 * {@link #initApi()} and {@link #translateText(String, String, String)} to
 * talk to a specific translation service.
 *
 * @author Kim Shepherd
 */
@Distributive
public abstract class AbstractTranslator extends AbstractCurationTask
{
    // Status of the most recent perform() call
    int status = Curator.CURATE_UNSET;

    private static final String PLUGIN_PREFIX = "translator";

    // Fallbacks used when the corresponding configuration property is missing or blank
    private static final String DEFAULT_LANG_FIELD = "dc.language";
    private static final String DEFAULT_LANG = "en";

    private static final Logger log = Logger.getLogger(AbstractTranslator.class);

    // Metadata field holding an item's authoritative (source) language
    private String authLangField = DEFAULT_LANG_FIELD;

    // Source language used for items that do not carry authLangField
    private String authLang = DEFAULT_LANG;

    // Metadata fields to translate, e.g. "dc.title" (trimmed, never containing blanks)
    private String[] toTranslate = new String[0];

    // Target language codes (trimmed, never containing blanks)
    private String[] langs = new String[0];

    // Report lines gathered during the current perform() call
    private final List<String> results = new ArrayList<String>();

    /**
     * Load the translator configuration and let the concrete translator
     * initialise its API settings.
     *
     * If no target fields or no target languages are configured the task
     * status is set to CURATE_ERROR, a "Configuration error" result is
     * reported and {@link #initApi()} is not called.
     *
     * @param curator the Curator running this task
     * @param taskId  the identifier this task was registered under
     * @throws IOException passed through from the superclass initialisation
     */
    @Override
    public void init(Curator curator, String taskId) throws IOException
    {
        super.init(curator, taskId);

        // Load configuration; a missing property must not cause an NPE later on
        authLang = firstNonBlank(ConfigurationManager.getProperty("default.locale"), DEFAULT_LANG);
        authLangField = firstNonBlank(
                ConfigurationManager.getProperty(PLUGIN_PREFIX, "translate.field.language"),
                DEFAULT_LANG_FIELD);
        toTranslate = splitList(ConfigurationManager.getProperty(PLUGIN_PREFIX, "translate.field.targets"));
        langs = splitList(ConfigurationManager.getProperty(PLUGIN_PREFIX, "translate.language.targets"));

        if (!isConfigured())
        {
            status = Curator.CURATE_ERROR;
            results.add("Configuration error");
            setResult(results.toString());
            report(results.toString());
            return;
        }

        initApi();
    }

    /**
     * Translate the configured fields of an Item into every configured target
     * language. Objects that are not Items are not processed; the status is
     * left untouched for them.
     *
     * @param dso the object to curate
     * @return CURATE_ERROR if the task is not configured or an item update
     *         failed, CURATE_SUCCESS for a processed Item, otherwise the
     *         status from before this call
     * @throws IOException if the translation service call fails
     */
    @Override
    public int perform(DSpaceObject dso) throws IOException
    {
        // Start a fresh report so lines from earlier calls are not reported again
        results.clear();

        if (!isConfigured())
        {
            status = Curator.CURATE_ERROR;
            results.add("Configuration error");
        }
        else if (dso instanceof Item)
        {
            translateItem((Item) dso);
        }

        processResults();
        return status;
    }

    /**
     * Hook for concrete translators to load service specific settings such as
     * an API key, e.g.
     * ConfigurationManager.getProperty("translator", "translate.api.key.[service]").
     * The default implementation does nothing.
     */
    protected void initApi()
    {
        // Override this method in your translator
    }

    /**
     * Translate a single text value. The default implementation translates
     * nothing and returns null, which is reported as a failed translation.
     *
     * @param from source language code
     * @param to   target language code
     * @param text the text to translate
     * @return the translated text, or null/empty if no translation is available
     * @throws IOException if the translation service cannot be reached
     */
    protected String translateText(String from, String to, String text) throws IOException
    {
        // Override this method in your translator
        return null;
    }

    /**
     * Translate all configured fields of one item into all target languages.
     */
    private void translateItem(Item item) throws IOException
    {
        /*
         * We lazily set success here because our success or failure
         * is per-field, not per-item
         */
        status = Curator.CURATE_SUCCESS;

        String handle = item.getHandle();
        log.debug("Translating metadata for " + handle);

        // Resolve the source language per item; the configured default must
        // not be overwritten, or it would leak into the next item processed.
        String sourceLang = authLang;
        DCValue[] authLangs = item.getMetadata(authLangField);
        if (authLangs.length > 0 && authLangs[0].value != null
                && authLangs[0].value.trim().length() > 0)
        {
            /* Assume the first... multiple
               "authoritative" languages won't work */
            sourceLang = authLangs[0].value;
            log.debug("Authoritative language for " + handle + " is " + sourceLang);
        }

        for (String lang : langs)
        {
            for (String field : toTranslate)
            {
                translateField(item, handle, field, sourceLang, lang);
            }
        }
    }

    /**
     * Translate every value of one metadata field from sourceLang to lang,
     * unless the item already has a value for that field in the target language.
     */
    private void translateField(Item item, String handle, String field,
                                String sourceLang, String lang) throws IOException
    {
        String[] fieldSegments = field.split("\\.");
        if (fieldSegments.length < 2)
        {
            // Need at least schema.element; avoid an ArrayIndexOutOfBoundsException
            results.add(handle + ": Skipping malformed field name (" + field + ")");
            return;
        }
        String schema = fieldSegments[0];
        String element = fieldSegments[1];
        String qualifier = (fieldSegments.length > 2) ? fieldSegments[2] : null;

        // First, check to see if we've already got this in the target language
        if (item.getMetadata(schema, element, qualifier, lang).length > 0)
        {
            // We've already translated this, move along
            log.debug(handle + " already has " + field + " in " + lang + ", skipping");
            results.add(handle + ": Skipping " + lang + " translation " + "(" + field + ")");
            return;
        }

        // Translate the authoritative version of each value
        for (DCValue metadataValue : item.getMetadata(schema, element, qualifier, sourceLang))
        {
            String value = metadataValue.value;
            String translatedText = (value == null) ? null : translateText(sourceLang, lang, value);
            if (translatedText == null || "".equals(translatedText))
            {
                results.add(handle + ": Failed translation of " + sourceLang + " -> " + lang + "(" + field + ")");
                continue;
            }

            // Add the new metadata
            item.addMetadata(schema, element, qualifier, lang, translatedText);
            try
            {
                item.update();
                results.add(handle + ": Translated " + sourceLang + " -> " + lang + " (" + field + ")");
            }
            catch (Exception e)
            {
                // Keep the stack trace and surface the failure in the report
                log.error("Unable to save translation of " + field + " for " + handle, e);
                results.add(handle + ": Failed to save " + lang + " translation (" + field + ")");
                status = Curator.CURATE_ERROR;
            }
        }
    }

    /**
     * Format the gathered report lines and hand them to the curation framework.
     */
    private void processResults() throws IOException
    {
        StringBuilder sb = new StringBuilder();
        sb.append("Translation report: \n----------------\n");
        for (String result : results)
        {
            sb.append(result).append("\n");
        }
        setResult(sb.toString());
        report(sb.toString());
    }

    /**
     * @return true if at least one target field and one target language are configured
     */
    private boolean isConfigured()
    {
        return toTranslate.length > 0 && langs.length > 0;
    }

    /**
     * @return value trimmed, or fallback if value is null or blank
     */
    private static String firstNonBlank(String value, String fallback)
    {
        if (value == null || value.trim().length() == 0)
        {
            return fallback;
        }
        return value.trim();
    }

    /**
     * Split a comma separated property into trimmed, non-empty entries.
     *
     * @param raw the raw property value, may be null
     * @return the entries; empty (never null) if raw is null or holds no entries
     */
    private static String[] splitList(String raw)
    {
        List<String> entries = new ArrayList<String>();
        if (raw != null)
        {
            for (String entry : raw.split(","))
            {
                String trimmed = entry.trim();
                if (trimmed.length() > 0)
                {
                    entries.add(trimmed);
                }
            }
        }
        return entries.toArray(new String[entries.size()]);
    }
}

View File

@@ -0,0 +1,167 @@
package org.dspace.ctask.general;
import org.apache.log4j.Logger;
import org.dspace.content.DCValue;
import org.dspace.content.DSpaceObject;
import org.dspace.content.Item;
import org.dspace.curate.AbstractCurationTask;
import org.dspace.curate.Curator;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
/**
 * A basic link checker that is designed to be extended. By default this link checker
 * will check that all links stored in anyschema.anyelement.uri metadata fields return
 * a 20x status code.
 *
 * This link checker can be enhanced by extending this class, and overriding the
 * getURLs and checkURL methods.
 *
 * @author Stuart Lewis
 */
public class BasicLinkChecker extends AbstractCurationTask
{
    // The status of the link checking of this item
    private int status = Curator.CURATE_UNSET;

    // The log4j logger for this class
    private static Logger log = Logger.getLogger(BasicLinkChecker.class);

    /**
     * Perform the link checking.
     *
     * The result is CURATE_SKIP when the object is not an Item or has no URLs
     * to check, CURATE_FAIL as soon as any URL fails, and CURATE_SUCCESS only
     * when every checked URL was OK.
     *
     * @param dso The DSpaceObject to be checked
     * @return The curation task status of the checking
     * @throws java.io.IOException Thrown if something went wrong
     */
    @Override
    public int perform(DSpaceObject dso) throws IOException
    {
        // The results that we'll return
        StringBuilder results = new StringBuilder();

        // Unless this is an item with at least one URL, we'll skip it
        status = Curator.CURATE_SKIP;

        if (dso instanceof Item)
        {
            Item item = (Item) dso;

            // Get the URLs
            List<String> urls = getURLs(item);

            results.append("Item: ").append(getItemHandle(item)).append("\n");

            // Check the URLs
            for (String url : urls)
            {
                boolean ok = checkURL(url, results);
                if (!ok)
                {
                    status = Curator.CURATE_FAIL;
                }
                else if (status != Curator.CURATE_FAIL)
                {
                    // A later good link must not hide an earlier broken one
                    status = Curator.CURATE_SUCCESS;
                }
            }
        }

        setResult(results.toString());
        report(results.toString());

        return status;
    }

    /**
     * Get the URLs to check
     *
     * @param item The item to extract URLs from
     * @return A list of URL Strings
     */
    protected List<String> getURLs(Item item)
    {
        // Get URIs from anyschema.anyelement.uri.*
        DCValue[] urls = item.getMetadata(Item.ANY, Item.ANY, "uri", Item.ANY);
        ArrayList<String> theURLs = new ArrayList<String>();
        for (DCValue url : urls)
        {
            theURLs.add(url.value);
        }
        return theURLs;
    }

    /**
     * Check the URL and perform appropriate reporting
     *
     * @param url The URL to check
     * @param results The report buffer that a line for this URL is appended to
     * @return If the URL was OK or not
     */
    protected boolean checkURL(String url, StringBuilder results)
    {
        // Link check the URL
        int httpStatus = getResponseStatus(url);

        if ((httpStatus >= 200) && (httpStatus < 300))
        {
            results.append(" - " + url + " = " + httpStatus + " - OK\n");
            return true;
        }
        else
        {
            results.append(" - " + url + " = " + httpStatus + " - FAILED\n");
            return false;
        }
    }

    /**
     * Get the response code for a URL. If something goes wrong opening the URL, or
     * the URL does not use HTTP(S), a response code of 0 is returned.
     *
     * @param url The url to open
     * @return The HTTP response code (e.g. 200 / 301 / 404 / 500)
     */
    protected int getResponseStatus(String url)
    {
        HttpURLConnection connection = null;
        try
        {
            URL theURL = new URL(url);
            java.net.URLConnection opened = theURL.openConnection();
            if (!(opened instanceof HttpURLConnection))
            {
                // e.g. ftp:, file: or mailto: values - a blind cast would throw ClassCastException
                log.debug("Bad link: not an HTTP(S) URL: " + url);
                return 0;
            }
            connection = (HttpURLConnection) opened;
            return connection.getResponseCode();
        }
        catch (IOException ioe)
        {
            // Must be a bad URL
            log.debug("Bad link: " + ioe.getMessage());
            return 0;
        }
        finally
        {
            // Release the connection on the failure path as well
            if (connection != null)
            {
                connection.disconnect();
            }
        }
    }

    /**
     * Internal utility method to get a description of the handle
     *
     * @param item The item to get a description of
     * @return The handle, or " in workflow" if the item has no handle
     */
    private static String getItemHandle(Item item)
    {
        String handle = item.getHandle();
        return (handle != null) ? handle: " in workflow";
    }
}

View File

@@ -0,0 +1,125 @@
package org.dspace.ctask.general;
import org.apache.log4j.Logger;
import org.dspace.authorize.AuthorizeException;
import org.dspace.content.Bitstream;
import org.dspace.content.Bundle;
import org.dspace.content.DSpaceObject;
import org.dspace.content.Item;
import org.dspace.curate.AbstractCurationTask;
import org.dspace.curate.Curator;
import java.sql.SQLException;
import java.util.List;
/**
 * A curation job to take bitstream URLs and place them into metadata elements.
 *
 * For every bitstream in an item's ORIGINAL and THUMBNAIL bundles a
 * dc.format.original / dc.format.thumbnail value is written that describes
 * the bitstream.
 *
 * @author Stuart Lewis
 */
public class BitstreamsIntoMetadata extends AbstractCurationTask
{
    // The status of this item
    private int status = Curator.CURATE_UNSET;

    // The log4j logger for this class
    private static Logger log = Logger.getLogger(BitstreamsIntoMetadata.class);

    /**
     * Perform the bitstream metadata creation.
     *
     * @param dso The DSpaceObject to be checked
     * @return CURATE_SKIP if the object is not an Item or no bitstream metadata
     *         was added, CURATE_SUCCESS if metadata was added and the item was
     *         updated, CURATE_ERROR if reading or updating the item failed
     */
    @Override
    public int perform(DSpaceObject dso)
    {
        // The results that we'll return
        StringBuilder results = new StringBuilder();

        // Unless this is an item, we'll skip this item
        status = Curator.CURATE_SKIP;
        logDebugMessage("The target dso is " + dso.getName());

        if (dso instanceof Item)
        {
            Item item = (Item) dso;
            try
            {
                // NOTE(review): this clears every dc.format.* value held in memory,
                // not only the qualifiers written below - confirm that is intended.
                item.clearMetadata("dc", "format", Item.ANY, Item.ANY);

                int added = 0;
                for (Bundle bundle : item.getBundles())
                {
                    String type = metadataTypeFor(bundle.getName());
                    if (type == null)
                    {
                        // Not a bundle this task describes
                        continue;
                    }
                    for (Bitstream bitstream : bundle.getBitstreams())
                    {
                        addMetadata(item, bitstream, type);
                        added++;
                    }
                }

                // Update once after all bundles, instead of once per bundle
                if (added > 0)
                {
                    item.update();
                    status = Curator.CURATE_SUCCESS;
                    results.append("Item: ").append(item.getHandle())
                           .append(" - added ").append(added)
                           .append(" bitstream metadata value(s)\n");
                }
            }
            catch (AuthorizeException ae)
            {
                // Something went wrong
                log.error("Not authorized to update bitstream metadata", ae);
                results.append("Error: ").append(ae.getMessage()).append("\n");
                status = Curator.CURATE_ERROR;
            }
            catch (SQLException sqle)
            {
                // Something went wrong
                log.error("Database error while updating bitstream metadata", sqle);
                results.append("Error: ").append(sqle.getMessage()).append("\n");
                status = Curator.CURATE_ERROR;
            }
        }

        logDebugMessage("About to report: " + results.toString());
        setResult(results.toString());
        report(results.toString());

        return status;
    }

    /**
     * Map a bundle name to the dc.format qualifier used for its bitstreams.
     *
     * @param bundleName The name of the bundle, may be null
     * @return "original", "thumbnail", or null if the bundle is not handled
     */
    private static String metadataTypeFor(String bundleName)
    {
        if ("ORIGINAL".equals(bundleName))
        {
            return "original";
        }
        if ("THUMBNAIL".equals(bundleName))
        {
            return "thumbnail";
        }
        return null;
    }

    /**
     * Debugging logging if required
     *
     * @param message The message to log
     */
    private void logDebugMessage(String message)
    {
        if (log.isDebugEnabled())
        {
            log.debug(message);
        }
    }

    /**
     * Add the bitstream metadata to the item.
     *
     * The value is a "##" separated record of: MIME type, name, size, item
     * handle, sequence id, checksum and (if present) description.
     *
     * @param item The item
     * @param bitstream The bitstream
     * @param type The type of bitstream, used as the dc.format qualifier
     */
    private void addMetadata(Item item, Bitstream bitstream, String type)
    {
        StringBuilder value = new StringBuilder();
        value.append(bitstream.getFormat().getMIMEType()).append("##");
        value.append(bitstream.getName()).append("##");
        value.append(bitstream.getSize()).append("##");
        value.append(item.getHandle()).append("##");
        value.append(bitstream.getSequenceID()).append("##");
        value.append(bitstream.getChecksum()).append("##");
        if (bitstream.getDescription() != null)
        {
            value.append(bitstream.getDescription());
        }
        item.addMetadata("dc", "format", type, "en", value.toString());
    }
}

View File

@@ -0,0 +1,33 @@
package org.dspace.ctask.general;
import org.dspace.content.DCValue;
import org.dspace.content.Item;
import java.util.ArrayList;
import java.util.List;
/**
* A link checker that builds upon the BasicLinkChecker to check URLs that
* appear in all metadata fields where the field starts with http:// or https://
*
* Of course thi assumes that there is no extra metadata following the URL.
*
* @author Stuart Lewis
*/
public class MetadataValueLinkChecker extends BasicLinkChecker {
protected List<String> getURLs(Item item)
{
// Get all metadata elements that start with http:// or https://
DCValue[] urls = item.getMetadata(Item.ANY, Item.ANY, Item.ANY, Item.ANY);
ArrayList<String> theURLs = new ArrayList<String>();
for (DCValue url : urls)
{
if ((url.value.startsWith("http://")) || (url.value.startsWith("https://")))
{
theURLs.add(url.value);
}
}
return theURLs;
}
}

View File

@@ -0,0 +1,75 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.ctask.general;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.log4j.Logger;
import org.dspace.core.ConfigurationManager;
import java.io.IOException;
import java.net.URLEncoder;
/**
 * MicrosoftTranslator translates metadata fields using Microsoft Translation API v2
 *
 * Requirements: A valid Bing App ID/Key
 * More information: http://www.bing.com/developers
 *
 * This key, and other custom configuration, goes in [dspace]/modules/translator.cfg
 *
 * @author Kim Shepherd
 */
public class MicrosoftTranslator extends AbstractTranslator
{
    private static final String PLUGIN_PREFIX = "translator";

    private static final String baseUrl = "http://api.microsofttranslator.com/V2/Http.svc/Translate";

    // Wrapper element the service puts around the translated text
    private static final String RESPONSE_OPEN_TAG =
            "<string xmlns=\"http://schemas.microsoft.com/2003/10/Serialization/\">";
    private static final String RESPONSE_CLOSE_TAG = "</string>";

    private static String apiKey = "";

    private static Logger log = Logger.getLogger(MicrosoftTranslator.class);

    /**
     * Read the Microsoft API key from the "translator" configuration module.
     * A missing key is stored as an empty string rather than null.
     */
    @Override
    protected void initApi() {
        String configuredKey = ConfigurationManager.getProperty(PLUGIN_PREFIX, "translate.api.key.microsoft");
        apiKey = (configuredKey == null) ? "" : configuredKey.trim();
    }

    /**
     * Translate text with an HTTP GET call to the Microsoft Translate endpoint.
     *
     * @param from source language code
     * @param to   target language code
     * @param text the text to translate
     * @return the translated text, or null if the service did not answer
     *         with HTTP 200 or returned no body
     * @throws IOException if the HTTP call fails
     */
    @Override
    protected String translateText(String from, String to, String text) throws IOException {

        log.debug("Performing API call to translate from " + from + " to " + to);

        // Encode every query parameter, not only the text
        String url = baseUrl
                + "?appId=" + URLEncoder.encode(apiKey, "UTF-8")
                + "&to=" + URLEncoder.encode(to, "UTF-8")
                + "&from=" + URLEncoder.encode(from, "UTF-8")
                + "&text=" + URLEncoder.encode(text, "UTF-8");

        HttpClient client = new HttpClient();
        HttpMethod hm = new GetMethod(url);
        try {
            int code = client.executeMethod(hm);
            log.debug("Response code from API call is " + code);

            if (code != 200) {
                return null;
            }

            String response = hm.getResponseBodyAsString();
            if (response == null) {
                return null;
            }

            // Strip the wrapper element literally; no regular expression is needed
            response = response.replace(RESPONSE_OPEN_TAG, "");
            response = response.replace(RESPONSE_CLOSE_TAG, "");
            return unescapeXml(response);
        }
        finally {
            // Always hand the connection back to the HttpClient connection manager
            hm.releaseConnection();
        }
    }

    /**
     * Resolve the five predefined XML entities in the response payload.
     * Assumes the service XML-escapes the wrapped text, as a well-formed XML
     * response must; numeric character references are left as they are.
     *
     * @param escaped the text taken from the XML response
     * @return the text with predefined entities resolved
     */
    private static String unescapeXml(String escaped) {
        // &amp; goes last so that "&amp;lt;" becomes "&lt;" and not "<"
        return escaped.replace("&lt;", "<")
                      .replace("&gt;", ">")
                      .replace("&quot;", "\"")
                      .replace("&apos;", "'")
                      .replace("&amp;", "&");
    }
}

View File

@@ -0,0 +1,34 @@
package org.dspace.ctask.general;
import org.dspace.content.DSpaceObject;
import org.dspace.content.Item;
import org.dspace.curate.AbstractCurationTask;
import org.dspace.curate.Curator;
import java.io.IOException;
public class NoOpCurationTask extends AbstractCurationTask
{
    // Status returned by perform(); only changed when an Item is processed
    private int status = Curator.CURATE_UNSET;

    // Message reported for the most recently processed Item
    private String result = null;

    /**
     * Report that nothing was done to an Item. Objects that are not Items are
     * ignored and the current status is returned unchanged.
     *
     * @param dso the object to curate
     * @return CURATE_SUCCESS for an Item, otherwise the status held before this call
     * @throws IOException declared by the task contract
     */
    @Override
    public int perform(DSpaceObject dso) throws IOException
    {
        if (!(dso instanceof Item))
        {
            return status;
        }

        Item target = (Item) dso;
        status = Curator.CURATE_SUCCESS;
        result = "No operation performed on " + target.getHandle();
        setResult(result);
        report(result);
        return status;
    }
}

View File

@@ -7,9 +7,12 @@
### Task Class implementations
plugin.named.org.dspace.curate.CurationTask = \
org.dspace.ctask.general.NoOpCurationTask = noop, \
org.dspace.ctask.general.ProfileFormats = profileformats, \
org.dspace.ctask.general.RequiredMetadata = requiredmetadata, \
org.dspace.ctask.general.ClamScan = vscan
org.dspace.ctask.general.ClamScan = vscan, \
org.dspace.ctask.general.MicrosoftTranslator = translate, \
org.dspace.ctask.general.MetadataValueLinkChecker = checklinks
# add new tasks here
## task queue implementation
@@ -23,7 +26,8 @@ taskqueue.dir = ${dspace.dir}/ctqueues
# be invoked on cmd line, etc - just not in UI
ui.tasknames = \
profileformats = Profile Bitstream Formats, \
requiredmetadata = Check for Required Metadata
requiredmetadata = Check for Required Metadata, \
checklinks = Check Links in Metadata
# Name of queue used when tasks queued in Admin UI
ui.queuename = admin_ui

View File

@@ -0,0 +1,35 @@
## Translation field settings
##
## Authoritative language field
## This will be read to determine the original language an item was submitted in
## Default: dc.language
translate.field.language = dc.language
## Metadata fields you wish to have translated
#
translate.field.targets = dc.description.abstract, dc.title, dc.type
## Translation language settings
##
## If the language field configured in translate.field.language is not present
## in the record, set translate.language.default to a default source language
## or leave blank to use autodetection
#
translate.language.default = en
## Target languages for translation
#
translate.language.targets = de, fr
## Translation API settings
##
## Your Bing API v2 key and/or Google "Simple API Access" Key
## (note to Google users: your v1 API key will not work with Translate v2,
## you will need to visit https://code.google.com/apis/console and activate
## a Simple API Access key)
##
## You do not need to enter a key for both services.
#
translate.api.key.microsoft = YOUR_MICROSOFT_API_KEY_GOES_HERE
translate.api.key.google = YOUR_GOOGLE_API_KEY_GOES_HERE