diff --git a/dspace-api/src/main/java/org/dspace/ctask/general/AbstractTranslator.java b/dspace-api/src/main/java/org/dspace/ctask/general/AbstractTranslator.java
new file mode 100644
index 0000000000..1068c64d79
--- /dev/null
+++ b/dspace-api/src/main/java/org/dspace/ctask/general/AbstractTranslator.java
@@ -0,0 +1,287 @@
+/**
+ * The contents of this file are subject to the license and copyright
+ * detailed in the LICENSE and NOTICE files at the root of the source
+ * tree and available online at
+ *
+ * http://www.dspace.org/license/
+ */
+package org.dspace.ctask.general;
+
+import org.apache.log4j.Logger;
+import org.dspace.authorize.AuthorizeException;
+import org.dspace.content.DCValue;
+import org.dspace.content.DSpaceObject;
+import org.dspace.content.Item;
+import org.dspace.core.ConfigurationManager;
+import org.dspace.curate.AbstractCurationTask;
+import org.dspace.curate.Curator;
+import org.dspace.curate.Distributive;
+
+import java.io.IOException;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Base class for curation tasks that machine-translate item metadata.
+ *
+ * For each configured target language and metadata field, the values held in
+ * the item's authoritative language are passed to translateText() and every
+ * non-empty result is stored on the item tagged with the target language.
+ * Fields that already have a value in the target language are skipped.
+ *
+ * Concrete translators override initApi() and translateText(). Settings are
+ * read from the "translator" configuration module.
+ *
+ * @author Kim Shepherd
+ */
+@Distributive
+public abstract class AbstractTranslator extends AbstractCurationTask
+{
+
+    int status = Curator.CURATE_UNSET;
+
+    private static final String PLUGIN_PREFIX = "translator";
+    private static final String DEFAULT_LANG_FIELD = "dc.language";
+    private static final String DEFAULT_LANG = "en";
+
+    private static final Logger log = Logger.getLogger(AbstractTranslator.class);
+
+    // Metadata field that names the language an item was submitted in
+    private String authLangField = DEFAULT_LANG_FIELD;
+    // Source language assumed when an item does not declare one
+    private String authLang = DEFAULT_LANG;
+    // Metadata fields to translate, written schema.element[.qualifier]
+    private String[] toTranslate = new String[0];
+    // Target language codes
+    private String[] langs = new String[0];
+
+    private List<String> results = new ArrayList<String>();
+
+    /**
+     * Loads the translator configuration and lets the subclass set up its API.
+     *
+     * @param curator the curator running this task
+     * @param taskId the name this task was invoked under
+     * @throws IOException passed on from the superclass initialisation
+     */
+    @Override
+    public void init(Curator curator, String taskId) throws IOException
+    {
+        super.init(curator, taskId);
+
+        // Unset or blank properties keep the built-in defaults
+        String defaultLang = ConfigurationManager.getProperty(PLUGIN_PREFIX, "translate.language.default");
+        if (isBlank(defaultLang))
+        {
+            defaultLang = ConfigurationManager.getProperty("default.locale");
+        }
+        authLang = isBlank(defaultLang) ? DEFAULT_LANG : defaultLang.trim();
+
+        String langField = ConfigurationManager.getProperty(PLUGIN_PREFIX, "translate.field.language");
+        authLangField = isBlank(langField) ? DEFAULT_LANG_FIELD : langField.trim();
+
+        toTranslate = splitList(ConfigurationManager.getProperty(PLUGIN_PREFIX, "translate.field.targets"));
+        langs = splitList(ConfigurationManager.getProperty(PLUGIN_PREFIX, "translate.language.targets"));
+
+        // String.split never yields an empty array, so test the cleaned lists
+        if (toTranslate.length == 0 || langs.length == 0)
+        {
+            status = Curator.CURATE_ERROR;
+            results.add("Configuration error");
+            setResult(results.toString());
+            report(results.toString());
+
+            return;
+        }
+
+        initApi();
+
+    }
+
+    /**
+     * Translates the configured metadata fields of an item. Objects that are
+     * not items are not modified.
+     *
+     * @param dso the object to curate
+     * @return the curation status after processing
+     * @throws IOException if a translation call fails
+     */
+    @Override
+    public int perform(DSpaceObject dso) throws IOException
+    {
+
+        if (dso instanceof Item)
+        {
+            if (toTranslate.length == 0 || langs.length == 0)
+            {
+                // init() found nothing to translate; do not report success
+                status = Curator.CURATE_ERROR;
+            }
+            else
+            {
+                translateItem((Item) dso);
+            }
+        }
+
+        processResults();
+        return status;
+
+    }
+
+    /**
+     * Override this method in your translator.
+     * Only needed to set key, etc., for example:
+     * apiKey = ConfigurationManager.getProperty(PLUGIN_PREFIX, "translate.api.key.[service]");
+     */
+    protected void initApi() {
+    }
+
+    /**
+     * Translates a single metadata value. Override this method in your
+     * translator; this default translates nothing.
+     *
+     * @param from language code of the text
+     * @param to language code to translate into
+     * @param text the text to translate
+     * @return the translated text, or null if no translation was produced
+     * @throws IOException if the translation call fails
+     */
+    protected String translateText(String from, String to, String text) throws IOException {
+
+        // Override this method in your translator
+        return null;
+    }
+
+    // Translates every configured field of one item into every target language.
+    private void translateItem(Item item) throws IOException
+    {
+        /*
+         * We lazily set success here because our success or failure
+         * is per-field, not per-item
+         */
+        status = Curator.CURATE_SUCCESS;
+
+        String handle = item.getHandle();
+        log.debug("Translating metadata for " + handle);
+
+        // Resolved per item so one item's language never leaks into the next
+        String sourceLang = authLang;
+        DCValue[] authLangs = item.getMetadata(authLangField);
+        if (authLangs.length > 0 && !isBlank(authLangs[0].value))
+        {
+            /* Assume the first... multiple
+               "authoritative" languages won't work */
+            sourceLang = authLangs[0].value;
+            log.debug("Authoritative language for " + handle + " is " + sourceLang);
+        }
+
+        for (String lang : langs)
+        {
+            for (String field : toTranslate)
+            {
+                translateField(item, handle, field, sourceLang, lang);
+            }
+        }
+    }
+
+    // Translates one field into one language unless that translation already exists.
+    private void translateField(Item item, String handle, String field, String sourceLang, String lang)
+            throws IOException
+    {
+        String[] fieldSegments = field.split("\\.");
+        if (fieldSegments.length < 2)
+        {
+            // A name without schema.element cannot be looked up; report it instead of failing
+            results.add(handle + ": Skipping malformed field name (" + field + ")");
+            return;
+        }
+        String schema = fieldSegments[0];
+        String element = fieldSegments[1];
+        String qualifier = (fieldSegments.length > 2) ? fieldSegments[2] : null;
+
+        // First, check to see if we've already got this in the target language
+        if (item.getMetadata(schema, element, qualifier, lang).length > 0)
+        {
+            // We've already translated this, move along
+            log.debug(handle + " already has " + field + " in " + lang + ", skipping");
+            results.add(handle + ": Skipping " + lang + " translation " + "(" + field + ")");
+            return;
+        }
+
+        // Let's carry on and get the authoritative version, then
+        DCValue[] fieldMetadata = item.getMetadata(schema, element, qualifier, sourceLang);
+        for (DCValue metadataValue : fieldMetadata)
+        {
+            String translatedText = translateText(sourceLang, lang, metadataValue.value);
+            if (translatedText == null || "".equals(translatedText))
+            {
+                results.add(handle + ": Failed translation of " + sourceLang + " -> " + lang + " (" + field + ")");
+                continue;
+            }
+
+            // Add the new metadata
+            item.addMetadata(schema, element, qualifier, lang, translatedText);
+            try
+            {
+                item.update();
+                results.add(handle + ": Translated " + sourceLang + " -> " + lang + " (" + field + ")");
+            }
+            catch (SQLException e)
+            {
+                recordUpdateFailure(handle, field, e);
+            }
+            catch (AuthorizeException e)
+            {
+                recordUpdateFailure(handle, field, e);
+            }
+        }
+    }
+
+    // Logs a failed item update with its cause and marks the task as errored.
+    private void recordUpdateFailure(String handle, String field, Exception e)
+    {
+        log.error("Could not save translated " + field + " for " + handle, e);
+        results.add(handle + ": Could not save translation (" + field + ")");
+        status = Curator.CURATE_ERROR;
+    }
+
+    // Publishes the accumulated per-field messages as the task result and report.
+    private void processResults() throws IOException
+    {
+        StringBuilder sb = new StringBuilder();
+        sb.append("Translation report: \n----------------\n");
+        for (String result : results)
+        {
+            sb.append(result).append("\n");
+        }
+        setResult(sb.toString());
+        report(sb.toString());
+
+    }
+
+    // True when the value is null or contains only whitespace.
+    private static boolean isBlank(String value)
+    {
+        return value == null || value.trim().length() == 0;
+    }
+
+    // Splits a comma-separated property into trimmed, non-empty entries.
+    private static String[] splitList(String csv)
+    {
+        List<String> entries = new ArrayList<String>();
+        if (csv != null)
+        {
+            for (String entry : csv.split(","))
+            {
+                if (!isBlank(entry))
+                {
+                    entries.add(entry.trim());
+                }
+            }
+        }
+        return entries.toArray(new String[entries.size()]);
+    }
+
+}
+
diff --git a/dspace-api/src/main/java/org/dspace/ctask/general/BasicLinkChecker.java b/dspace-api/src/main/java/org/dspace/ctask/general/BasicLinkChecker.java
new file mode 100644
index 0000000000..38bca772a2
--- /dev/null
+++ b/dspace-api/src/main/java/org/dspace/ctask/general/BasicLinkChecker.java
@@ -0,0 +1,177 @@
+package org.dspace.ctask.general;
+
+import org.apache.log4j.Logger;
+import org.dspace.content.DCValue;
+import org.dspace.content.DSpaceObject;
+import org.dspace.content.Item;
+import org.dspace.curate.AbstractCurationTask;
+import org.dspace.curate.Curator;
+
+import java.io.IOException;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.net.URLConnection;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A basic link checker that is designed to be extended. By default this link checker
+ * will check that all links stored in anyschema.anyelement.uri metadata fields return
+ * a 20x status code.
+ *
+ * This link checker can be enhanced by extending this class, and overriding the
+ * getURLs and checkURL methods.
+ *
+ * @author Stuart Lewis
+ */
+
+public class BasicLinkChecker extends AbstractCurationTask
+{
+
+    // The status of the link checking of this item
+    private int status = Curator.CURATE_UNSET;
+
+    // The log4j logger for this class
+    private static final Logger log = Logger.getLogger(BasicLinkChecker.class);
+
+
+    /**
+     * Perform the link checking.
+     *
+     * @param dso The DSpaceObject to be checked
+     * @return CURATE_SKIP when nothing was checked, CURATE_FAIL when at least
+     *         one link failed, CURATE_SUCCESS otherwise
+     * @throws java.io.IOException Thrown if something went wrong
+     */
+    @Override
+    public int perform(DSpaceObject dso) throws IOException
+    {
+        // The results that we'll return
+        StringBuilder results = new StringBuilder();
+
+        // Assume skip until we hit a URL to check; non-items are always skipped
+        status = Curator.CURATE_SKIP;
+        if (dso instanceof Item)
+        {
+            Item item = (Item)dso;
+
+            // Get the URLs
+            List<String> urls = getURLs(item);
+
+            results.append("Item: ").append(getItemHandle(item)).append("\n");
+
+            // Check the URLs
+            for (String url : urls)
+            {
+                boolean ok = checkURL(url, results);
+
+                if (!ok)
+                {
+                    status = Curator.CURATE_FAIL;
+                }
+                else if (status != Curator.CURATE_FAIL)
+                {
+                    // A working link must not hide an earlier broken one
+                    status = Curator.CURATE_SUCCESS;
+                }
+            }
+        }
+
+        setResult(results.toString());
+        report(results.toString());
+
+        return status;
+    }
+
+    /**
+     * Get the URLs to check
+     *
+     * @param item The item to extract URLs from
+     * @return A list of URL Strings
+     */
+    protected List<String> getURLs(Item item)
+    {
+        // Get URIs from anyschema.anyelement.uri.*
+        DCValue[] urls = item.getMetadata(Item.ANY, Item.ANY, "uri", Item.ANY);
+        List<String> theURLs = new ArrayList<String>();
+        for (DCValue url : urls)
+        {
+            theURLs.add(url.value);
+        }
+        return theURLs;
+    }
+
+    /**
+     * Check the URL and perform appropriate reporting
+     *
+     * @param url The URL to check
+     * @param results The report that the outcome for this URL is appended to
+     * @return If the URL was OK or not
+     */
+    protected boolean checkURL(String url, StringBuilder results)
+    {
+        // Link check the URL
+        int httpStatus = getResponseStatus(url);
+
+        if ((httpStatus >= 200) && (httpStatus < 300))
+        {
+            results.append(" - " + url + " = " + httpStatus + " - OK\n");
+            return true;
+        }
+        else
+        {
+            results.append(" - " + url + " = " + httpStatus + " - FAILED\n");
+            return false;
+        }
+    }
+
+    /**
+     * Get the response code for a URL. If something goes wrong opening the URL, or
+     * the URL is not an HTTP(S) URL, a response code of 0 is returned.
+     *
+     * @param url The url to open
+     * @return The HTTP response code (e.g. 200 / 301 / 404 / 500)
+     */
+    protected int getResponseStatus(String url)
+    {
+        HttpURLConnection connection = null;
+        try
+        {
+            URLConnection opened = new URL(url).openConnection();
+            if (!(opened instanceof HttpURLConnection))
+            {
+                // e.g. ftp: or file: URIs, which have no HTTP status
+                log.debug("Not an HTTP link: " + url);
+                return 0;
+            }
+            connection = (HttpURLConnection) opened;
+            return connection.getResponseCode();
+
+        } catch (IOException ioe)
+        {
+            // Must be a bad URL
+            log.debug("Bad link: " + ioe.getMessage());
+            return 0;
+        }
+        finally
+        {
+            if (connection != null)
+            {
+                connection.disconnect();
+            }
+        }
+    }
+
+    /**
+     * Internal utility method to get a description of the handle
+     *
+     * @param item The item to get a description of
+     * @return The handle, or in workflow
+     */
+    private static String getItemHandle(Item item)
+    {
+        String handle = item.getHandle();
+        return (handle != null) ? handle: " in workflow";
+    }
+
+}
diff --git a/dspace-api/src/main/java/org/dspace/ctask/general/BitstreamsIntoMetadata.java b/dspace-api/src/main/java/org/dspace/ctask/general/BitstreamsIntoMetadata.java
new file mode 100644
index 0000000000..4490b3b464
--- /dev/null
+++ b/dspace-api/src/main/java/org/dspace/ctask/general/BitstreamsIntoMetadata.java
@@ -0,0 +1,127 @@
+package org.dspace.ctask.general;
+
+import org.apache.log4j.Logger;
+import org.dspace.authorize.AuthorizeException;
+import org.dspace.content.Bitstream;
+import org.dspace.content.Bundle;
+import org.dspace.content.DSpaceObject;
+import org.dspace.content.Item;
+import org.dspace.curate.AbstractCurationTask;
+import org.dspace.curate.Curator;
+
+import java.sql.SQLException;
+import java.util.List;
+
+/**
+ * A curation job to take bitstream details and place them into metadata elements.
+ *
+ * @author Stuart Lewis
+ */
+public class BitstreamsIntoMetadata extends AbstractCurationTask
+{
+
+    // The status of this item
+    private int status = Curator.CURATE_UNSET;
+
+    // The log4j logger for this class
+    private static final Logger log = Logger.getLogger(BitstreamsIntoMetadata.class);
+
+
+    /**
+     * Perform the bitstream metadata creation.
+     *
+     * @param dso The DSpaceObject to be checked
+     * @return The curation task status of the checking
+     */
+    @Override
+    public int perform(DSpaceObject dso)
+    {
+        // The results that we'll return
+        StringBuilder results = new StringBuilder();
+
+        // Unless this is an item, we'll skip this item
+        status = Curator.CURATE_SKIP;
+        logDebugMessage("The target dso is " + dso.getName());
+        if (dso instanceof Item)
+        {
+            try {
+                Item item = (Item)dso;
+                item.clearMetadata("dc", "format", Item.ANY, Item.ANY);
+                int added = 0;
+                for (Bundle bundle : item.getBundles()) {
+                    String type = null;
+                    if ("ORIGINAL".equals(bundle.getName())) {
+                        type = "original";
+                    } else if ("THUMBNAIL".equals(bundle.getName())) {
+                        type = "thumbnail";
+                    }
+                    if (type == null) {
+                        continue;
+                    }
+                    for (Bitstream bitstream : bundle.getBitstreams()) {
+                        addMetadata(item, bitstream, type);
+                        added++;
+                    }
+                }
+
+                // Save once, after every bundle has been processed
+                if (added > 0) {
+                    item.update();
+                    status = Curator.CURATE_SUCCESS;
+                    results.append("Item: ").append(item.getHandle())
+                           .append(" - recorded ").append(added).append(" bitstream(s)\n");
+                }
+            } catch (AuthorizeException ae) {
+                // Something went wrong
+                log.error("Not authorized to update " + dso.getName(), ae);
+                status = Curator.CURATE_ERROR;
+            } catch (SQLException sqle) {
+                // Something went wrong
+                log.error("Database error while updating " + dso.getName(), sqle);
+                status = Curator.CURATE_ERROR;
+            }
+
+        }
+
+        logDebugMessage("About to report: " + results.toString());
+        setResult(results.toString());
+        report(results.toString());
+
+        return status;
+    }
+
+    /**
+     * Debugging logging if required
+     *
+     * @param message The message to log
+     */
+    private void logDebugMessage(String message)
+    {
+        if (log.isDebugEnabled())
+        {
+            log.debug(message);
+        }
+    }
+
+    /**
+     * Add the bitstream metadata to the item, as a "##"-separated value holding
+     * MIME type, name, size, item handle, sequence id, checksum and description.
+     *
+     * @param item The item
+     * @param bitstream The bitstream
+     * @param type The type of bitstream
+     */
+    private void addMetadata(Item item, Bitstream bitstream, String type) {
+        StringBuilder value = new StringBuilder();
+        value.append(bitstream.getFormat().getMIMEType()).append("##");
+        value.append(bitstream.getName()).append("##");
+        value.append(bitstream.getSize()).append("##");
+        value.append(item.getHandle()).append("##");
+        value.append(bitstream.getSequenceID()).append("##");
+        value.append(bitstream.getChecksum()).append("##");
+        if (bitstream.getDescription() != null) {
+            value.append(bitstream.getDescription());
+        }
+        item.addMetadata("dc", "format", type, "en", value.toString());
+    }
+}
diff --git a/dspace-api/src/main/java/org/dspace/ctask/general/MetadataValueLinkChecker.java b/dspace-api/src/main/java/org/dspace/ctask/general/MetadataValueLinkChecker.java
new file mode 100644
index 0000000000..fd548ca0c9
--- /dev/null
+++ b/dspace-api/src/main/java/org/dspace/ctask/general/MetadataValueLinkChecker.java
@@ -0,0 +1,42 @@
+package org.dspace.ctask.general;
+
+import org.dspace.content.DCValue;
+import org.dspace.content.Item;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A link checker that builds upon the BasicLinkChecker to check URLs that
+ * appear in all metadata fields where the field starts with http:// or https://
+ *
+ * Of course this assumes that there is no extra metadata following the URL.
+ *
+ * @author Stuart Lewis
+ */
+public class MetadataValueLinkChecker extends BasicLinkChecker {
+
+    /**
+     * Get the URLs to check
+     *
+     * @param item The item to extract URLs from
+     * @return Every metadata value that starts with http:// or https://
+     */
+    @Override
+    protected List<String> getURLs(Item item)
+    {
+        // Get all metadata elements that start with http:// or https://
+        DCValue[] urls = item.getMetadata(Item.ANY, Item.ANY, Item.ANY, Item.ANY);
+        List<String> theURLs = new ArrayList<String>();
+        for (DCValue url : urls)
+        {
+            // A value could be null; skip it rather than fail on the prefix test
+            if (url.value != null
+                    && (url.value.startsWith("http://") || url.value.startsWith("https://")))
+            {
+                theURLs.add(url.value);
+            }
+        }
+        return theURLs;
+    }
+}
diff --git a/dspace-api/src/main/java/org/dspace/ctask/general/MicrosoftTranslator.java b/dspace-api/src/main/java/org/dspace/ctask/general/MicrosoftTranslator.java
new file mode 100644
index 0000000000..8bfb5c9453
--- /dev/null
+++ b/dspace-api/src/main/java/org/dspace/ctask/general/MicrosoftTranslator.java
@@ -0,0 +1,97 @@
+/**
+ * The contents of this file are subject to the license and copyright
+ * detailed in the LICENSE and NOTICE files at the root of the source
+ * tree and available online at
+ *
+ * http://www.dspace.org/license/
+ */
+package org.dspace.ctask.general;
+
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.HttpMethod;
+import org.apache.commons.httpclient.methods.GetMethod;
+import org.apache.log4j.Logger;
+import org.dspace.core.ConfigurationManager;
+
+import java.io.IOException;
+import java.net.URLEncoder;
+
+/**
+ * MicrosoftTranslator translates metadata fields using Microsoft Translation API v2
+ *
+ * Requirements: A valid Bing App ID/Key
+ * More information: http://www.bing.com/developers
+ *
+ * This key, and other custom configuration, goes in [dspace]/config/modules/translator.cfg
+ *
+ * @author Kim Shepherd
+ */
+
+public class MicrosoftTranslator extends AbstractTranslator
+{
+
+    private static final String PLUGIN_PREFIX = "translator";
+
+    private static final String baseUrl = "http://api.microsofttranslator.com/V2/Http.svc/Translate";
+    private static final String ENCODING = "UTF-8";
+    // The service wraps the translated text in this XML element
+    private static final String RESPONSE_OPEN = "<string xmlns=\"http://schemas.microsoft.com/2003/10/Serialization/\">";
+    private static final String RESPONSE_CLOSE = "</string>";
+
+    private static String apiKey = "";
+
+    private static final Logger log = Logger.getLogger(MicrosoftTranslator.class);
+
+
+    @Override
+    protected void initApi() {
+        String key = ConfigurationManager.getProperty(PLUGIN_PREFIX, "translate.api.key.microsoft");
+        // Keep the empty default rather than sending the literal "null" as appId
+        apiKey = (key == null) ? "" : key.trim();
+    }
+
+    /**
+     * Translates text with a GET request to the Translate endpoint.
+     *
+     * @param from language code of the text
+     * @param to language code to translate into
+     * @param text the text to translate
+     * @return the translated text, or null when the service does not answer 200
+     * @throws IOException if the request fails
+     */
+    @Override
+    protected String translateText(String from, String to, String text) throws IOException {
+
+        log.debug("Performing API call to translate from " + from + " to " + to);
+
+        String translatedText = null;
+
+        // Encode every parameter, not just the text, so none can break the query string
+        String url = baseUrl + "?appId=" + URLEncoder.encode(apiKey, ENCODING)
+                + "&to=" + URLEncoder.encode(to, ENCODING)
+                + "&from=" + URLEncoder.encode(from, ENCODING)
+                + "&text=" + URLEncoder.encode(text, ENCODING);
+
+        HttpClient client = new HttpClient();
+        HttpMethod hm = new GetMethod(url);
+        try {
+            int code = client.executeMethod(hm);
+            log.debug("Response code from API call is " + code);
+
+            if(code == 200) {
+                String response = hm.getResponseBodyAsString();
+                response = response.replace(RESPONSE_OPEN, "");
+                response = response.replace(RESPONSE_CLOSE, "");
+                translatedText = response;
+            }
+        } finally {
+            // Always hand the connection back to the client
+            hm.releaseConnection();
+        }
+
+
+        return translatedText;
+    }
+
+}
+
diff --git a/dspace-api/src/main/java/org/dspace/ctask/general/NoOpCurationTask.java b/dspace-api/src/main/java/org/dspace/ctask/general/NoOpCurationTask.java
new file mode 100644
index 0000000000..f2ed08089b
--- /dev/null
+++ b/dspace-api/src/main/java/org/dspace/ctask/general/NoOpCurationTask.java
@@ -0,0 +1,42 @@
+package org.dspace.ctask.general;
+
+import org.dspace.content.DSpaceObject;
+import org.dspace.content.Item;
+import org.dspace.curate.AbstractCurationTask;
+import org.dspace.curate.Curator;
+
+import java.io.IOException;
+
+/**
+ * A curation task that changes nothing: for an item it only reports that no
+ * operation was performed.
+ */
+public class NoOpCurationTask extends AbstractCurationTask
+{
+
+    private int status = Curator.CURATE_UNSET;
+    private String result = null;
+
+    @Override
+    public int perform(DSpaceObject dso) throws IOException
+    {
+        // Start from a clean state so an earlier item's outcome is not returned
+        // for a later object that is not an item
+        status = Curator.CURATE_UNSET;
+        result = null;
+
+        if (dso instanceof Item)
+        {
+            Item item = (Item)dso;
+            status = Curator.CURATE_SUCCESS;
+            result = "No operation performed on " + item.getHandle();
+
+            setResult(result);
+            report(result);
+        }
+
+        return status;
+    }
+
+
+}
diff --git a/dspace/config/modules/curate.cfg b/dspace/config/modules/curate.cfg
index d07e144c1b..fdcca15a65 100644
--- a/dspace/config/modules/curate.cfg
+++ b/dspace/config/modules/curate.cfg
@@ -7,9 +7,12 @@
 
 ### Task Class implementations
 plugin.named.org.dspace.curate.CurationTask = \
+    org.dspace.ctask.general.NoOpCurationTask = noop, \
     org.dspace.ctask.general.ProfileFormats = profileformats, \
     org.dspace.ctask.general.RequiredMetadata = requiredmetadata, \
-    org.dspace.ctask.general.ClamScan = vscan
+    org.dspace.ctask.general.ClamScan = vscan, \
+    org.dspace.ctask.general.MicrosoftTranslator = translate, \
+    org.dspace.ctask.general.MetadataValueLinkChecker = checklinks
 # add new tasks here
 
 ## task queue implementation
@@ -23,7 +26,8 @@ taskqueue.dir = ${dspace.dir}/ctqueues
 # be invoked on cmd line, etc - just not in UI
 ui.tasknames = \
      profileformats = Profile Bitstream Formats, \
-     requiredmetadata = Check for Required Metadata
+     requiredmetadata = Check for Required Metadata, \
+     checklinks = Check Links in Metadata
 
 # Name of queue used when tasks queued in Admin UI
 ui.queuename = admin_ui
diff --git a/dspace/config/modules/translator.cfg b/dspace/config/modules/translator.cfg
new file mode 100644
index 0000000000..4f6e2b534a
--- /dev/null
+++ b/dspace/config/modules/translator.cfg
@@ -0,0 +1,35 @@
+## Translation field settings
+##
+## Authoritative language field
+## This will be read to determine the original language an item was submitted in
+## Default: dc.language
+
+translate.field.language = dc.language
+
+## Metadata fields you wish to have translated
+#
+translate.field.targets = dc.description.abstract, dc.title, dc.type
+
+## Translation language settings
+##
+## If the language field configured in translate.field.language is not present
+## in the record, set translate.language.default to a default source language
+## or leave blank to fall back to default.locale from dspace.cfg
+#
+translate.language.default = en
+
+## Target languages for translation
+#
+translate.language.targets = de, fr
+
+## Translation API settings
+##
+## Your Bing API v2 key and/or Google "Simple API Access" Key
+## (note to Google users: your v1 API key will not work with Translate v2,
+## you will need to visit https://code.google.com/apis/console and activate
+## a Simple API Access key)
+##
+## You do not need to enter a key for both services.
+#
+translate.api.key.microsoft = YOUR_MICROSOFT_API_KEY_GOES_HERE
+translate.api.key.google = YOUR_GOOGLE_API_KEY_GOES_HERE
\ No newline at end of file