Merge pull request #294 from richardrodgers/DS-1647-MDWebService

[DS-1647] Adds MetadataWebService curation task
2025-10-15 14:03:17 +00:00 · 2013-10-17 11:22:02 -07:00
parent b98637afc7 3ae7117ec2
commit d9590e3d65
2 changed files with 554 additions and 0 deletions
--- a/dspace-api/pom.xml
+++ b/dspace-api/pom.xml
@@ -499,6 +499,17 @@
            <artifactId>coverity-escapers</artifactId>
            <version>1.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpcore</artifactId>
            <version>4.1.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.1.2</version>
        </dependency>
        <!--  Gson: Java to Json conversion -->
        <dependency>
--- a/dspace-api/src/main/java/org/dspace/ctask/general/MetadataWebService.java
+++ b/dspace-api/src/main/java/org/dspace/ctask/general/MetadataWebService.java
@@ -0,0 +1,543 @@
 /**
 * The contents of this file are subject to the license and copyright
 * detailed in the LICENSE and NOTICE files at the root of the source
 * tree and available online at
 *
 * http://www.dspace.org/license/
 */
 package org.dspace.ctask.general;
 import java.io.InputStream;
 import java.io.IOException;
 import java.sql.SQLException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.namespace.NamespaceContext;
 import javax.xml.xpath.XPath;
 import javax.xml.xpath.XPathConstants;
 import javax.xml.xpath.XPathExpression;
 import javax.xml.xpath.XPathExpressionException;
 import javax.xml.xpath.XPathFactory;
 import javax.xml.XMLConstants;
 import org.apache.http.HttpEntity;
 import org.apache.http.HttpResponse;
 import org.apache.http.HttpStatus;
 import org.apache.http.client.HttpClient;
 import org.apache.http.client.methods.HttpGet;
 import org.apache.http.impl.client.DefaultHttpClient;
 import org.apache.log4j.Logger;
 import org.w3c.dom.Document;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 import org.xml.sax.SAXException;
 import org.dspace.authorize.AuthorizeException;
 import org.dspace.content.DCValue;
 import org.dspace.content.DSpaceObject;
 import org.dspace.content.Item;
 import org.dspace.core.ConfigurationManager;
 import org.dspace.core.Constants;
 import org.dspace.curate.AbstractCurationTask;
 import org.dspace.curate.Curator;
 import org.dspace.curate.Mutative;
 import org.dspace.curate.Suspendable;
 /**
 * MetadataWebService task calls a web service using metadata from
 * passed item to obtain data. Depending on configuration, this
 * data may be assigned to item metadata fields, or just recorded in the
 * task result string. Task succeeds if web service call succeeds and 
 * configured updates occur, fails if task user not authorized or item
 * lacks metadata to call service, and returns error in all other cases
 * (except skip status for non-item objects).
 * Intended use: cataloging tool in workflow and general curation.
 * The task uses a URL 'template' to compose the service call, e.g.
 * 
 * http://www.sherpa.ac.uk/romeo/api29.php?issn={dc.identifier.issn}
 * 
 * Task will substitute the value of the passed item's metadata field
 * in the {parameter} position. If multiple values are present in the
 * item field, the first value is used.
 * 
 * The task uses another property (the datamap) to determine what data
 * to extract from the service response and how to use it, e.g.
 * 
 * //publisher/name=>dc.publisher,//romeocolour
 * 
 * Task will evaluate the left-hand side (or entire token) of each
 * comma-separated token in the property as an XPath 1.0 expression into
 * the response document, and if there is a mapping symbol (e.g.'=>') and
 * value, it will assign the response document value(s) to the named
 * metadata field in the passed item. If the response document contains
 * multiple values, they will all be assigned to the item field. The
 * mapping symbol governs the nature of metadata field assignment:
 * 
 * '->' mapping will add to any existing values in the item field
 * '=>' mapping will replace any existing values in the item field
 * '~>' mapping will add *only* if item field has no existing values
 * 
 * Unmapped data (without a mapping symbol) will simply be added to the task
 * result string, prepended by the XPath expression (a little prettified).
 * Each label/value pair in the result string is separated by a space, 
 * unless the optional 'separator' property is defined.
 * 
 * A very rudimentary facility for transformation of data is supported, e.g.
 * 
 * http://www.crossref.org/openurl/?id={doi:dc.relation.isversionof}&format=unixref
 *
 * The 'doi:' prefix will cause the task to look for a 'transform' with that
 * name, which is applied to the metadata value before parameter substitution
 * occurs. Transforms are defined in a task property such as the following:
 * 
 * transform.doi = match 10. trunc 60
 * 
 * This means exclude the value string up to the occurrence of '10.', then
 * truncate after 60 characters. The only transform functions currently defined:
 * 
 * 'cut' <number> = remove number leading characters
 * 'trunc' <number> = remove trailing characters after number length
 * 'match' <pattern> = discard the characters preceding the first occurrence of pattern (a literal string, not a regular expression)
 * 'text' <characters> = append literal characters (enclose in ' ' when whitespace needed)
 * 
 * If the transform results in an invalid state (e.g. cutting more characters
 * than are in the value), the condition will be logged and the 
 * un-transformed value used.
 *
 * Transforms may also be used in datamaps, e.g.
 * 
 * //publisher/name=>shorten:dc.publisher,//romeocolour
 *  
 * which would apply the 'shorten' transform to the service response value(s)
 * prior to metadata field assignment.
 *
 * An optional property 'headers' may be defined to stipulate any HTTP headers
 * required in the service call. The property syntax is double-pipe separated headers:
 * 
 * Accept: text/xml||Cache-Control: no-cache
 * 
 * @author richardrodgers
 */
@Mutative
@Suspendable
public class MetadataWebService extends AbstractCurationTask implements NamespaceContext
{
    /** log4j category */
    private static final Logger log = Logger.getLogger(MetadataWebService.class);
    /** transform token parsing pattern: a single-quoted run, or a run of non-whitespace */
    private static final Pattern ttPattern = Pattern.compile("\'([^\']*)\'|(\\S+)");
    /** stem of the prefix minted for a response document's default namespace */
    private static final String DEFAULT_NS_PREFIX = "pre";

    // URL of web service with template parameters
    private String urlTemplate = null;
    // template parameter (the text between '{' and '}' in the template)
    private String templateParam = null;
    // Item metadata field to use in service call
    private String lookupField = null;
    // Optional transformation of lookupField
    private String lookupTransform = null;
    // response data to map/record
    private List<DataInfo> dataList = null;
    // response document parsing tools
    private DocumentBuilder docBuilder = null;
    // language for metadata fields assigned
    private String lang = null;
    // field separator in result string
    private String fieldSeparator = null;
    // optional XML namespace map (prefix -> namespace URI)
    private final Map<String, String> nsMap = new HashMap<String, String>();
    // optional HTTP headers (name -> value)
    private final Map<String, String> headers = new HashMap<String, String>();

    /**
     * Initializes task: reads the 'separator', 'template', 'datamap' and
     * 'headers' task properties and prepares the response document parser.
     *
     * @param curator  Curator object performing this task
     * @param taskId the configured local name of the task
     * @throws IOException if a required property ('template', 'datamap') is
     *         missing or malformed, or if no XML parser can be created
     */
    @Override
    public void init(Curator curator, String taskId) throws IOException {
        super.init(curator, taskId);
        lang = ConfigurationManager.getProperty("default.language");
        String fldSep = taskProperty("separator");
        fieldSeparator = (fldSep != null) ? fldSep : " ";

        // service URL template - must contain exactly the '{parameter}' we substitute
        urlTemplate = taskProperty("template");
        if (urlTemplate == null) {
            throw new IOException("task '" + taskId + "' lacks required property: template");
        }
        int open = urlTemplate.indexOf("{");
        int close = (open >= 0) ? urlTemplate.indexOf("}", open + 1) : -1;
        if (close <= open + 1) {
            // no braces at all, unbalanced braces, or an empty '{}' parameter
            throw new IOException("task '" + taskId + "' template lacks a {parameter}: " + urlTemplate);
        }
        templateParam = urlTemplate.substring(open + 1, close);
        String[] parsed = parseTransform(templateParam);
        lookupField = parsed[0];
        lookupTransform = parsed[1];

        // data map - comma-separated XPath expressions with optional field mappings
        String datamap = taskProperty("datamap");
        if (datamap == null) {
            throw new IOException("task '" + taskId + "' lacks required property: datamap");
        }
        dataList = new ArrayList<DataInfo>();
        for (String entry : datamap.split(",")) {
            entry = entry.trim();
            if (entry.length() == 0) {
                // tolerate stray commas rather than compiling an empty XPath later
                continue;
            }
            String src = entry;
            String mapping = null;
            String field = null;
            int mapIdx = getMapIndex(entry);
            if (mapIdx > 0) {
                // trim so that 'xpath => field' behaves like 'xpath=>field'
                src = entry.substring(0, mapIdx).trim();
                mapping = entry.substring(mapIdx, mapIdx + 2);
                field = entry.substring(mapIdx + 2).trim();
            }
            // result-string label is the last path step of the expression
            int slIdx = src.lastIndexOf("/");
            String label = (slIdx > 0) ? src.substring(slIdx + 1) : src;
            try {
                dataList.add(new DataInfo(src, label, mapping, field));
            } catch (IllegalArgumentException iaE) {
                log.error("caught exception: " + iaE);
                throw new IOException(iaE.getMessage(), iaE);
            }
        }
        if (dataList.isEmpty()) {
            throw new IOException("task '" + taskId + "' datamap defines no data: " + datamap);
        }

        // optional HTTP headers, '||'-separated 'Name: value' pairs
        String hdrs = taskProperty("headers");
        if (hdrs != null) {
            for (String header : hdrs.split("\\|\\|")) {
                int split = header.indexOf(":");
                if (split < 0) {
                    log.warn("ignoring malformed header (no ':'): " + header);
                    continue;
                }
                headers.put(header.substring(0, split).trim(), header.substring(split + 1).trim());
            }
        }

        // initialize response document parser
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(true);
        restrictExternalEntities(factory);
        try {
            docBuilder = factory.newDocumentBuilder();
        } catch (ParserConfigurationException pcE) {
            log.error("caught exception: " + pcE);
            // no point in continuing
            throw new IOException(pcE.getMessage(), pcE);
        }
    }

    /**
     * Stops the parser from fetching external entities or external DTDs named
     * by a (remote, untrusted) service response. DOCTYPE declarations remain
     * permitted, since some services include one in their responses.
     *
     * @param factory the factory to restrict
     */
    private void restrictExternalEntities(DocumentBuilderFactory factory) {
        setParserFeature(factory, "http://xml.org/sax/features/external-general-entities", false);
        setParserFeature(factory, "http://xml.org/sax/features/external-parameter-entities", false);
        setParserFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    }

    /**
     * Sets a parser feature, logging (not failing) if the parser in use
     * does not recognize it.
     */
    private void setParserFeature(DocumentBuilderFactory factory, String feature, boolean state) {
        try {
            factory.setFeature(feature, state);
        } catch (ParserConfigurationException pcE) {
            log.warn("XML parser does not support feature: " + feature);
        }
    }

    /**
     * Perform the curation task upon passed DSO
     *
     * @param dso the DSpace object
     * @return CURATE_SKIP for non-items, CURATE_FAIL if the item lacks the
     *         lookup metadata value, otherwise the status of the service call
     * @throws IOException
     */
    @Override
    public int perform(DSpaceObject dso) throws IOException  {
        int status = Curator.CURATE_SKIP;
        StringBuilder resultSb = new StringBuilder();
        if (dso.getType() == Constants.ITEM) {
            Item item = (Item)dso;
            String itemId = item.getHandle();
            if (itemId == null) {
                // we are still in workflow - no handle assigned - try title
                DCValue[] titleDc = item.getMetadata("dc", "title", null, Item.ANY);
                String title = (titleDc.length > 0) ? titleDc[0].value : "untitled - dbId: " + item.getID();
                itemId = "Workflow item: " + title;
            } else {
                itemId = "handle: " + itemId;
            }
            resultSb.append(itemId);
            // Only proceed if item has a value for service template parameter
            DCValue[] dcVals = item.getMetadata(lookupField);
            if (dcVals.length > 0 && dcVals[0].value != null && dcVals[0].value.length() > 0) {
                String value = transform(dcVals[0].value, lookupTransform);
                status = callService(value, item, resultSb);
            } else {
                resultSb.append(" lacks metadata value required for service: ").append(lookupField);
                status = Curator.CURATE_FAIL;
            }
        } else {
           resultSb.append("Object skipped");
        }
        report(resultSb.toString());
        setResult(resultSb.toString());
        return status;
    }

    /**
     * Substitutes the value into the URL template, calls the service and
     * hands a successfully parsed response to processResponse.
     *
     * @param value the (already transformed) metadata value to substitute
     * @param item the item being curated
     * @param resultSb accumulates the task result string
     * @return the status from processResponse, or CURATE_ERROR if the service
     *         could not be called or returned no usable document
     * @throws IOException if the HTTP exchange itself fails
     */
    private int callService(String value, Item item, StringBuilder resultSb) throws IOException {
        // literal substitution: the value must never be interpreted as a
        // regex replacement ('$' and '\' are legal in metadata values)
        String callUrl = urlTemplate.replace("{" + templateParam + "}", value);
        HttpGet req;
        try {
            req = new HttpGet(callUrl);
        } catch (IllegalArgumentException iaE) {
            // value produced a syntactically invalid URL
            log.error("caught exception: " + iaE);
            resultSb.append(" unable to compose valid service URL");
            return Curator.CURATE_ERROR;
        }
        for (Map.Entry<String, String> entry : headers.entrySet()) {
            req.addHeader(entry.getKey(), entry.getValue());
        }
        int status = Curator.CURATE_ERROR;
        HttpClient client = new DefaultHttpClient();
        try {
            HttpResponse resp = client.execute(req);
            int statusCode = resp.getStatusLine().getStatusCode();
            if (statusCode == HttpStatus.SC_OK) {
                HttpEntity entity = resp.getEntity();
                if (entity != null) {
                    // boiler-plate handling taken from Apache 4.1 javadoc
                    InputStream instream = entity.getContent();
                    try {
                        Document doc = docBuilder.parse(instream);
                        status = processResponse(doc, item, resultSb);
                    } catch (SAXException saxE) {
                        log.error("caught exception: " + saxE);
                        resultSb.append(" unable to read response document");
                    } catch (RuntimeException ex) {
                        // In case of an unexpected exception you may want to abort
                        // the HTTP request in order to shut down the underlying
                        // connection and release it back to the connection manager.
                        req.abort();
                        log.error("caught exception: " + ex);
                        throw ex;
                    } finally {
                        // Closing the input stream will trigger connection release
                        instream.close();
                    }
                } else {
                    log.error(" obtained no valid service response");
                    resultSb.append(" no service response");
                }
            } else {
                log.error("service returned non-OK status: " + statusCode);
                resultSb.append(" no service response");
            }
        } finally {
            // The client is used for this one call only: always shut down the
            // connection manager, whatever the outcome, so that no sockets
            // are left behind on error paths.
            client.getConnectionManager().shutdown();
        }
        return status;
    }

    /**
     * Evaluates each configured XPath expression against the response and
     * records and/or assigns the values found, per the mapping policy.
     *
     * @param doc the parsed service response
     * @param item the item being curated
     * @param resultSb accumulates the task result string
     * @return CURATE_SUCCESS if all data was processed, CURATE_FAIL if the
     *         update was not authorized, CURATE_ERROR otherwise
     * @throws IOException if the XPath expressions cannot be compiled
     */
    private int processResponse(Document doc, Item item, StringBuilder resultSb) throws IOException {
        boolean update = false;
        int status = Curator.CURATE_ERROR;
        List<String> existing = new ArrayList<String>();
        List<String> rawValues = new ArrayList<String>();
        checkNamespaces(doc);
        try {
            for (DataInfo info : dataList) {
                NodeList nodes = (NodeList)info.expr.evaluate(doc, XPathConstants.NODESET);
                // collect the text of matched nodes; an empty element has no
                // child node, and carries nothing worth recording or assigning
                rawValues.clear();
                for (int i = 0; i < nodes.getLength(); i++) {
                    Node child = nodes.item(i).getFirstChild();
                    String text = (child != null) ? child.getNodeValue() : null;
                    if (text != null) {
                        rawValues.add(text);
                    }
                }
                if (rawValues.isEmpty()) {
                    // nothing found - in particular, do not clear existing values
                    continue;
                }
                existing.clear();
                // if we are mapping, check assignment policy
                if (info.mapping != null) {
                    if ("=>".equals(info.mapping)) {
                        item.clearMetadata(info.schema, info.element, info.qualifier, Item.ANY);
                    } else if ("~>".equals(info.mapping)) {
                        if (item.getMetadata(info.schema, info.element, info.qualifier, Item.ANY).length > 0) {
                            // there are values, so don't overwrite
                            continue;
                        }
                    } else {
                        // '->' : remember current values so duplicates are not added
                        for (DCValue dcVal : item.getMetadata(info.schema, info.element, info.qualifier, Item.ANY)) {
                            existing.add(dcVal.value);
                        }
                    }
                }
                for (String raw : rawValues) {
                    String tvalue = transform(raw, info.transform);
                    // assign to metadata field if mapped && not present
                    if (info.mapping != null && ! existing.contains(tvalue)) {
                        item.addMetadata(info.schema, info.element, info.qualifier, lang, tvalue);
                        update = true;
                    }
                    // add to result string in any case
                    resultSb.append(fieldSeparator).append(info.label).append(": ").append(tvalue);
                }
            }
            // update Item if it has changed
            if (update) {
                item.update();
            }
            status = Curator.CURATE_SUCCESS;
        } catch (AuthorizeException authE) {
            log.error("caught exception: " + authE);
            resultSb.append(" not authorized to update");
            status = Curator.CURATE_FAIL;
        } catch (SQLException sqlE) {
            log.error("caught exception: " + sqlE);
            resultSb.append(" error updating metadata");
        } catch (XPathExpressionException xpeE) {
            log.error("caught exception: " + xpeE);
            resultSb.append(" error reading response document");
        }
        return status;
    }

    /**
     * Applies a transform definition (a sequence of 'operation argument'
     * pairs: cut, trunc, match, text) to a value.
     *
     * @param value the value to transform
     * @param transDef the transform definition, or null for no transform
     * @return the transformed value, or the original value if the definition
     *         is malformed or cannot be applied (the condition is logged)
     */
    private String transform(String value, String transDef) {
        if (transDef == null) {
            return value;
        }
        String[] tokens = tokenize(transDef);
        if (tokens.length % 2 != 0) {
            // every operation needs exactly one argument
            log.error("malformed transform definition: " + transDef);
            return value;
        }
        String retValue = value;
        for (int i = 0; i < tokens.length; i += 2) {
            String op = tokens[i];
            String arg = tokens[i + 1];
            if ("cut".equals(op) || "trunc".equals(op)) {
                int index;
                try {
                    index = Integer.parseInt(arg);
                } catch (NumberFormatException nfE) {
                    log.error("requested " + op + ": " + arg + " is not a number");
                    return value;
                }
                if (index < 0) {
                    log.error("requested " + op + ": " + arg + " is negative");
                    return value;
                }
                if (retValue.length() > index) {
                    if ("cut".equals(op)) {
                        retValue = retValue.substring(index);
                    } else {
                        retValue = retValue.substring(0, index);
                    }
                } else if ("cut".equals(op)) {
                    log.error("requested cut: " + index + " exceeds value length");
                    return value;
                }
                // a trunc longer than the value is simply a no-op
            } else if ("match".equals(op)) {
                int index2 = retValue.indexOf(arg);
                // index 0 is a successful match: the value already starts with the pattern
                if (index2 >= 0) {
                    retValue = retValue.substring(index2);
                } else {
                    log.error("requested match: " + arg + " failed");
                    return value;
                }
            } else if ("text".equals(op)) {
                retValue = retValue + arg;
            } else {
                log.error(" unknown transform operation: " + op);
                return value;
            }
        }
        return retValue;
    }

    /**
     * Splits a transform definition into tokens: whitespace-separated words,
     * with single-quoted runs kept together (quotes removed).
     */
    private String[] tokenize(String text)  {
        List<String> list = new ArrayList<String>();
        Matcher m = ttPattern.matcher(text);
        while (m.find()) {
            if (m.group(1) != null) {
                list.add(m.group(1));
            } else if (m.group(2) != null) {
                list.add(m.group(2));
            }
        }
        return list.toArray(new String[0]);
    }

    /**
     * Locates the mapping symbol in a datamap entry, trying '->', then '=>',
     * then '~>'.
     *
     * @return index of the first symbol kind found, or -1 if entry is unmapped
     */
    private int getMapIndex(String mapping) {
        int index = mapping.indexOf("->");
        if (index == -1) {
            index = mapping.indexOf("=>");
        }
        if (index == -1) {
            index = mapping.indexOf("~>");
        }
        return index;
    }

    /**
     * Separates an optional 'transformName:' prefix from a field name.
     *
     * @param field a field name, optionally prefixed with a transform name
     * @return two-element array: [0] the bare field name, [1] the transform
     *         definition from the 'transform.&lt;name&gt;' task property, or
     *         null if no transform was named or none is configured
     */
    private String[] parseTransform(String field) {
        String[] parsed = new String[2];
        parsed[0] = field;
        int txIdx = field.indexOf(":");
        if (txIdx > 0) {
            // transform specified
            String txName = field.substring(0, txIdx);
            parsed[1] = taskProperty("transform." + txName);
            if (parsed[1] == null) {
                log.error("no transform found for: " + txName);
            }
            parsed[0] = field.substring(txIdx + 1);
        }
        return parsed;
    }

    /**
     * On first use, captures the namespaces declared on the response root
     * element and compiles the configured XPath expressions against them.
     * Subsequent calls return immediately.
     *
     * @param document a service response document
     * @throws IOException if an XPath expression cannot be compiled
     */
    private void checkNamespaces(Document document) throws IOException {
        // skip if already done
        if (dataList.get(0).expr != null) {
            return;
        }
        try {
            XPath xpath = XPathFactory.newInstance().newXPath();
            String defaultNsUri = null;
            NamedNodeMap attrs = document.getDocumentElement().getAttributes();
            for (int i = 0; i < attrs.getLength(); i++) {
                Node n = attrs.item(i);
                String name = n.getNodeName();
                if ("xmlns".equals(name)) {
                    // it is the default name space - prefix minted below
                    defaultNsUri = n.getNodeValue();
                } else if (name.startsWith("xmlns:")) {
                    // it is a declared (non-default) namespace - capture prefix
                    nsMap.put(name.substring(name.indexOf(":") + 1), n.getNodeValue());
                }
            }
            String prefix = null;
            if (defaultNsUri != null) {
                // mint a prefix that cannot collide with one the document declares
                prefix = DEFAULT_NS_PREFIX;
                int suffix = 0;
                while (nsMap.containsKey(prefix)) {
                    prefix = DEFAULT_NS_PREFIX + (++suffix);
                }
                nsMap.put(prefix, defaultNsUri);
            }
            if (nsMap.size() > 0) {
                xpath.setNamespaceContext(this);
            }
            // now compile the XPath expressions; assign them only once all
            // have compiled, so a failure cannot leave the list half-initialized
            // (which would defeat the 'already done' check above)
            List<XPathExpression> compiled = new ArrayList<XPathExpression>(dataList.size());
            for (DataInfo info : dataList) {
                compiled.add(xpath.compile(mangleExpr(info.xpsrc, prefix)));
            }
            for (int i = 0; i < compiled.size(); i++) {
                dataList.get(i).expr = compiled.get(i);
            }
        } catch (XPathExpressionException xpeE) {
            log.error("caught exception: " + xpeE);
            // no point in continuing
            throw new IOException(xpeE.getMessage(), xpeE);
        }
    }

    /**
     * Rewrites an XPath expression so that un-prefixed element names refer to
     * the document's default namespace. The expression is split naively on
     * '/', so names inside predicates are not rewritten.
     *
     * @param expr the configured XPath expression
     * @param prefix the prefix minted for the default namespace, or null if
     *        the document has none (expression is returned unchanged)
     * @return the rewritten expression
     */
    private String mangleExpr(String expr, String prefix) {
        if (prefix == null) {
            return expr;
        }
        // OK the drill is to prepend all node names with the prefix
        // *unless* the node name already has a prefix.
        StringBuilder sb = new StringBuilder();
        int i = 0;
        while (i < expr.length()) {
            if (expr.charAt(i) == '/') {
                sb.append("/");
                i++;
            } else {
                int next = expr.indexOf("/", i);
                String token = (next > 0) ? expr.substring(i, next) : expr.substring(i);
                if (needsPrefix(token)) {
                    sb.append(prefix).append(":");
                }
                sb.append(token);
                i += token.length();
            }
        }
        return sb.toString();
    }

    /**
     * Decides whether a path step names an element that should receive the
     * default-namespace prefix. Attributes, already-prefixed names, the
     * abbreviated steps '.' and '..', and node tests or functions such as
     * 'text()' must be left alone - prefixing them yields invalid XPath.
     */
    private static boolean needsPrefix(String step) {
        if (step.startsWith("@") || step.indexOf(":") >= 0) {
            return false;
        }
        if (".".equals(step) || "..".equals(step)) {
            return false;
        }
        // look only at the node test, not at any trailing predicate
        int predIdx = step.indexOf("[");
        String nodeTest = (predIdx >= 0) ? step.substring(0, predIdx) : step;
        return nodeTest.indexOf("(") < 0;
    }

    // ---- NamespaceContext methods ---- //

    /**
     * Resolves a prefix against the namespaces captured from the response.
     *
     * @param prefix the prefix to resolve
     * @return the bound namespace URI, or XMLConstants.NULL_NS_URI if unbound
     * @throws NullPointerException if prefix is null
     */
    @Override
    public String getNamespaceURI(String prefix) {
        if (prefix == null) {
            throw new NullPointerException("Null prefix");
        } else if ("xml".equals(prefix)) {
            return XMLConstants.XML_NS_URI;
        }
        String nsURI = nsMap.get(prefix);
        return (nsURI != null) ? nsURI : XMLConstants.NULL_NS_URI;
    }

    /** Not supported: always throws UnsupportedOperationException. */
    @Override
    public String getPrefix(String uri) {
        throw new UnsupportedOperationException();
    }

    /** Not supported: always throws UnsupportedOperationException. */
    @Override
    public Iterator getPrefixes(String uri) {
        throw new UnsupportedOperationException();
    }

    /**
     * One datamap entry: an XPath expression into the response, with an
     * optional mapping onto an item metadata field.
     */
    private class DataInfo {
        public XPathExpression expr; // compiled XPath expression for data (set lazily by checkNamespaces)
        public String xpsrc;        // uncompiled XPath expression
        public String label;        // label for data in result string
        public String mapping;      // data mapping symbol: ->,=>,~>, or null = unmapped
        public String schema;       // item metadata field mapping target, null = unmapped
        public String element;      // item metadata field mapping target, null = unmapped
        public String qualifier;    // item metadata field mapping target, null = unmapped
        public String transform;    // optional transformation of data before field assignment

        /**
         * @param xpsrc uncompiled XPath expression
         * @param label label used in the result string
         * @param mapping mapping symbol, or null if unmapped
         * @param field target 'schema.element[.qualifier]', optionally with a
         *        'transform:' prefix, or null if unmapped
         * @throws IllegalArgumentException if field lacks a schema or element
         */
        public DataInfo(String xpsrc, String label, String mapping, String field) {
            this.xpsrc = xpsrc;
            this.label = label;
            this.mapping = mapping;
            if (field != null) {
                String[] parsed = parseTransform(field);
                String[] parts = parsed[0].split("\\.");
                if (parts.length < 2) {
                    throw new IllegalArgumentException(
                            "datamap target is not a schema.element[.qualifier] field: " + field);
                }
                this.schema = parts[0];
                this.element = parts[1];
                this.qualifier = (parts.length == 3) ? parts[2] : null;
                this.transform = parsed[1];
            }
        }
    }
}