Merge pull request #392 from zuki/DS-1790

DS-1790 A new dataloader for CiNii article
This commit is contained in:
Kostas Stamatis
2013-12-04 22:22:07 -08:00
10 changed files with 837 additions and 0 deletions

View File

@@ -0,0 +1,172 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.submit.lookup;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.dspace.app.util.XMLUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.xml.sax.SAXException;
import gr.ekt.bte.core.DataLoadingSpec;
import gr.ekt.bte.core.Record;
import gr.ekt.bte.core.RecordSet;
import gr.ekt.bte.core.Value;
import gr.ekt.bte.dataloader.FileDataLoader;
import gr.ekt.bte.exceptions.MalformedSourceException;
/**
 * Loads metadata from a CiNii formatted (RDF/XML) file.
 * <p>
 * The file is parsed into a DOM tree, converted into a single {@link Record}
 * by {@link CiNiiUtils#convertCiNiiDomToRecord(Element)} and its field names
 * are then translated through the configured field map.
 *
 * @author Keiji Suzuki
 *
 */
public class CiNiiFileDataLoader extends FileDataLoader
{
    private static final Logger log = Logger.getLogger(CiNiiFileDataLoader.class);

    // mapping between service fields and local intermediate fields
    Map<String, String> fieldMap;

    /**
     * Empty constructor
     */
    public CiNiiFileDataLoader()
    {
    }

    /**
     * @param filename
     *            path of the CiNii file to load
     */
    public CiNiiFileDataLoader(String filename)
    {
        super(filename);
    }

    /**
     * Parses the configured file and returns the record it contains.
     * <p>
     * I/O and parsing problems are logged and result in an empty record set;
     * they are not propagated to the caller.
     *
     * @return a record set holding at most one record
     * @see gr.ekt.bte.core.DataLoader#getRecords()
     */
    @Override
    public RecordSet getRecords() throws MalformedSourceException
    {
        RecordSet recordSet = new RecordSet();
        InputStream inputStream = null;

        try
        {
            inputStream = new FileInputStream(new File(filename));

            DocumentBuilderFactory factory = DocumentBuilderFactory
                    .newInstance();
            factory.setValidating(false);
            factory.setIgnoringComments(true);
            factory.setIgnoringElementContentWhitespace(true);
            // The file is supplied by the submitter: never resolve external
            // entities or fetch external DTDs while parsing it (XXE).
            factory.setFeature(
                    "http://xml.org/sax/features/external-general-entities",
                    false);
            factory.setFeature(
                    "http://xml.org/sax/features/external-parameter-entities",
                    false);
            factory.setFeature(
                    "http://apache.org/xml/features/nonvalidating/load-external-dtd",
                    false);

            DocumentBuilder db = factory.newDocumentBuilder();
            Document inDoc = db.parse(inputStream);
            Element xmlRoot = inDoc.getDocumentElement();

            // There is no element to represent a record, so we can not
            // process multiple records at once.
            Record record = CiNiiUtils.convertCiNiiDomToRecord(xmlRoot);
            if (record != null)
            {
                recordSet.addRecord(convertFields(record));
            }
        }
        catch (FileNotFoundException e)
        {
            log.error(e.getMessage(), e);
        }
        catch (ParserConfigurationException e)
        {
            log.error(e.getMessage(), e);
        }
        catch (SAXException e)
        {
            log.error(e.getMessage(), e);
        }
        catch (IOException e)
        {
            log.error(e.getMessage(), e);
        }
        finally
        {
            // Always release the file handle, even when parsing failed.
            if (inputStream != null)
            {
                try
                {
                    inputStream.close();
                }
                catch (IOException e)
                {
                    log.warn(e.getMessage(), e);
                }
            }
        }

        return recordSet;
    }

    /**
     * Returns the records of the file for the first page only: the file
     * holds a single record, so any positive offset yields an empty set.
     *
     * @see gr.ekt.bte.core.DataLoader#getRecords(gr.ekt.bte.core.DataLoadingSpec)
     */
    @Override
    public RecordSet getRecords(DataLoadingSpec spec)
            throws MalformedSourceException
    {
        if (spec.getOffset() > 0)
        {
            return new RecordSet();
        }
        return getRecords();
    }

    /**
     * Renames the fields of the given record according to the field map.
     * <p>
     * Entries with a blank target name are skipped. Records that are not
     * mutable are returned untouched, as is every record when no field map
     * has been configured.
     *
     * @param publication
     *            the record to convert
     * @return the same record instance, with its fields renamed
     */
    public Record convertFields(Record publication)
    {
        // Guard first: the map is injected through a setter and may be unset.
        if (fieldMap == null)
        {
            return publication;
        }

        for (Map.Entry<String, String> mapping : fieldMap.entrySet())
        {
            String fieldName = mapping.getKey();
            String md = mapping.getValue();

            if (StringUtils.isBlank(md))
            {
                continue;
            }
            md = md.trim();

            if (publication.isMutable())
            {
                // Read the values before removing the field they belong to.
                List<Value> values = publication.getValues(fieldName);
                publication.makeMutable().removeField(fieldName);
                publication.makeMutable().addField(md, values);
            }
        }

        return publication;
    }

    /**
     * @param fieldMap
     *            mapping from CiNii field names to local intermediate field
     *            names
     */
    public void setFieldMap(Map<String, String> fieldMap)
    {
        this.fieldMap = fieldMap;
    }
}

View File

@@ -0,0 +1,120 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.submit.lookup;
import gr.ekt.bte.core.Record;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.httpclient.HttpException;
import org.dspace.core.Context;
/**
 * Loads metadata from the CiNii RDF API, either by CiNii identifier or by a
 * free search on title, author and year.
 * <p>
 * Both lookups require an application id to be set with
 * {@link #setAppId(String)}.
 *
 * @author Keiji Suzuki
 */
public class CiNiiOnlineDataLoader extends NetworkSubmissionLookupDataLoader
{
    private CiNiiService ciniiService = new CiNiiService();

    private boolean searchProvider = true;

    /** Application id to use CiNii */
    private String appId = null;

    /** max result number to return */
    private int maxResults = 10;

    public void setCiNiiService(CiNiiService ciniiService)
    {
        this.ciniiService = ciniiService;
    }

    /**
     * @return a list holding the single identifier type handled by this
     *         loader, {@code CINII}
     */
    @Override
    public List<String> getSupportedIdentifiers()
    {
        return Arrays.asList(new String[] { CINII });
    }

    public void setSearchProvider(boolean searchProvider)
    {
        this.searchProvider = searchProvider;
    }

    @Override
    public boolean isSearchProvider()
    {
        return searchProvider;
    }

    /**
     * Fetches one record per CiNii identifier found in {@code keys}.
     *
     * @param context
     *            not used by this implementation
     * @param keys
     *            identifiers grouped by identifier type; only the entries
     *            stored under {@code CINII} are looked up. May be
     *            {@code null}.
     * @return the records found, with their fields converted; never
     *         {@code null}
     * @throws RuntimeException
     *             if no application id has been configured
     */
    @Override
    public List<Record> getByIdentifier(Context context,
            Map<String, Set<String>> keys) throws HttpException, IOException
    {
        requireAppId();

        List<Record> results = new ArrayList<Record>();
        if (keys == null)
        {
            return results;
        }

        Set<String> ciniiids = keys.get(CINII);
        if (ciniiids == null)
        {
            return results;
        }

        for (String ciniiid : ciniiids)
        {
            Record record = ciniiService.getByCiNiiID(ciniiid, getAppId());
            if (record != null)
            {
                results.add(convertFields(record));
            }
        }
        return results;
    }

    /**
     * Searches CiNii by title, author and year.
     * <p>
     * The records returned by the service are passed through
     * {@code convertFields}, exactly as {@link #getByIdentifier} does, so
     * that both lookups expose the same field names.
     *
     * @param context
     *            not used by this implementation
     * @return the records found, with their fields converted; never
     *         {@code null}
     * @throws RuntimeException
     *             if no application id has been configured
     */
    @Override
    public List<Record> search(Context context, String title, String author, int year)
            throws HttpException, IOException
    {
        requireAppId();

        List<Record> results = new ArrayList<Record>();
        List<Record> found = ciniiService.searchByTerm(title, author, year,
                getMaxResults(), getAppId());
        if (found != null)
        {
            for (Record record : found)
            {
                if (record != null)
                {
                    results.add(convertFields(record));
                }
            }
        }
        return results;
    }

    /**
     * Fails fast when no application id has been set.
     * <p>
     * Only {@code null} is rejected: a blank id is still accepted here, as
     * before, so existing configurations keep behaving the same way.
     */
    private void requireAppId()
    {
        if (appId == null)
        {
            throw new RuntimeException("No CiNii Application ID is specified!");
        }
    }

    public String getAppId()
    {
        return appId;
    }

    public void setAppId(String appId)
    {
        this.appId = appId;
    }

    public int getMaxResults()
    {
        return maxResults;
    }

    public void setMaxResults(int maxResults)
    {
        this.maxResults = maxResults;
    }
}

View File

@@ -0,0 +1,221 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.submit.lookup;
import gr.ekt.bte.core.Record;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.dspace.app.util.XMLUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
/**
 * Thin client for the CiNii web APIs: the OpenSearch API is used to find
 * CiNii identifiers (NAIDs) and the RDF API to fetch the metadata of one
 * identifier.
 *
 * @author Keiji Suzuki
 */
public class CiNiiService
{
    /** log4j category */
    private static Logger log = Logger.getLogger(CiNiiService.class);

    /** Prefix of the URL identifying a CiNii article; the NAID follows it */
    private static final String NAID_BASE_URL = "http://ci.nii.ac.jp/naid/";

    /** Value handed to {@code HttpClient.setTimeout} for every request */
    private int timeout = 1000;

    public void setTimeout(int timeout)
    {
        this.timeout = timeout;
    }

    /**
     * Fetches the metadata of a single CiNii article.
     *
     * @param id
     *            the CiNii NAID
     * @param appId
     *            the CiNii application id
     * @return the converted record
     * @throws RuntimeException
     *             if the HTTP call does not return 200 or the response can
     *             not be parsed
     */
    public Record getByCiNiiID(String id, String appId) throws HttpException,
            IOException
    {
        return search(id, appId);
    }

    /**
     * Searches CiNii by title, author and year and fetches the metadata of
     * every hit.
     *
     * @param year
     *            publication year, or -1 to leave it out of the query
     * @return the records found; empty when no query term was given or
     *         nothing matched
     * @throws RuntimeException
     *             if an HTTP call does not return 200 or a response can not
     *             be parsed
     */
    public List<Record> searchByTerm(String title, String author, int year,
            int maxResults, String appId)
            throws HttpException, IOException
    {
        List<Record> records = new ArrayList<Record>();

        for (String id : getCiNiiIDs(title, author, year, maxResults, appId))
        {
            Record record = search(id, appId);
            if (record != null)
            {
                records.add(record);
            }
        }

        return records;
    }

    /**
     * Creates a non-validating DOM parser that ignores comments and does
     * not resolve external entities or external DTDs: the documents parsed
     * here come from a remote server fetched over plain HTTP.
     */
    private DocumentBuilder newDocumentBuilder()
            throws javax.xml.parsers.ParserConfigurationException
    {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setValidating(false);
        factory.setIgnoringComments(true);
        factory.setIgnoringElementContentWhitespace(true);
        factory.setFeature(
                "http://xml.org/sax/features/external-general-entities",
                false);
        factory.setFeature(
                "http://xml.org/sax/features/external-parameter-entities",
                false);
        factory.setFeature(
                "http://apache.org/xml/features/nonvalidating/load-external-dtd",
                false);
        return factory.newDocumentBuilder();
    }

    /**
     * Get metadata by searching CiNii RDF API with CiNii NAID
     *
     */
    private Record search(String id, String appId)
            throws IOException, HttpException
    {
        GetMethod method = null;
        try
        {
            HttpClient client = new HttpClient();
            client.setTimeout(timeout);

            // Encode both values: the id is typed in by the submitter and
            // must not be able to alter the structure of the URL.
            method = new GetMethod(NAID_BASE_URL
                    + URLEncoder.encode(String.valueOf(id), "UTF-8")
                    + ".rdf?appid="
                    + URLEncoder.encode(String.valueOf(appId), "UTF-8"));

            // Execute the method.
            int statusCode = client.executeMethod(method);
            if (statusCode != HttpStatus.SC_OK)
            {
                if (statusCode == HttpStatus.SC_BAD_REQUEST)
                {
                    throw new RuntimeException("CiNii RDF is not valid");
                }
                throw new RuntimeException("CiNii RDF Http call failed: "
                        + method.getStatusLine());
            }

            try
            {
                Document inDoc = newDocumentBuilder().parse(
                        method.getResponseBodyAsStream());
                Element xmlRoot = inDoc.getDocumentElement();
                return CiNiiUtils.convertCiNiiDomToRecord(xmlRoot);
            }
            catch (Exception e)
            {
                // Keep the cause so that the real failure can be diagnosed.
                throw new RuntimeException(
                        "CiNii RDF identifier is not valid or not exist", e);
            }
        }
        finally
        {
            if (method != null)
            {
                method.releaseConnection();
            }
        }
    }

    /**
     * Get CiNii NAIDs by searching CiNii OpenURL API with title, author and year
     *
     * @return the NAIDs found; empty (never {@code null}) when no query
     *         term was given or nothing matched
     */
    private List<String> getCiNiiIDs(String title, String author, int year,
            int maxResults, String appId)
            throws IOException, HttpException
    {
        List<String> ids = new ArrayList<String>();

        // Need at least one query term
        if (title == null && author == null && year == -1)
        {
            return ids;
        }

        GetMethod method = null;
        try
        {
            HttpClient client = new HttpClient();
            client.setTimeout(timeout);

            StringBuilder query = new StringBuilder();
            query.append("format=rss&appid=")
                    .append(URLEncoder.encode(String.valueOf(appId), "UTF-8"))
                    .append("&count=").append(maxResults);
            if (title != null)
            {
                query.append("&title=").append(URLEncoder.encode(title, "UTF-8"));
            }
            if (author != null)
            {
                query.append("&author=").append(URLEncoder.encode(author, "UTF-8"));
            }
            if (year != -1)
            {
                query.append("&year_from=").append(String.valueOf(year));
                query.append("&year_to=").append(String.valueOf(year));
            }
            method = new GetMethod("http://ci.nii.ac.jp/opensearch/search?"+query.toString());

            // Execute the method.
            int statusCode = client.executeMethod(method);
            if (statusCode != HttpStatus.SC_OK)
            {
                if (statusCode == HttpStatus.SC_BAD_REQUEST)
                {
                    throw new RuntimeException("CiNii OpenSearch query is not valid");
                }
                throw new RuntimeException("CiNii OpenSearch call failed: "
                        + method.getStatusLine());
            }

            try
            {
                Document inDoc = newDocumentBuilder().parse(
                        method.getResponseBodyAsStream());
                Element xmlRoot = inDoc.getDocumentElement();
                List<Element> items = XMLUtils.getElementList(xmlRoot, "item");

                // The NAID is whatever follows the base URL in rdf:about.
                int url_len = NAID_BASE_URL.length();
                for (Element item : items)
                {
                    String about = item.getAttribute("rdf:about");
                    if (about.length() > url_len)
                    {
                        ids.add(about.substring(url_len));
                    }
                }

                return ids;
            }
            catch (Exception e)
            {
                // Keep the cause so that the real failure can be diagnosed.
                throw new RuntimeException(
                        "CiNii OpenSearch results is not valid or not exist", e);
            }
        }
        finally
        {
            if (method != null)
            {
                method.releaseConnection();
            }
        }
    }
}

View File

@@ -0,0 +1,268 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
/**
*
*/
package org.dspace.submit.lookup;
import java.util.LinkedList;
import java.util.List;
import gr.ekt.bte.core.MutableRecord;
import gr.ekt.bte.core.Record;
import gr.ekt.bte.core.StringValue;
import gr.ekt.bte.core.Value;
import org.apache.commons.lang.StringUtils;
import org.dspace.app.util.XMLUtils;
import org.dspace.submit.util.SubmissionLookupPublication;
import org.w3c.dom.Element;
/**
 * Helper converting a CiNii RDF document into a {@link Record}.
 *
 * @author Keiji Suzuki
 *
 */
public class CiNiiUtils
{
    /**
     * Builds a record from the root element of a CiNii RDF document.
     * <p>
     * The first {@code rdf:Description} element is read as the Japanese
     * description and the second one as the English description. Title,
     * authors and publisher are taken from the description matching the
     * article language ({@code dc:language}, defaulting to "ja"); title and
     * authors of the other description are stored as alternatives.
     *
     * @param xmlRoot
     *            root element of the CiNii RDF document
     * @return the populated record, or an empty record when the document
     *         has fewer than three {@code rdf:Description} elements
     */
    public static Record convertCiNiiDomToRecord(Element xmlRoot)
    {
        MutableRecord record = new SubmissionLookupPublication("");

        List<Element> list = XMLUtils.getElementList(xmlRoot, "rdf:Description");
        // Valid CiNii record should have three rdf:Description elements
        if (list.size() < 3)
        {
            return record;
        }

        Element description_ja = list.get(0); // Japanese description
        Element description_en = list.get(1); // English description
        // list.get(2) holds the authors information, which is not used here

        String language = XMLUtils.getElementValue(description_ja, "dc:language");
        language = language != null ? language.toLowerCase() : "ja";
        record.addValue("language", new StringValue(language));

        if ("ja".equals(language) || "jpn".equals(language))
        {
            addLocalizedFields(record, description_ja, description_en);
        }
        else
        {
            addLocalizedFields(record, description_en, description_ja);
        }

        // Abstracts: keep both languages when both are available.
        String abstract_ja = XMLUtils.getElementValue(description_ja, "dc:description");
        String abstract_en = XMLUtils.getElementValue(description_en, "dc:description");
        if (abstract_ja != null && abstract_en != null)
        {
            List<Value> description = new LinkedList<Value>();
            description.add(new StringValue(abstract_ja));
            description.add(new StringValue(abstract_en));
            record.addField("description", description);
        }
        else if (abstract_ja != null)
        {
            record.addValue("description", new StringValue(abstract_ja));
        }
        else if (abstract_en != null)
        {
            record.addValue("description", new StringValue(abstract_en));
        }

        List<Value> subjects = getSubjects(description_ja);
        subjects.addAll(getSubjects(description_en));
        if (subjects.size() > 0)
        {
            record.addField("subjects", subjects);
        }

        // Journal name: "<japanese> = <english>" when both are available.
        String journal_j = XMLUtils.getElementValue(description_ja, "prism:publicationName");
        String journal_e = XMLUtils.getElementValue(description_en, "prism:publicationName");
        if (journal_j != null && journal_e != null)
        {
            record.addValue("journal", new StringValue(journal_j+" = "+journal_e));
        }
        else if (journal_j != null)
        {
            record.addValue("journal", new StringValue(journal_j));
        }
        else if (journal_e != null)
        {
            record.addValue("journal", new StringValue(journal_e));
        }

        addIfPresent(record, "volume",
                XMLUtils.getElementValue(description_ja, "prism:volume"));
        addIfPresent(record, "issue",
                XMLUtils.getElementValue(description_ja, "prism:number"));

        String spage = XMLUtils.getElementValue(description_ja, "prism:startingPage");
        addIfPresent(record, "spage", spage);
        String epage = XMLUtils.getElementValue(description_ja, "prism:endingPage");
        addIfPresent(record, "epage", epage);

        // Fall back on the page range when no explicit starting page exists.
        String pages = XMLUtils.getElementValue(description_ja, "prism:pageRange");
        if (pages != null && spage == null)
        {
            String rangeStart;
            String rangeEnd;

            int pos = pages.indexOf("-");
            if (pos > -1)
            {
                rangeStart = pages.substring(0, pos).trim();
                rangeEnd = pages.substring(pos + 1).trim();
                // Expand an abbreviated end page, e.g. "1234-56" -> "1256".
                if (!rangeEnd.equals("") && rangeStart.length() > rangeEnd.length())
                {
                    rangeEnd = rangeStart.substring(0,
                            rangeStart.length() - rangeEnd.length()) + rangeEnd;
                }
            }
            else
            {
                rangeStart = pages;
                rangeEnd = "";
            }

            record.addValue("spage", new StringValue(rangeStart));
            // Only use the derived end page when prism:endingPage did not
            // already provide one above.
            if (epage == null && !rangeEnd.equals(""))
            {
                record.addValue("epage", new StringValue(rangeEnd));
            }
        }

        addIfPresent(record, "issn",
                XMLUtils.getElementValue(description_ja, "prism:issn"));
        addIfPresent(record, "issued",
                XMLUtils.getElementValue(description_ja, "prism:publicationDate"));
        addIfPresent(record, "ncid",
                XMLUtils.getElementValue(description_ja, "cinii:ncid"));
        addIfPresent(record, "naid",
                XMLUtils.getElementValue(description_ja, "cinii:naid"));

        return record;
    }

    /**
     * Adds title, authors and publisher from the primary description, and
     * the alternative title and authors from the secondary description.
     */
    private static void addLocalizedFields(MutableRecord record,
            Element primary, Element secondary)
    {
        addIfPresent(record, "title",
                XMLUtils.getElementValue(primary, "dc:title"));
        addIfPresent(record, "titleAlternative",
                XMLUtils.getElementValue(secondary, "dc:title"));

        List<Value> authors = getAuthors(primary);
        if (authors.size() > 0)
        {
            record.addField("authors", authors);
        }

        List<Value> authorAlternative = getAuthors(secondary);
        if (authorAlternative.size() > 0)
        {
            record.addField("authorAlternative", authorAlternative);
        }

        addIfPresent(record, "publisher",
                XMLUtils.getElementValue(primary, "dc:publisher"));
    }

    /**
     * Adds a single string value to the record unless it is {@code null}.
     */
    private static void addIfPresent(MutableRecord record, String field,
            String value)
    {
        if (value != null)
        {
            record.addValue(field, new StringValue(value));
        }
    }

    /**
     * Reads the {@code dc:creator} values of the element, inserting a comma
     * before the first space of each name ("Suzuki Keiji" becomes
     * "Suzuki, Keiji").
     */
    private static List<Value> getAuthors(Element element)
    {
        List<Value> authors = new LinkedList<Value>();

        List<String> authorList = XMLUtils.getElementValueList(element, "dc:creator");
        if (authorList != null)
        {
            for (String author : authorList)
            {
                int pos = author.indexOf(" ");
                if (pos > -1)
                {
                    author = author.substring(0, pos) + "," + author.substring(pos);
                }
                authors.add(new StringValue(author));
            }
        }

        return authors;
    }

    /**
     * Reads the non blank {@code dc:title} attributes of the
     * {@code foaf:topic} elements of the given element.
     */
    private static List<Value> getSubjects(Element element)
    {
        List<Value> subjects = new LinkedList<Value>();

        List<Element> topicList = XMLUtils.getElementList(element, "foaf:topic");
        for (Element topic : topicList)
        {
            String attrValue = topic.getAttribute("dc:title");
            if (StringUtils.isNotBlank(attrValue))
            {
                subjects.add(new StringValue(attrValue.trim()));
            }
        }

        return subjects;
    }
}

View File

@@ -37,6 +37,8 @@ public interface SubmissionLookupDataLoader extends DataLoader
public final static String SCOPUSEID = "scopuseid";
public final static String CINII = "cinii";
public final static String TYPE = "type";
List<String> getSupportedIdentifiers();

View File

@@ -1719,6 +1719,8 @@ jsp.submit.start-lookup-submission.identifier-pubmed = PubMed ID
jsp.submit.start-lookup-submission.identifier-pubmed.hint = e.g. 20524090
jsp.submit.start-lookup-submission.identifier-arxiv = arXiv ID
jsp.submit.start-lookup-submission.identifier-arxiv.hint = e.g. arXiv:1302.1497
jsp.submit.start-lookup-submission.identifier-cinii = CiNii NAID
jsp.submit.start-lookup-submission.identifier-cinii.hint = e.g. 110004744915
jsp.submit.start-lookup-submission.search = Free search
jsp.submit.start-lookup-submission.search.hints = Insert base info about publication: either <b>title</b> or <b>author/year</b> is required.<br/>If you know any unique identifier about publication like <b>DOI</b>, <b>Pubmed</b>, or <b>arXiv</b> you can switch on the <span id="link-ricerca-identificatore">identifier search mode</span>.

View File

@@ -385,6 +385,7 @@ public class SubmissionLookupJSONRequest extends JSONRequest
defaultValues.add("journal");
defaultValues.add("volume");
defaultValues.add("issue");
defaultValues.add("publisher");
defaultValues.add("jissn");
defaultValues.add("pisbn");
defaultValues.add("eisbn");

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.9 KiB

View File

@@ -20,6 +20,7 @@
<entry key="pubmedXML" value-ref="pubmedFileDataLoader" />
<entry key="crossrefXML" value-ref="crossRefFileDataLoader" />
<entry key="arxivXML" value-ref="arXivFileDataLoader" />
<entry key="ciniiXML" value-ref="ciniiFileDataLoader" />
<entry key="bibtex" value-ref="bibTeXDataLoader" />
<entry key="ris" value-ref="risDataLoader" />
<entry key="endnote" value-ref="endnoteDataLoader" />
@@ -76,6 +77,7 @@
<value>journal</value>
<value>volume</value>
<value>issue</value>
<value>publisher</value>
<value>jissn</value>
<value>jeissn</value>
<value>pisbn</value>
@@ -110,9 +112,11 @@
<entry key="pubmed" value-ref="pubmedOnlineDataLoader"/>
<entry key="crossref" value-ref="crossRefOnlineDataLoader"/>
<entry key="arxiv" value-ref="arXivOnlineDataLoader"/>
<entry key="cinii" value-ref="ciniiOnlineDataLoader"/>
<entry key="pubmedXML" value-ref="pubmedFileDataLoader"/>
<entry key="crossRefXML" value-ref="crossRefFileDataLoader"/>
<entry key="arXivXML" value-ref="arXivFileDataLoader"/>
<entry key="ciniiXML" value-ref="ciniiFileDataLoader"/>
<entry key="bibtex" value-ref="bibTeXDataLoader"/>
<entry key="ris" value-ref="risDataLoader"/>
<entry key="endnote" value-ref="endnoteDataLoader"/>
@@ -475,6 +479,48 @@
</constructor-arg>
</bean>
<!-- CiNii -->
<bean id="ciniiOnlineDataLoader" class="org.dspace.submit.lookup.CiNiiOnlineDataLoader">
<property name="searchProvider" value="false" />
<!-- For CiNii service you need to obtain an Application ID from NII.
Once you get it, add it to the following configuration value.
For details, see http://ci.nii.ac.jp/info/en/api/developer.html
-->
<property name="appId" value="" />
<!-- Uncomment the following line if you want to define the max results
returned by the CiNii free text (by author, title, date) search.
Default value is 10
-->
<!-- <property name="maxResults" value="10" /> -->
<property name="fieldMap" ref="ciniiInputMap" />
</bean>
<bean id="ciniiFileDataLoader" class="org.dspace.submit.lookup.CiNiiFileDataLoader">
<property name="fieldMap" ref="ciniiInputMap" />
</bean>
<bean name="ciniiInputMap" class="java.util.HashMap" scope="prototype">
<constructor-arg>
<map key-type="java.lang.String" value-type="java.lang.String">
<entry key="naid" value="naid" />
<entry key="ncid" value="ncid" />
<entry key="issn" value="jissn" />
<entry key="journal" value="journal" />
<entry key="title" value="title" />
<entry key="issued" value="issued" />
<entry key="volume" value="volume" />
<entry key="issue" value="issue" />
<entry key="spage" value="firstpage" />
<entry key="epage" value="lastpage" />
<entry key="language" value="language" />
<entry key="description" value="abstract" />
<entry key="subjects" value="keywords" />
<entry key="authors" value="authors" />
<entry key="publisher" value="publisher" />
</map>
</constructor-arg>
</bean>
<!-- **************************************************************************************************** -->
<!-- Output Mapping -->
<!-- **************************************************************************************************** -->
@@ -503,6 +549,7 @@
<entry value="allkeywords" key="dc.subject" />
<entry value="arxivCategory" key="dc.subject" />
<entry value="doi" key="dc.identifier" />
<entry value="publisher" key="dc.publisher" />
<!-- Not used - new metadata fields need to be declared for them in DSpace registry -->
<!--
<entry value="url" key="" />
@@ -520,6 +567,10 @@
<entry value="editionnumber" key="" />
<entry value="seriestitle" key="" />
<entry value="volumetitle" key="" />
<entry value="titleAlternative" key="" />
<entry value="authorAlternative" key="" />
<entry value="ncid" key="" />
<entry value="naid" key="" />
-->
</map>