mirror of
https://github.com/DSpace/DSpace.git
synced 2025-10-17 06:53:09 +00:00
[DS-733] Load testing utilities
git-svn-id: http://scm.dspace.org/svn/repo/dspace/trunk@5700 9c30dcfa-912a-0410-8fc2-9e0234be79fd
This commit is contained in:
231
dspace-api/src/main/java/org/dspace/testing/PubMedToImport.java
Normal file
231
dspace-api/src/main/java/org/dspace/testing/PubMedToImport.java
Normal file
@@ -0,0 +1,231 @@
|
||||
package org.dspace.testing;
|
||||
|
||||
import org.apache.commons.cli.CommandLine;
|
||||
import org.apache.commons.cli.Option;
|
||||
import org.apache.commons.cli.Options;
|
||||
import org.apache.commons.cli.PosixParser;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.dspace.content.DCValue;
|
||||
import org.jdom.Document;
|
||||
import org.jdom.Element;
|
||||
import org.jdom.output.Format;
|
||||
import org.jdom.output.XMLOutputter;
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.xml.sax.helpers.DefaultHandler;
|
||||
|
||||
import javax.xml.parsers.SAXParser;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Simple class to transform a medline.xml file from PubMed into DSpace import package(s)
|
||||
*
|
||||
* This is a distinctly incomplete implementation - it doesn't even attempt to map a number of fields,
|
||||
* and has no means of customizing the mapping. More importantly, it makes assumptions in parsing the xml
|
||||
* that would be problematic for a production instance.
|
||||
*
|
||||
* However, it does use SAX parsing, which means it has no problems with handling a 1GB+ input file.
|
||||
* This means it is a good way to generate a large number of realistic import packages very quickly -
|
||||
* simply go to http://www.ncbi.nlm.nih.gov/pubmed and search for something that returns a lot of records
|
||||
* ('nature' returns over 300,000 for example). Download the results as a medline.xml (and yes, it will attempt
|
||||
* to download all 300,000+ into a single file), and then run this class over that file to spit out import packages
|
||||
* which can then be loaded into DSpace using ItemImport.
|
||||
*/
|
||||
public class PubMedToImport {
|
||||
private static final Logger log = Logger.getLogger(PubMedToImport.class);
|
||||
|
||||
private static File outputDir = null;
|
||||
|
||||
public static void main(String args[]) {
|
||||
Options options = new Options();
|
||||
|
||||
options.addOption(new Option("s", "source", true, "Source xml"));
|
||||
options.addOption(new Option("o", "output", true, "Output directory"));
|
||||
|
||||
try {
|
||||
CommandLine cli = new PosixParser().parse(options, args);
|
||||
|
||||
String source = cli.getOptionValue("s");
|
||||
String output = cli.getOptionValue("o");
|
||||
|
||||
if (!new File(source).exists()) {
|
||||
throw new IllegalArgumentException("Source file does not exist");
|
||||
}
|
||||
|
||||
outputDir = new File(output);
|
||||
if (outputDir.exists()) {
|
||||
if (outputDir.list().length > 0) {
|
||||
throw new IllegalStateException("Output directory must be empty");
|
||||
}
|
||||
} else {
|
||||
if (!outputDir.mkdirs()) {
|
||||
throw new IllegalStateException("Unable to create output directory");
|
||||
}
|
||||
}
|
||||
|
||||
SAXParserFactory factory = SAXParserFactory.newInstance();
|
||||
SAXParser saxParser = factory.newSAXParser();
|
||||
|
||||
saxParser.parse(source, new PubMedHandler());
|
||||
|
||||
} catch (Exception e) {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private static class PubMedHandler extends DefaultHandler {
|
||||
private static int recordCount = 1;
|
||||
private static List<DCValue> dcValues;
|
||||
|
||||
private static StringBuilder value;
|
||||
private static StringBuilder lastName;
|
||||
private static StringBuilder firstName;
|
||||
|
||||
private static boolean isCorrection = false;
|
||||
private static boolean isLastName = false;
|
||||
private static boolean isFirstName = false;
|
||||
|
||||
private static void addDCValue(String element, String qualifier, String value) {
|
||||
if (dcValues == null) {
|
||||
dcValues = new ArrayList<DCValue>();
|
||||
}
|
||||
|
||||
DCValue thisValue = new DCValue();
|
||||
thisValue.schema = "dc";
|
||||
thisValue.element = element;
|
||||
thisValue.qualifier = qualifier;
|
||||
thisValue.value = value;
|
||||
|
||||
dcValues.add(thisValue);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException
|
||||
{
|
||||
if ("PubmedArticle".equals(qName)) {
|
||||
System.out.println("Starting record " + recordCount);
|
||||
} else if ("CommensCorrectionsList".equals(qName)) {
|
||||
isCorrection = true;
|
||||
} else if ("ForeName".equals(qName)) {
|
||||
isFirstName = true;
|
||||
firstName = new StringBuilder();
|
||||
} else if ("LastName".equals(qName)) {
|
||||
isLastName = true;
|
||||
lastName = new StringBuilder();
|
||||
} else {
|
||||
value = new StringBuilder();
|
||||
}
|
||||
|
||||
super.startElement(uri, localName, qName, attributes);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void endElement(String uri, String localName, String qName) throws SAXException
|
||||
{
|
||||
if (!isCorrection) {
|
||||
if ("PMID".equals(qName)) {
|
||||
addDCValue("identifier", null, value.toString());
|
||||
} else if ("ISSN".equals(qName)) {
|
||||
addDCValue("identifier", "issn", value.toString());
|
||||
} else if ("ArticleTitle".equals(qName)) {
|
||||
addDCValue("title", null, value.toString());
|
||||
} else if ("AbstractText".equals(qName)) {
|
||||
addDCValue("description", "abstract", value.toString());
|
||||
} else if ("PublicationType".equals(qName)) {
|
||||
addDCValue("type", null, value.toString());
|
||||
} else if ("Author".equals(qName)) {
|
||||
addDCValue("contributor", "author", lastName + ", " + firstName);
|
||||
} else if ("DescriptorName".equals(qName)) {
|
||||
addDCValue("subject", "mesh", value.toString());
|
||||
}
|
||||
} else {
|
||||
if ("MedlineCitation".equals(qName)) {
|
||||
isCorrection = false;
|
||||
}
|
||||
}
|
||||
|
||||
if ("PubmedArticle".equals(qName)) {
|
||||
try {
|
||||
writeItem();
|
||||
} catch (IOException e) {
|
||||
throw new IllegalStateException("Unable to export record", e);
|
||||
}
|
||||
System.out.println("Ending record " + recordCount);
|
||||
recordCount++;
|
||||
}
|
||||
|
||||
isFirstName = false;
|
||||
isLastName = false;
|
||||
super.endElement(uri, localName, qName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void characters(char[] chars, int start, int length) throws SAXException
|
||||
{
|
||||
if (isFirstName) {
|
||||
firstName.append(chars, start, length);
|
||||
// firstName = String.copyValueOf(chars, start, length);
|
||||
} else if (isLastName) {
|
||||
lastName.append(chars, start, length);
|
||||
// lastName = String.copyValueOf(chars, start, length);
|
||||
} else {
|
||||
value.append(chars, start, length);
|
||||
// value = String.copyValueOf(chars, start, length);
|
||||
}
|
||||
|
||||
super.characters(chars, start, length);
|
||||
}
|
||||
|
||||
private void writeItem() throws IOException
|
||||
{
|
||||
File itemDir = new File(outputDir, String.valueOf(recordCount));
|
||||
itemDir.mkdirs();
|
||||
|
||||
new File(itemDir, "contents").createNewFile();
|
||||
|
||||
Document doc = new Document();
|
||||
Element root = new Element("dublin_core");
|
||||
|
||||
doc.setRootElement(root);
|
||||
|
||||
for (DCValue dcValue : dcValues)
|
||||
{
|
||||
Element dcNode = new Element("dcvalue");
|
||||
|
||||
dcNode.setAttribute("element", dcValue.element);
|
||||
|
||||
if (!StringUtils.isEmpty(dcValue.qualifier))
|
||||
{
|
||||
dcNode.setAttribute("qualifier", dcValue.qualifier);
|
||||
}
|
||||
|
||||
dcNode.setText(dcValue.value);
|
||||
|
||||
root.addContent(dcNode);
|
||||
}
|
||||
|
||||
|
||||
File dc = new File(itemDir, "dublin_core.xml");
|
||||
XMLOutputter dcOutput = new XMLOutputter(Format.getPrettyFormat().setEncoding("UTF-8"));
|
||||
OutputStream out = null;
|
||||
try {
|
||||
out = new BufferedOutputStream(new FileOutputStream(dc));
|
||||
dcOutput.output(doc, out);
|
||||
} finally {
|
||||
if (out != null) {
|
||||
out.close();
|
||||
}
|
||||
}
|
||||
|
||||
dcValues.clear();
|
||||
}
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user