DS-2952: Use a SequenceInputStream to add the content of multiple full text bitstreams to SOLR

This commit is contained in:
Tom Desair
2016-12-28 23:47:12 +01:00
parent 1137d4562c
commit 3b2d8f3669
4 changed files with 208 additions and 135 deletions

View File

@@ -16,16 +16,13 @@ import java.math.BigInteger;
import java.rmi.dgc.VMID;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Random;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.Date;
import java.util.Calendar;
import java.util.GregorianCalendar;
import java.text.SimpleDateFormat;
import java.text.ParseException;
import com.coverity.security.Escape;
import org.apache.commons.collections.CollectionUtils;
import org.apache.log4j.Logger;
/**
@@ -413,4 +410,8 @@ public final class Utils
int rl = result.length();
return result.substring(0, rl-2) + ":" + result.substring(rl-2);
}
/**
 * Null-safe accessor for collections, convenient for use in for-each loops.
 *
 * @param collection the collection to check, may be {@code null}
 * @param <E> the element type
 * @return the given collection itself when it is not {@code null},
 *         otherwise an immutable empty list
 */
public static <E> Collection<E> emptyIfNull(Collection<E> collection) {
    if (collection != null) {
        return collection;
    }
    return Collections.<E>emptyList();
}
}

View File

@@ -1,81 +0,0 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.discovery;
import org.apache.log4j.Logger;
import org.apache.solr.common.util.ContentStreamBase;
import org.dspace.content.Bitstream;
import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.BitstreamService;
import org.dspace.core.Context;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.sql.SQLException;
/**
 * A Solr <code>ContentStream</code> backed by a single DSpace <code>Bitstream</code>.
 * The name, size, source info and content type are copied from the bitstream when
 * the object is constructed; the content itself is only retrieved on demand.
 */
public class BitstreamContentStream extends ContentStreamBase
{
    private static final Logger log = Logger.getLogger(BitstreamContentStream.class);

    protected final Context context;
    protected final Bitstream file;
    protected BitstreamService bitstreamService;

    /**
     * @param context the DSpace context used to look up the format and retrieve the content
     * @param f the bitstream to expose as a content stream
     * @throws SQLException declared by the bitstream format lookup
     */
    public BitstreamContentStream(Context context, Bitstream f ) throws SQLException {
        file = f;
        this.context = context;
        contentType = f.getFormat(context).getMIMEType();
        name = file.getName();
        size = file.getSize();
        sourceInfo = file.getName();
        bitstreamService = ContentServiceFactory.getInstance().getBitstreamService();
    }

    /**
     * Returns the MIME type taken from the bitstream format. When that is
     * <code>null</code>, the first byte of the content is inspected: a leading
     * '&lt;' yields "application/xml" and a leading '{' yields "application/json".
     * Failures while sniffing are logged, and the (null) stored type is returned.
     */
    @Override
    public String getContentType() {
        if(contentType==null) {
            InputStream stream = null;
            try {
                stream = bitstreamService.retrieve(context, file);
                char first = (char)stream.read();
                if(first == '<') {
                    return "application/xml";
                }
                if(first == '{') {
                    return "application/json";
                }
            } catch(Exception ex) {
                log.error("Error determining content type for bitstream:" + file.getID(), ex);
            } finally {
                if (stream != null) try {
                    stream.close();
                } catch (IOException ioe) {
                    log.error("Error closing stream:" + file.getID(), ioe);
                }
            }
        }
        return contentType;
    }

    /**
     * Retrieves the bitstream content. If retrieval fails, the error is logged and
     * a stream containing the error text is returned instead of propagating the failure.
     */
    @Override
    public InputStream getStream() throws IOException {
        try {
            return bitstreamService.retrieve(context, file);
        } catch (Exception e) {
            log.error(e.getMessage(),e);
            // Throwable.getMessage() may be null (e.g. a bare NullPointerException);
            // fall back to the exception class name so this handler cannot itself throw.
            String text = e.getMessage() != null ? e.getMessage() : e.getClass().getName();
            return new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8));
        }
    }
}

View File

@@ -0,0 +1,197 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.discovery;
import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.apache.solr.common.util.ContentStreamBase;
import org.dspace.authorize.AuthorizeException;
import org.dspace.content.Bitstream;
import org.dspace.content.BitstreamFormat;
import org.dspace.content.Bundle;
import org.dspace.content.Item;
import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.BitstreamService;
import org.dspace.core.Context;
import javax.annotation.Nullable;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.sql.SQLException;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import static org.dspace.core.Utils.emptyIfNull;
/**
 * A single Solr <code>ContentStream</code> that exposes the content of all bitstreams
 * in the "TEXT" bundles of an item as one concatenated stream.
 * The individual bitstreams are only opened when the stream is actually read.
 */
public class FullTextContentStreams extends ContentStreamBase
{
    private static final Logger log = Logger.getLogger(FullTextContentStreams.class);

    public static final String FULLTEXT_BUNDLE = "TEXT";

    protected final Context context;
    protected final List<FullTextBitstream> fullTextStreams;
    protected BitstreamService bitstreamService;

    /**
     * @param context the DSpace context used when retrieving bitstream content
     * @param parentItem the item whose full text bitstreams should be collected
     * @throws SQLException kept in the signature for callers; not thrown by the visible code
     */
    public FullTextContentStreams(Context context, Item parentItem) throws SQLException {
        this.context = context;
        fullTextStreams = new LinkedList<>();
        sourceInfo = parentItem.getHandle();
        bitstreamService = ContentServiceFactory.getInstance().getBitstreamService();

        //extracted full text is always extracted as plain text
        contentType = "text/plain";

        buildFullTextList(parentItem);
    }

    /**
     * Collects every bitstream of every bundle named {@link #FULLTEXT_BUNDLE}.
     */
    private void buildFullTextList(Item parentItem) {
        // now get full text of any bitstreams in the TEXT bundle
        // trundle through the bundles
        List<Bundle> myBundles = parentItem.getBundles();

        for (Bundle myBundle : emptyIfNull(myBundles)) {
            if (StringUtils.equals(FULLTEXT_BUNDLE, myBundle.getName())) {
                // a-ha! grab the text out of the bitstreams
                List<Bitstream> bitstreams = myBundle.getBitstreams();

                for (Bitstream fulltextBitstream : emptyIfNull(bitstreams)) {
                    fullTextStreams.add(new FullTextBitstream(sourceInfo, fulltextBitstream));

                    log.debug("Added BitStream: "
                            + fulltextBitstream.getStoreNumber() + " "
                            + fulltextBitstream.getSequenceID() + " "
                            + fulltextBitstream.getName());
                }
            }
        }
    }

    /**
     * @return the file names of all collected bitstreams, joined with ";"
     */
    @Override
    public String getName() {
        return StringUtils.join(Iterables.transform(fullTextStreams, new Function<FullTextBitstream, String>() {
            @Nullable
            @Override
            public String apply(@Nullable FullTextBitstream input) {
                return input == null ? "" : input.getFileName();
            }
        }), ";");
    }

    /**
     * @return the sum of the sizes of all collected bitstreams, 0 when there are none
     */
    @Override
    public Long getSize() {
        long result = 0;

        for (FullTextBitstream fullText : fullTextStreams) {
            if (fullText != null) {
                result += fullText.getSize();
            }
        }

        return result;
    }

    @Override
    public Reader getReader() throws IOException {
        return super.getReader();
    }

    /**
     * Returns one stream that reads all collected bitstreams back to back.
     * If the stream cannot be built, the error is logged and a stream containing
     * the error text is returned instead.
     */
    @Override
    public InputStream getStream() throws IOException {
        try {
            return new SequenceInputStream(new FullTextEnumeration(fullTextStreams.iterator()));
        } catch (Exception e) {
            log.error("Unable to add full text bitstreams to SOLR for item " + sourceInfo + ": " + e.getMessage(), e);
            return errorStream(e);
        }
    }

    /**
     * @return true when no full text bitstream was found for the item
     */
    public boolean isEmpty() {
        return CollectionUtils.isEmpty(fullTextStreams);
    }

    /**
     * Builds the fallback stream used when a bitstream cannot be read.
     * Throwable.getMessage() may be null (e.g. NullPointerException or
     * NoSuchElementException), so fall back to the exception class name
     * rather than failing inside the error handler.
     */
    private static InputStream errorStream(Exception e) {
        String text = e.getMessage() != null ? e.getMessage() : e.getClass().getName();
        return new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8));
    }

    /**
     * Pairs a full text bitstream with the handle of the item it belongs to.
     * Non-static on purpose: content retrieval uses the enclosing instance's
     * context and bitstream service.
     */
    private class FullTextBitstream {
        private final String itemHandle;
        private final Bitstream bitstream;

        public FullTextBitstream(final String parentHandle, final Bitstream file) {
            this.itemHandle = parentHandle;
            this.bitstream = file;
        }

        public String getContentType(final Context context) throws SQLException {
            BitstreamFormat format = bitstream.getFormat(context);
            return format == null ? null : StringUtils.trimToEmpty(format.getMIMEType());
        }

        public String getFileName() {
            return StringUtils.trimToEmpty(bitstream.getName());
        }

        public long getSize() {
            return bitstream.getSize();
        }

        public InputStream getInputStream() throws SQLException, IOException, AuthorizeException {
            return bitstreamService.retrieve(context, bitstream);
        }

        public String getItemHandle() {
            return itemHandle;
        }
    }

    /**
     * Adapts the list of full text bitstreams to the Enumeration of streams that
     * SequenceInputStream consumes; each bitstream is opened when it is requested.
     */
    private class FullTextEnumeration implements Enumeration<InputStream> {

        private final Iterator<FullTextBitstream> fulltextIterator;

        public FullTextEnumeration(final Iterator<FullTextBitstream> fulltextStreams) {
            this.fulltextIterator = fulltextStreams;
        }

        public boolean hasMoreElements() {
            return fulltextIterator.hasNext();
        }

        /**
         * Opens the next bitstream. A failure is logged and replaced by a stream
         * containing the error text, so one unreadable bitstream does not abort
         * reading the remaining ones.
         */
        public InputStream nextElement() {
            InputStream inputStream;
            FullTextBitstream bitstream = null;

            try {
                bitstream = fulltextIterator.next();
                inputStream = bitstream.getInputStream();
            } catch (Exception e) {
                log.warn("Unable to add full text bitstream " + (bitstream == null ? "NULL" :
                        bitstream.getFileName() + " for item " + bitstream.getItemHandle())
                        + " to SOLR:" + e.getMessage(), e);

                inputStream = errorStream(e);
            }

            return inputStream;
        }
    }
}

View File

@@ -770,19 +770,15 @@ public class SolrServiceImpl implements SearchService, IndexingService {
* @throws IOException
* A general class of exceptions produced by failed or interrupted I/O operations.
*/
protected void writeDocument(SolrInputDocument doc, List<BitstreamContentStream> streams) throws IOException {
protected void writeDocument(SolrInputDocument doc, FullTextContentStreams streams) throws IOException {
try {
if (getSolr() != null)
{
if (CollectionUtils.isNotEmpty(streams))
if (streams != null && !streams.isEmpty())
{
ContentStreamUpdateRequest req = new ContentStreamUpdateRequest("/update/extract");
for(BitstreamContentStream bce : streams)
{
req.addContentStream(bce);
}
req.addContentStream(streams);
ModifiableSolrParams params = new ModifiableSolrParams();
@@ -1421,47 +1417,7 @@ public class SolrServiceImpl implements SearchService, IndexingService {
log.debug(" Added Grouping");
List<BitstreamContentStream> streams = new ArrayList<BitstreamContentStream>();
try {
// now get full text of any bitstreams in the TEXT bundle
// trundle through the bundles
List<Bundle> myBundles = item.getBundles();
for (Bundle myBundle : myBundles)
{
if ((myBundle.getName() != null)
&& myBundle.getName().equals("TEXT"))
{
// a-ha! grab the text out of the bitstreams
List<Bitstream> bitstreams = myBundle.getBitstreams();
for (Bitstream myBitstream : bitstreams)
{
try {
streams.add(new BitstreamContentStream(context, myBitstream));
log.debug(" Added BitStream: "
+ myBitstream.getStoreNumber() + " "
+ myBitstream.getSequenceID() + " "
+ myBitstream.getName());
} catch (Exception e)
{
// this will never happen, but compiler is now
// happy.
log.trace(e.getMessage(), e);
}
}
}
}
} catch (RuntimeException e)
{
log.error(e.getMessage(), e);
}
FullTextContentStreams textContentStreams = new FullTextContentStreams(context, item);
//Do any additional indexing, depends on the plugins
List<SolrServiceIndexPlugin> solrServiceIndexPlugins = DSpaceServicesFactory.getInstance().getServiceManager().getServicesByType(SolrServiceIndexPlugin.class);
@@ -1472,7 +1428,7 @@ public class SolrServiceImpl implements SearchService, IndexingService {
// write the index and close the inputstreamreaders
try {
writeDocument(doc, streams);
writeDocument(doc, textContentStreams);
log.info("Wrote Item: " + handle + " to Index");
} catch (RuntimeException e)
{