mirror of
https://github.com/DSpace/DSpace.git
synced 2025-10-16 06:23:10 +00:00
DS-2952: Use a SequenceInputStream to add the content of multiple full text bitstreams to SOLR
This commit is contained in:
@@ -0,0 +1,197 @@
|
||||
/**
|
||||
* The contents of this file are subject to the license and copyright
|
||||
* detailed in the LICENSE and NOTICE files at the root of the source
|
||||
* tree and available online at
|
||||
*
|
||||
* http://www.dspace.org/license/
|
||||
*/
|
||||
package org.dspace.discovery;
|
||||
|
||||
import com.google.common.base.Function;
|
||||
import com.google.common.collect.Iterables;
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.solr.common.util.ContentStreamBase;
|
||||
import org.dspace.authorize.AuthorizeException;
|
||||
import org.dspace.content.Bitstream;
|
||||
import org.dspace.content.BitstreamFormat;
|
||||
import org.dspace.content.Bundle;
|
||||
import org.dspace.content.Item;
|
||||
import org.dspace.content.factory.ContentServiceFactory;
|
||||
import org.dspace.content.service.BitstreamService;
|
||||
import org.dspace.core.Context;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.io.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.sql.SQLException;
|
||||
import java.util.Enumeration;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import static org.dspace.core.Utils.emptyIfNull;
|
||||
|
||||
/**
|
||||
* Construct a <code>ContentStream</code> from a <code>File</code>
|
||||
*/
|
||||
public class FullTextContentStreams extends ContentStreamBase
|
||||
{
|
||||
private static final Logger log = Logger.getLogger(FullTextContentStreams.class);
|
||||
|
||||
public static final String FULLTEXT_BUNDLE = "TEXT";
|
||||
|
||||
protected final Context context;
|
||||
protected final List<FullTextBitstream> fullTextStreams;
|
||||
protected BitstreamService bitstreamService;
|
||||
|
||||
public FullTextContentStreams(Context context, Item parentItem) throws SQLException {
|
||||
this.context = context;
|
||||
fullTextStreams = new LinkedList<>();
|
||||
sourceInfo = parentItem.getHandle();
|
||||
bitstreamService = ContentServiceFactory.getInstance().getBitstreamService();
|
||||
|
||||
//extracted full text is always extracted as plain text
|
||||
contentType = "text/plain";
|
||||
|
||||
buildFullTextList(parentItem);
|
||||
}
|
||||
|
||||
private void buildFullTextList(Item parentItem) {
|
||||
// now get full text of any bitstreams in the TEXT bundle
|
||||
// trundle through the bundles
|
||||
List<Bundle> myBundles = parentItem.getBundles();
|
||||
|
||||
for (Bundle myBundle : emptyIfNull(myBundles)) {
|
||||
if (StringUtils.equals(FULLTEXT_BUNDLE, myBundle.getName())) {
|
||||
// a-ha! grab the text out of the bitstreams
|
||||
List<Bitstream> bitstreams = myBundle.getBitstreams();
|
||||
|
||||
for (Bitstream fulltextBitstream : emptyIfNull(bitstreams)) {
|
||||
fullTextStreams.add(new FullTextBitstream(sourceInfo, fulltextBitstream));
|
||||
|
||||
log.debug("Added BitStream: "
|
||||
+ fulltextBitstream.getStoreNumber() + " "
|
||||
+ fulltextBitstream.getSequenceID() + " "
|
||||
+ fulltextBitstream.getName());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return StringUtils.join(Iterables.transform(fullTextStreams, new Function<FullTextBitstream, String>() {
|
||||
@Nullable
|
||||
@Override
|
||||
public String apply(@Nullable FullTextBitstream input) {
|
||||
return input == null ? "" : input.getFileName();
|
||||
}
|
||||
}), ";");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long getSize() {
|
||||
long result = 0;
|
||||
|
||||
if(CollectionUtils.isNotEmpty(fullTextStreams)) {
|
||||
Iterable<Long> individualSizes = Iterables.transform(fullTextStreams, new Function<FullTextBitstream, Long>() {
|
||||
@Nullable
|
||||
@Override
|
||||
public Long apply(@Nullable FullTextBitstream input) {
|
||||
return input == null ? 0L : input.getSize();
|
||||
}
|
||||
});
|
||||
|
||||
for (Long size : individualSizes) {
|
||||
result += size;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Reader getReader() throws IOException {
|
||||
return super.getReader();
|
||||
}
|
||||
|
||||
@Override
|
||||
public InputStream getStream() throws IOException {
|
||||
try {
|
||||
return new SequenceInputStream(new FullTextEnumeration(fullTextStreams.iterator()));
|
||||
} catch (Exception e) {
|
||||
log.error("Unable to add full text bitstreams to SOLR for item " + sourceInfo + ": " + e.getMessage(), e);
|
||||
return new ByteArrayInputStream(e.getMessage().getBytes(StandardCharsets.UTF_8));
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return CollectionUtils.isEmpty(fullTextStreams);
|
||||
}
|
||||
|
||||
private class FullTextBitstream {
|
||||
private String itemHandle;
|
||||
private Bitstream bitstream;
|
||||
|
||||
public FullTextBitstream(final String parentHandle, final Bitstream file) {
|
||||
this.itemHandle = parentHandle;
|
||||
this.bitstream = file;
|
||||
}
|
||||
|
||||
public String getContentType(final Context context) throws SQLException {
|
||||
BitstreamFormat format = bitstream.getFormat(context);
|
||||
return format == null ? null : StringUtils.trimToEmpty(format.getMIMEType());
|
||||
}
|
||||
|
||||
public String getFileName() {
|
||||
return StringUtils.trimToEmpty(bitstream.getName());
|
||||
}
|
||||
|
||||
public long getSize() {
|
||||
return bitstream.getSize();
|
||||
}
|
||||
|
||||
public InputStream getInputStream() throws SQLException, IOException, AuthorizeException {
|
||||
return bitstreamService.retrieve(context, bitstream);
|
||||
}
|
||||
|
||||
public String getItemHandle() {
|
||||
return itemHandle;
|
||||
}
|
||||
}
|
||||
|
||||
private class FullTextEnumeration implements Enumeration<InputStream> {
|
||||
|
||||
private final Iterator<FullTextBitstream> fulltextIterator;
|
||||
|
||||
public FullTextEnumeration(final Iterator<FullTextBitstream> fulltextStreams) {
|
||||
this.fulltextIterator = fulltextStreams;
|
||||
}
|
||||
|
||||
public boolean hasMoreElements() {
|
||||
return fulltextIterator.hasNext();
|
||||
}
|
||||
|
||||
public InputStream nextElement() {
|
||||
InputStream inputStream;
|
||||
|
||||
FullTextBitstream bitstream = null;
|
||||
try {
|
||||
bitstream = fulltextIterator.next();
|
||||
inputStream = bitstream.getInputStream();
|
||||
} catch (Exception e) {
|
||||
log.warn("Unable to add full text bitstream " + (bitstream == null ? "NULL" :
|
||||
bitstream.getFileName() + " for item " + bitstream.getItemHandle())
|
||||
+ " to SOLR:" + e.getMessage(), e);
|
||||
|
||||
inputStream = new ByteArrayInputStream(e.getMessage().getBytes(StandardCharsets.UTF_8));
|
||||
}
|
||||
|
||||
return inputStream;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
Reference in New Issue
Block a user