mirror of
https://github.com/DSpace/DSpace.git
synced 2025-10-17 15:03:18 +00:00

Contribution from CILEA, funded by the Hub project from HKU (http://hub.hku.hk). Faceting, filtering (autocomplete), and sidebar facets for the site home page, communities and collections are all implemented. Changes to the Discovery API/configuration: 1) changed the unique field for the SOLR document; it is now the concatenation of ID and TYPE-ID (in the future we also want to index objects that do not have a handle) 2) the prune query has been changed to search.resourcetype:[2 TO 4] so as not to remove any extra data loaded in the SOLR search core 3) added the defaultRpp parameter. Main differences from the XMLUI implementation: 1) facets don't have a "...More" link; instead there is pagination to scroll through the facets in context (search, home page, community, etc.) 2) facets don't show the values already selected 3) autocomplete is done against user input and does not dump all the values (this was a performance issue in XMLUI < 3.0; with 90k items I see JSON of around 2 MB). With the new Discovery improvements the autocomplete feature in XMLUI seems to be turned off 4) to enable JSPUI discovery you need to add some extra plugins in dspace.cfg (instructions are provided as comments in discovery.cfg)
497 lines
16 KiB
Java
497 lines
16 KiB
Java
/**
|
|
* The contents of this file are subject to the license and copyright
|
|
* detailed in the LICENSE and NOTICE files at the root of the source
|
|
* tree and available online at
|
|
*
|
|
* http://www.dspace.org/license/
|
|
*/
|
|
package org.dspace.search;
|
|
|
|
import java.io.File;
|
|
import java.io.IOException;
|
|
import java.util.ArrayList;
|
|
import java.util.Date;
|
|
import java.util.Iterator;
|
|
import java.util.List;
|
|
|
|
import org.apache.log4j.Logger;
|
|
import org.apache.lucene.document.Document;
|
|
import org.apache.lucene.index.IndexReader;
|
|
import org.apache.lucene.queryParser.ParseException;
|
|
import org.apache.lucene.queryParser.QueryParser;
|
|
import org.apache.lucene.queryParser.TokenMgrError;
|
|
import org.apache.lucene.search.BooleanQuery;
|
|
import org.apache.lucene.search.IndexSearcher;
|
|
import org.apache.lucene.search.Query;
|
|
import org.apache.lucene.search.Sort;
|
|
import org.apache.lucene.search.SortField;
|
|
import org.apache.lucene.search.TopDocs;
|
|
import org.apache.lucene.store.Directory;
|
|
import org.apache.lucene.store.FSDirectory;
|
|
import org.apache.lucene.util.Version;
|
|
import org.dspace.content.Collection;
|
|
import org.dspace.content.Community;
|
|
import org.dspace.core.ConfigurationManager;
|
|
import org.dspace.core.Constants;
|
|
import org.dspace.core.Context;
|
|
import org.dspace.core.LogManager;
|
|
import org.dspace.sort.SortOption;
|
|
|
|
// issues
|
|
// need to filter query string for security
|
|
// cmd line query needs to process args correctly (seems to split them up)
|
|
/**
|
|
* DSQuery contains various static methods for performing queries on indices,
|
|
* for collections and communities.
|
|
*
|
|
*/
|
|
public class DSQuery
|
|
{
|
|
// Result types
|
|
static final String ALL = "999";
|
|
|
|
static final String ITEM = "" + Constants.ITEM;
|
|
|
|
static final String COLLECTION = "" + Constants.COLLECTION;
|
|
|
|
static final String COMMUNITY = "" + Constants.COMMUNITY;
|
|
|
|
// cache a Lucene IndexSearcher for more efficient searches
|
|
private static IndexSearcher searcher = null;
|
|
|
|
private static String indexDir = null;
|
|
|
|
private static String operator = null;
|
|
|
|
private static long lastModified;
|
|
|
|
/** log4j logger */
|
|
private static Logger log = Logger.getLogger(DSQuery.class);
|
|
|
|
|
|
// Reads the search configuration once, when the class is loaded.
static
{
    // Optional override of Lucene's maximum boolean clause count.
    String maxClauses = ConfigurationManager.getProperty("search.max-clauses");
    if (maxClauses != null && maxClauses.trim().length() > 0)
    {
        try
        {
            BooleanQuery.setMaxClauseCount(Integer.parseInt(maxClauses.trim()));
        }
        catch (IllegalArgumentException e)
        {
            // Covers NumberFormatException (non-numeric value) as well as an
            // out-of-range count rejected by Lucene. An exception escaping a
            // static initializer would make the whole class unusable, so keep
            // Lucene's default and report the bad setting instead.
            log.warn("Ignoring invalid search.max-clauses value: " + maxClauses, e);
        }
    }

    indexDir = ConfigurationManager.getProperty("search.dir");

    operator = ConfigurationManager.getProperty("search.operator");
}
|
|
|
|
/**
|
|
* Do a query, returning a QueryResults object
|
|
*
|
|
* @param c context
|
|
* @param args query arguments in QueryArgs object
|
|
*
|
|
* @return query results QueryResults
|
|
*/
|
|
public static QueryResults doQuery(Context c, QueryArgs args)
|
|
throws IOException
|
|
{
|
|
String querystring = args.getQuery();
|
|
QueryResults qr = new QueryResults();
|
|
List<String> hitHandles = new ArrayList<String>();
|
|
List<Integer> hitIds = new ArrayList<Integer>();
|
|
List<Integer> hitTypes = new ArrayList<Integer>();
|
|
|
|
// set up the QueryResults object
|
|
qr.setHitHandles(hitHandles);
|
|
qr.setHitIds(hitIds);
|
|
qr.setHitTypes(hitTypes);
|
|
qr.setStart(args.getStart());
|
|
qr.setPageSize(args.getPageSize());
|
|
qr.setEtAl(args.getEtAl());
|
|
|
|
// massage the query string a bit
|
|
querystring = checkEmptyQuery(querystring); // change nulls to an empty string
|
|
// We no longer need to work around the Lucene bug with recent versions
|
|
//querystring = workAroundLuceneBug(querystring); // logicals changed to && ||, etc.
|
|
querystring = stripHandles(querystring); // remove handles from query string
|
|
querystring = stripAsterisk(querystring); // remove asterisk from beginning of string
|
|
|
|
try
|
|
{
|
|
// calculate execution time
|
|
Date startTime = new Date();
|
|
|
|
// grab a searcher, and do the search
|
|
IndexSearcher searcher = getSearcher(c);
|
|
|
|
QueryParser qp = new QueryParser(Version.LUCENE_33, "default", DSIndexer.getAnalyzer());
|
|
log.debug("Final query string: " + querystring);
|
|
|
|
if (operator == null || operator.equals("OR"))
|
|
{
|
|
qp.setDefaultOperator(QueryParser.OR_OPERATOR);
|
|
}
|
|
else
|
|
{
|
|
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
|
|
}
|
|
|
|
Query myquery = qp.parse(querystring);
|
|
//Retrieve enough docs to get all the results we need !
|
|
TopDocs hits = performQuery(args, searcher, myquery, args.getPageSize() * (args.getStart() + 1));
|
|
|
|
Date endTime = new Date();
|
|
|
|
qr.setQueryTime(endTime.getTime() - startTime.getTime());
|
|
|
|
// set total number of hits
|
|
qr.setHitCount(hits.totalHits);
|
|
|
|
// We now have a bunch of hits - snip out a 'window'
|
|
// defined in start, count and return the handles
|
|
// from that window
|
|
// first, are there enough hits?
|
|
if (args.getStart() < hits.totalHits)
|
|
{
|
|
// get as many as we can, up to the window size
|
|
// how many are available after snipping off at offset 'start'?
|
|
int hitsRemaining = hits.totalHits - args.getStart();
|
|
|
|
int hitsToProcess = (hitsRemaining < args.getPageSize()) ? hitsRemaining
|
|
: args.getPageSize();
|
|
|
|
for (int i = args.getStart(); i < (args.getStart() + hitsToProcess); i++)
|
|
{
|
|
Document d = searcher.doc(hits.scoreDocs[i].doc);
|
|
|
|
String resourceId = d.get("search.resourceid");
|
|
String resourceType = d.get("search.resourcetype");
|
|
|
|
String handleText = d.get("handle");
|
|
String handleType = d.get("type");
|
|
|
|
switch (Integer.parseInt( resourceType != null ? resourceType : handleType))
|
|
{
|
|
case Constants.ITEM:
|
|
hitTypes.add(Constants.ITEM);
|
|
break;
|
|
|
|
case Constants.COLLECTION:
|
|
hitTypes.add(Constants.COLLECTION);
|
|
break;
|
|
|
|
case Constants.COMMUNITY:
|
|
hitTypes.add(Constants.COMMUNITY);
|
|
break;
|
|
}
|
|
|
|
hitHandles.add( handleText );
|
|
hitIds.add( resourceId == null ? null: Integer.parseInt(resourceId) );
|
|
}
|
|
}
|
|
}
|
|
catch (NumberFormatException e)
|
|
{
|
|
log.warn(LogManager.getHeader(c, "Number format exception", "" + e));
|
|
qr.setErrorMsg("number-format-exception");
|
|
}
|
|
catch (ParseException e)
|
|
{
|
|
// a parse exception - log and return null results
|
|
log.warn(LogManager.getHeader(c, "Invalid search string", "" + e));
|
|
qr.setErrorMsg("invalid-search-string");
|
|
}
|
|
catch (TokenMgrError tme)
|
|
{
|
|
// Similar to parse exception
|
|
log.warn(LogManager.getHeader(c, "Invalid search string", "" + tme));
|
|
qr.setErrorMsg("invalid-search-string");
|
|
}
|
|
catch(BooleanQuery.TooManyClauses e)
|
|
{
|
|
log.warn(LogManager.getHeader(c, "Query too broad", e.toString()));
|
|
qr.setErrorMsg("query-too-broad");
|
|
}
|
|
|
|
return qr;
|
|
}
|
|
|
|
private static TopDocs performQuery(QueryArgs args, IndexSearcher searcher, Query myquery, int max) throws IOException {
|
|
TopDocs hits;
|
|
try
|
|
{
|
|
if (args.getSortOption() == null)
|
|
{
|
|
SortField[] sortFields = new SortField[] {
|
|
new SortField("search.resourcetype", SortField.INT, true),
|
|
new SortField(null, SortField.SCORE, SortOption.ASCENDING.equals(args.getSortOrder()))
|
|
};
|
|
hits = searcher.search(myquery, max, new Sort(sortFields));
|
|
}
|
|
else
|
|
{
|
|
SortField[] sortFields = new SortField[] {
|
|
new SortField("search.resourcetype", SortField.INT, true),
|
|
new SortField("sort_" + args.getSortOption().getName(), SortField.STRING, SortOption.DESCENDING.equals(args.getSortOrder())),
|
|
SortField.FIELD_SCORE
|
|
};
|
|
hits = searcher.search(myquery, max, new Sort(sortFields));
|
|
}
|
|
}
|
|
catch (Exception e)
|
|
{
|
|
// Lucene can throw an exception if it is unable to determine a sort time from the specified field
|
|
// Provide a fall back that just works on relevancy.
|
|
log.error("Unable to use speficied sort option: " + (args.getSortOption() == null ? "type/relevance": args.getSortOption().getName()));
|
|
hits = searcher.search(myquery, max, new Sort(SortField.FIELD_SCORE));
|
|
}
|
|
return hits;
|
|
}
|
|
|
|
static String checkEmptyQuery(String myquery)
|
|
{
|
|
if (myquery == null || myquery.equals("()") || myquery.equals(""))
|
|
{
|
|
myquery = "empty_query_string";
|
|
}
|
|
|
|
return myquery;
|
|
}
|
|
|
|
/**
|
|
* Workaround Lucene bug that breaks wildcard searching.
|
|
* This is no longer required with Lucene upgrades.
|
|
*
|
|
* @param myquery
|
|
* @return
|
|
* @deprecated
|
|
*/
|
|
static String workAroundLuceneBug(String myquery)
|
|
{
|
|
// Lucene currently has a bug which breaks wildcard
|
|
// searching when you have uppercase characters.
|
|
// Here we substitute the boolean operators -- which
|
|
// have to be uppercase -- before transforming the
|
|
// query string to lowercase.
|
|
return myquery.replaceAll(" AND ", " && ")
|
|
.replaceAll(" OR ", " || ")
|
|
.replaceAll(" NOT ", " ! ")
|
|
.toLowerCase();
|
|
}
|
|
|
|
static String stripHandles(String myquery)
|
|
{
|
|
// Drop beginning pieces of full handle strings
|
|
return myquery.replaceAll("^\\s*http://hdl\\.handle\\.net/", "")
|
|
.replaceAll("^\\s*hdl:", "");
|
|
}
|
|
|
|
static String stripAsterisk(String myquery)
|
|
{
|
|
// query strings (or words) beginning with "*" cause a null pointer error
|
|
return myquery.replaceAll("^\\*", "")
|
|
.replaceAll("\\s\\*", " ")
|
|
.replaceAll("\\(\\*", "(")
|
|
.replaceAll(":\\*", ":");
|
|
}
|
|
|
|
/**
|
|
* Do a query, restricted to a collection
|
|
*
|
|
* @param c
|
|
* context
|
|
* @param args
|
|
* query args
|
|
* @param coll
|
|
* collection to restrict to
|
|
*
|
|
* @return QueryResults same results as doQuery, restricted to a collection
|
|
*/
|
|
public static QueryResults doQuery(Context c, QueryArgs args,
|
|
Collection coll) throws IOException
|
|
{
|
|
String querystring = args.getQuery();
|
|
|
|
querystring = checkEmptyQuery(querystring);
|
|
|
|
String location = "l" + (coll.getID());
|
|
|
|
String newquery = "+(" + querystring + ") +location:\"" + location + "\"";
|
|
|
|
args.setQuery(newquery);
|
|
|
|
return doQuery(c, args);
|
|
}
|
|
|
|
/**
|
|
* Do a query, restricted to a community
|
|
*
|
|
* @param c
|
|
* context
|
|
* @param args
|
|
* query args
|
|
* @param comm
|
|
* community to restrict to
|
|
*
|
|
* @return QueryResults same results as doQuery, restricted to a collection
|
|
*/
|
|
public static QueryResults doQuery(Context c, QueryArgs args, Community comm)
|
|
throws IOException
|
|
{
|
|
String querystring = args.getQuery();
|
|
|
|
querystring = checkEmptyQuery(querystring);
|
|
|
|
String location = "m" + (comm.getID());
|
|
|
|
String newquery = "+(" + querystring + ") +location:\"" + location + "\"";
|
|
|
|
args.setQuery(newquery);
|
|
|
|
return doQuery(c, args);
|
|
}
|
|
|
|
|
|
/**
|
|
* Do a query, printing results to stdout largely for testing, but it is
|
|
* useful
|
|
*/
|
|
public static void doCMDLineQuery(String query)
|
|
{
|
|
System.out.println("Command line query: " + query);
|
|
System.out.println("Only reporting default-sized results list");
|
|
|
|
try
|
|
{
|
|
Context c = new Context();
|
|
|
|
QueryArgs args = new QueryArgs();
|
|
args.setQuery(query);
|
|
|
|
QueryResults results = doQuery(c, args);
|
|
|
|
Iterator i = results.getHitHandles().iterator();
|
|
Iterator j = results.getHitTypes().iterator();
|
|
|
|
while (i.hasNext())
|
|
{
|
|
String thisHandle = (String) i.next();
|
|
Integer thisType = (Integer) j.next();
|
|
String type = Constants.typeText[thisType];
|
|
|
|
// also look up type
|
|
System.out.println(type + "\t" + thisHandle);
|
|
}
|
|
}
|
|
catch (Exception e)
|
|
{
|
|
System.out.println("Exception caught: " + e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Close any IndexSearcher that is currently open.
|
|
*/
|
|
public static synchronized void close()
|
|
{
|
|
if (searcher != null)
|
|
{
|
|
try
|
|
{
|
|
searcher.close();
|
|
searcher = null;
|
|
}
|
|
catch (IOException ioe)
|
|
{
|
|
log.error("DSQuery: Unable to close open IndexSearcher", ioe);
|
|
}
|
|
}
|
|
}
|
|
|
|
public static void main(String[] args)
|
|
{
|
|
if (args.length > 0)
|
|
{
|
|
DSQuery.doCMDLineQuery(args[0]);
|
|
}
|
|
}
|
|
|
|
/*--------- protected methods ----------*/
|
|
|
|
/**
|
|
* get an IndexReader.
|
|
* @throws IOException
|
|
*/
|
|
protected static IndexReader getIndexReader()
|
|
throws IOException
|
|
{
|
|
return getSearcher(null).getIndexReader();
|
|
}
|
|
|
|
/**
|
|
* get an IndexSearcher, hopefully a cached one (gives much better
|
|
* performance.) checks to see if the index has been modified - if so, it
|
|
* creates a new IndexSearcher
|
|
*/
|
|
protected static synchronized IndexSearcher getSearcher(Context c)
|
|
throws IOException
|
|
{
|
|
|
|
// If we have already opened a searcher, check to see if the index has been updated
|
|
// If it has, we need to close the existing searcher - we will open a new one later
|
|
|
|
Directory searchDir = FSDirectory.open(new File(indexDir));
|
|
|
|
if (searcher != null && lastModified != IndexReader.getCurrentVersion(searchDir))
|
|
{
|
|
try
|
|
{
|
|
// Close the cached IndexSearcher
|
|
searcher.close();
|
|
}
|
|
catch (IOException ioe)
|
|
{
|
|
// Index is probably corrupt. Log the error, but continue to either:
|
|
// 1) Return existing searcher (may yet throw exception, no worse than throwing here)
|
|
log.warn("DSQuery: Unable to check for updated index", ioe);
|
|
}
|
|
finally
|
|
{
|
|
searcher = null;
|
|
}
|
|
}
|
|
|
|
// There is no existing searcher - either this is the first execution,
|
|
// or the index has been updated and we closed the old index.
|
|
if (searcher == null)
|
|
{
|
|
// So, open a new searcher
|
|
lastModified = IndexReader.getCurrentVersion(searchDir);
|
|
String osName = System.getProperty("os.name");
|
|
if (osName != null && osName.toLowerCase().contains("windows"))
|
|
{
|
|
searcher = new IndexSearcher(searchDir){
|
|
/*
|
|
* TODO: Has Lucene fixed this bug yet?
|
|
* Lucene doesn't release read locks in
|
|
* windows properly on finalize. Our hack
|
|
* extend IndexSearcher to force close().
|
|
*/
|
|
@Override
|
|
protected void finalize() throws Throwable {
|
|
this.close();
|
|
super.finalize();
|
|
}
|
|
};
|
|
}
|
|
else
|
|
{
|
|
searcher = new IndexSearcher(searchDir);
|
|
}
|
|
}
|
|
|
|
return searcher;
|
|
}
|
|
}
|
|
|
|
// it's now up to the display page to do the right thing displaying
|
|
// items & communities & collections
|