[DS-980] Upgraded solr & lucene to version 3.3.0

git-svn-id: http://scm.dspace.org/svn/repo/dspace/trunk@6545 9c30dcfa-912a-0410-8fc2-9e0234be79fd
Author: Tim Donohue
Date:   2011-08-12 19:49:34 +00:00
parent cda73317df
commit e40a2c9441
9 changed files with 1016 additions and 1028 deletions

File: org/dspace/search/DSAnalyzer.java

@@ -16,6 +16,7 @@ import org.apache.lucene.analysis.PorterStemFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.util.Version;
 import org.dspace.core.ConfigurationManager;
 
 /**
@@ -47,7 +48,7 @@ public class DSAnalyzer extends Analyzer
     /*
      * Stop table
      */
-    protected static final Set stopSet = StopFilter.makeStopSet(STOP_WORDS);
+    protected static final Set stopSet = StopFilter.makeStopSet(Version.LUCENE_33,STOP_WORDS);
 
     /*
      * Create a token stream for this analyzer.
@@ -59,7 +60,7 @@ public class DSAnalyzer extends Analyzer
         result = new StandardFilter(result);
         result = new LowerCaseFilter(result);
-        result = new StopFilter(result, stopSet);
+        result = new StopFilter(Version.LUCENE_33, result, stopSet);
         result = new PorterStemFilter(result);
 
         return result;
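This hunk is the core Lucene 3.x analysis migration: StopFilter (like most 3.x analysis components) now takes a Version constant so tokenization behavior stays pinned across library upgrades. A minimal self-contained sketch of the same pattern outside DSpace (the class name and stop words are illustrative, not part of this commit):

    import java.io.Reader;
    import java.util.Set;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.StopFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.util.Version;

    public class VersionedAnalyzer extends Analyzer {
        // makeStopSet now takes the match version as its first argument
        private static final Set<?> STOP_SET =
                StopFilter.makeStopSet(Version.LUCENE_33, "a", "an", "the");

        @Override
        public TokenStream tokenStream(String fieldName, Reader reader) {
            TokenStream result = new StandardTokenizer(Version.LUCENE_33, reader);
            result = new LowerCaseFilter(result);
            // StopFilter likewise gains a leading Version parameter in 3.x
            result = new StopFilter(Version.LUCENE_33, result, STOP_SET);
            return result;
        }
    }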

File: org/dspace/search/DSIndexer.java

@@ -39,8 +39,12 @@ import org.apache.lucene.document.Field;
 import org.apache.lucene.document.DateTools;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.Version;
 import org.dspace.content.Bitstream;
 import org.dspace.content.Bundle;
 import org.dspace.content.Collection;
@@ -208,21 +212,22 @@ public class DSIndexer
         /*
          * Create the index directory if it doesn't already exist.
         */
-        if (!IndexReader.indexExists(indexDirectory))
-        {
-            try
-            {
-                if (!new File(indexDirectory).mkdirs())
-                {
-                    log.error("Unable to create index directory: " + indexDirectory);
-                }
-                openIndex(true).close();
-            }
-            catch (IOException e)
-            {
-                throw new IllegalStateException("Could not create search index: " + e.getMessage(),e);
-            }
-        }
+        try
+        {
+            if (!IndexReader.indexExists(FSDirectory.open(new File(indexDirectory))))
+            {
+                if (!new File(indexDirectory).mkdirs())
+                {
+                    log.error("Unable to create index directory: " + indexDirectory);
+                }
+                openIndex(true).close();
+            }
+        }
+        catch (IOException e)
+        {
+            throw new IllegalStateException("Could not create search index: " + e.getMessage(),e);
+        }
     }
 
     public static void setBatchProcessingMode(boolean mode)
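The restructured block above follows from an API change: in Lucene 3.x, IndexReader.indexExists takes a Directory rather than a path String and declares IOException, so the existence check itself must sit inside the try. A hedged standalone sketch of the check (class, method, and path handling are illustrative):

    import java.io.File;
    import java.io.IOException;

    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public final class IndexCheck {
        // Returns true if a Lucene index already exists at the given path
        public static boolean indexExists(String path) {
            try {
                // FSDirectory.open picks a suitable Directory implementation per platform
                Directory dir = FSDirectory.open(new File(path));
                return IndexReader.indexExists(dir);
            } catch (IOException e) {
                return false; // an unreadable location is treated as "no index yet"
            }
        }
    }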
@@ -902,8 +907,15 @@ public class DSIndexer
     private static IndexWriter openIndex(boolean wipeExisting)
             throws IOException
     {
-        IndexWriter writer = new IndexWriter(indexDirectory, getAnalyzer(), wipeExisting);
+        Directory dir = FSDirectory.open(new File(indexDirectory));
+        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_33, getAnalyzer());
+        if(wipeExisting){
+            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
+        }else{
+            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
+        }
+        IndexWriter writer = new IndexWriter(dir, iwc);
 
         /* Set maximum number of terms to index if present in dspace.cfg */
         if (maxfieldlength == -1)
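The new openIndex body above is the standard replacement for the old path-and-boolean IndexWriter constructors, which are gone in Lucene 3.x: writer settings move into an IndexWriterConfig (introduced in 3.1), and the boolean "create" flag becomes an OpenMode. A compact sketch of the same construction (the analyzer choice and class name are illustrative):

    import java.io.File;
    import java.io.IOException;

    import org.apache.lucene.analysis.SimpleAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;

    public final class WriterFactory {
        public static IndexWriter open(String path, boolean wipeExisting) throws IOException {
            Directory dir = FSDirectory.open(new File(path));
            IndexWriterConfig iwc =
                    new IndexWriterConfig(Version.LUCENE_33, new SimpleAnalyzer(Version.LUCENE_33));
            // CREATE truncates any existing index; CREATE_OR_APPEND opens or creates as needed
            iwc.setOpenMode(wipeExisting
                    ? IndexWriterConfig.OpenMode.CREATE
                    : IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
            return new IndexWriter(dir, iwc);
        }
    }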
@@ -982,8 +994,8 @@ public class DSIndexer
         if (name != null)
         {
-            doc.add(new Field("name", name, Field.Store.NO, Field.Index.TOKENIZED));
-            doc.add(new Field("default", name, Field.Store.NO, Field.Index.TOKENIZED));
+            doc.add(new Field("name", name, Field.Store.NO, Field.Index.ANALYZED));
+            doc.add(new Field("default", name, Field.Store.NO, Field.Index.ANALYZED));
         }
 
         return doc;
@@ -1008,8 +1020,8 @@ public class DSIndexer
         if (name != null)
         {
-            doc.add(new Field("name", name, Field.Store.NO, Field.Index.TOKENIZED));
-            doc.add(new Field("default", name, Field.Store.NO, Field.Index.TOKENIZED));
+            doc.add(new Field("name", name, Field.Store.NO, Field.Index.ANALYZED));
+            doc.add(new Field("default", name, Field.Store.NO, Field.Index.ANALYZED));
         }
 
         return doc;
@@ -1062,12 +1074,12 @@ public class DSIndexer
                     doc.add( new Field(indexConfigArr[i].indexName,
                                        DateTools.dateToString(d, DateTools.Resolution.SECOND),
                                        Field.Store.NO,
-                                       Field.Index.UN_TOKENIZED));
+                                       Field.Index.NOT_ANALYZED));
                     doc.add( new Field(indexConfigArr[i].indexName + ".year",
                                        DateTools.dateToString(d, DateTools.Resolution.YEAR),
                                        Field.Store.NO,
-                                       Field.Index.UN_TOKENIZED));
+                                       Field.Index.NOT_ANALYZED));
                 }
             }
             else if ("date".equalsIgnoreCase(indexConfigArr[i].type))
@@ -1078,12 +1090,12 @@ public class DSIndexer
                     doc.add( new Field(indexConfigArr[i].indexName,
                                        DateTools.dateToString(d, DateTools.Resolution.DAY),
                                        Field.Store.NO,
-                                       Field.Index.UN_TOKENIZED));
+                                       Field.Index.NOT_ANALYZED));
                    doc.add( new Field(indexConfigArr[i].indexName + ".year",
                                        DateTools.dateToString(d, DateTools.Resolution.YEAR),
                                        Field.Store.NO,
-                                       Field.Index.UN_TOKENIZED));
+                                       Field.Index.NOT_ANALYZED));
                 }
             }
             else
@@ -1099,7 +1111,7 @@ public class DSIndexer
                         doc.add( new Field(indexConfigArr[i].indexName+"_authority",
                                            mydc[j].authority,
                                            Field.Store.NO,
-                                           Field.Index.UN_TOKENIZED));
+                                           Field.Index.NOT_ANALYZED));
 
                         boolean valueAlreadyIndexed = false;
                         if (variants != null)
@@ -1110,7 +1122,7 @@ public class DSIndexer
                                 doc.add( new Field(indexConfigArr[i].indexName,
                                                    var,
                                                    Field.Store.NO,
-                                                   Field.Index.TOKENIZED));
+                                                   Field.Index.ANALYZED));
                                 if (var.equals(mydc[j].value))
                                 {
                                     valueAlreadyIndexed = true;
@@ -1121,7 +1133,7 @@ public class DSIndexer
                             doc.add( new Field("default",
                                                var,
                                                Field.Store.NO,
-                                               Field.Index.TOKENIZED));
+                                               Field.Index.ANALYZED));
                         }
                     }
                 }
@@ -1132,7 +1144,7 @@ public class DSIndexer
                         doc.add( new Field(indexConfigArr[i].indexName,
                                            mydc[j].value,
                                            Field.Store.NO,
-                                           Field.Index.TOKENIZED));
+                                           Field.Index.ANALYZED));
                     }
                 }
                 else
@@ -1141,11 +1153,11 @@ public class DSIndexer
                         doc.add( new Field(indexConfigArr[i].indexName,
                                            mydc[j].value,
                                            Field.Store.NO,
-                                           Field.Index.TOKENIZED));
+                                           Field.Index.ANALYZED));
                     }
                 }
 
-                doc.add( new Field("default", mydc[j].value, Field.Store.NO, Field.Index.TOKENIZED));
+                doc.add( new Field("default", mydc[j].value, Field.Store.NO, Field.Index.ANALYZED));
             }
         }
     }
@@ -1164,7 +1176,7 @@ public class DSIndexer
             if (dcv.length > 0)
             {
                 String value = OrderFormat.makeSortString(dcv[0].value, dcv[0].language, so.getType());
-                doc.add( new Field("sort_" + so.getName(), value, Field.Store.NO, Field.Index.UN_TOKENIZED) );
+                doc.add( new Field("sort_" + so.getName(), value, Field.Store.NO, Field.Index.NOT_ANALYZED) );
             }
         }
     }
@@ -1230,15 +1242,15 @@ public class DSIndexer
         // want to be able to check when last updated
         // (not tokenized, but it is indexed)
-        doc.add(new Field(LAST_INDEXED_FIELD, Long.toString(System.currentTimeMillis()), Field.Store.YES, Field.Index.UN_TOKENIZED));
-        doc.add(new Field(DOCUMENT_STATUS_FIELD, "archived", Field.Store.YES, Field.Index.UN_TOKENIZED));
+        doc.add(new Field(LAST_INDEXED_FIELD, Long.toString(System.currentTimeMillis()), Field.Store.YES, Field.Index.NOT_ANALYZED));
+        doc.add(new Field(DOCUMENT_STATUS_FIELD, "archived", Field.Store.YES, Field.Index.NOT_ANALYZED));
 
         // KEPT FOR BACKWARDS COMPATIBILITY
         // do location, type, handle first
         doc.add(new Field("type", Integer.toString(type), Field.Store.YES, Field.Index.NO));
 
         // New fields to weaken the dependence on handles, and allow for faster list display
-        doc.add(new Field("search.resourcetype", Integer.toString(type), Field.Store.YES, Field.Index.UN_TOKENIZED));
+        doc.add(new Field("search.resourcetype", Integer.toString(type), Field.Store.YES, Field.Index.NOT_ANALYZED));
         doc.add(new Field("search.resourceid", Integer.toString(id), Field.Store.YES, Field.Index.NO));
 
         // want to be able to search for handle, so use keyword
@@ -1246,20 +1258,20 @@ public class DSIndexer
         if (handle != null)
         {
             // ??? not sure what the "handletext" field is but it was there in writeItemIndex ???
-            doc.add(new Field("handletext", handle, Field.Store.YES, Field.Index.TOKENIZED));
+            doc.add(new Field("handletext", handle, Field.Store.YES, Field.Index.ANALYZED));
 
             // want to be able to search for handle, so use keyword
             // (not tokenized, but it is indexed)
-            doc.add(new Field("handle", handle, Field.Store.YES, Field.Index.UN_TOKENIZED));
+            doc.add(new Field("handle", handle, Field.Store.YES, Field.Index.NOT_ANALYZED));
 
             // add to full text index
-            doc.add(new Field("default", handle, Field.Store.NO, Field.Index.TOKENIZED));
+            doc.add(new Field("default", handle, Field.Store.NO, Field.Index.ANALYZED));
         }
 
         if(location != null)
         {
-            doc.add(new Field("location", location, Field.Store.NO, Field.Index.TOKENIZED));
-            doc.add(new Field("default", location, Field.Store.NO, Field.Index.TOKENIZED));
+            doc.add(new Field("location", location, Field.Store.NO, Field.Index.ANALYZED));
+            doc.add(new Field("default", location, Field.Store.NO, Field.Index.ANALYZED));
         }
 
         return doc;
@@ -1271,8 +1283,8 @@ public class DSIndexer
         // want to be able to check when last updated
         // (not tokenized, but it is indexed)
-        doc.add(new Field(LAST_INDEXED_FIELD, Long.toString(System.currentTimeMillis()), Field.Store.YES, Field.Index.UN_TOKENIZED));
-        doc.add(new Field(DOCUMENT_STATUS_FIELD, "deleted", Field.Store.YES, Field.Index.UN_TOKENIZED));
+        doc.add(new Field(LAST_INDEXED_FIELD, Long.toString(System.currentTimeMillis()), Field.Store.YES, Field.Index.NOT_ANALYZED));
+        doc.add(new Field(DOCUMENT_STATUS_FIELD, "deleted", Field.Store.YES, Field.Index.NOT_ANALYZED));
 
         // Do not add any other fields, as we don't want to be able to find it - just check the last indexed time
@@ -1285,8 +1297,8 @@ public class DSIndexer
         // want to be able to check when last updated
         // (not tokenized, but it is indexed)
-        doc.add(new Field(LAST_INDEXED_FIELD, Long.toString(System.currentTimeMillis()), Field.Store.YES, Field.Index.UN_TOKENIZED));
-        doc.add(new Field(DOCUMENT_STATUS_FIELD, "withdrawn", Field.Store.YES, Field.Index.UN_TOKENIZED));
+        doc.add(new Field(LAST_INDEXED_FIELD, Long.toString(System.currentTimeMillis()), Field.Store.YES, Field.Index.NOT_ANALYZED));
+        doc.add(new Field(DOCUMENT_STATUS_FIELD, "withdrawn", Field.Store.YES, Field.Index.NOT_ANALYZED));
 
         // Do not add any other fields, as we don't want to be able to find it - just check the last indexed time
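The bulk of this file's changes are mechanical renames: Field.Index.TOKENIZED became Field.Index.ANALYZED and Field.Index.UN_TOKENIZED became Field.Index.NOT_ANALYZED (renamed around Lucene 2.4, with the old names dropped by 3.0); indexing semantics are unchanged. For reference, a minimal sketch using the new names (field names and values are illustrative):

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;

    public final class FieldNaming {
        public static Document example() {
            Document doc = new Document();
            // NOT_ANALYZED (formerly UN_TOKENIZED): indexed as one exact term, good for IDs
            doc.add(new Field("handle", "123456789/1", Field.Store.YES, Field.Index.NOT_ANALYZED));
            // ANALYZED (formerly TOKENIZED): run through the analyzer, good for full text
            doc.add(new Field("default", "some full text to search", Field.Store.NO, Field.Index.ANALYZED));
            return doc;
        }
    }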

File: org/dspace/search/DSNonStemmingAnalyzer.java

@@ -13,6 +13,7 @@ import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.util.Version;
 
 /**
  * Custom Lucene Analyzer that combines the standard filter, lowercase filter
@@ -32,7 +33,7 @@ public class DSNonStemmingAnalyzer extends DSAnalyzer
         result = new StandardFilter(result);
         result = new LowerCaseFilter(result);
-        result = new StopFilter(result, stopSet);
+        result = new StopFilter(Version.LUCENE_33, result, stopSet);
 
         return result;
     }

File: org/dspace/search/DSQuery.java

@@ -7,6 +7,7 @@
  */
 package org.dspace.search;
 
+import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Iterator;
@@ -19,12 +20,14 @@ import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.queryParser.QueryParser;
 import org.apache.lucene.queryParser.TokenMgrError;
 import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.Hits;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.search.Searcher;
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.Version;
 import org.dspace.content.Collection;
 import org.dspace.content.Community;
 import org.dspace.core.ConfigurationManager;
@@ -113,9 +116,9 @@ public class DSQuery
         try
         {
             // grab a searcher, and do the search
-            Searcher searcher = getSearcher(c);
+            IndexSearcher searcher = getSearcher(c);
 
-            QueryParser qp = new QueryParser("default", DSIndexer.getAnalyzer());
+            QueryParser qp = new QueryParser(Version.LUCENE_33, "default", DSIndexer.getAnalyzer());
             log.debug("Final query string: " + querystring);
 
             if (operator == null || operator.equals("OR"))
@@ -126,57 +129,30 @@ public class DSQuery
             {
                 qp.setDefaultOperator(QueryParser.AND_OPERATOR);
             }
 
-            Query myquery = qp.parse(querystring);
-            Hits hits = null;
-
-            try
-            {
-                if (args.getSortOption() == null)
-                {
-                    SortField[] sortFields = new SortField[] {
-                            new SortField("search.resourcetype", true),
-                            new SortField(null, SortField.SCORE, SortOption.ASCENDING.equals(args.getSortOrder()))
-                        };
-                    hits = searcher.search(myquery, new Sort(sortFields));
-                }
-                else
-                {
-                    SortField[] sortFields = new SortField[] {
-                            new SortField("search.resourcetype", true),
-                            new SortField("sort_" + args.getSortOption().getName(), SortOption.DESCENDING.equals(args.getSortOrder())),
-                            SortField.FIELD_SCORE
-                        };
-                    hits = searcher.search(myquery, new Sort(sortFields));
-                }
-            }
-            catch (Exception e)
-            {
-                // Lucene can throw an exception if it is unable to determine a sort time from the specified field
-                // Provide a fall back that just works on relevancy.
-                log.error("Unable to use speficied sort option: " + (args.getSortOption() == null ? "type/relevance": args.getSortOption().getName()));
-                hits = searcher.search(myquery, new Sort(SortField.FIELD_SCORE));
-            }
+            Query myquery = qp.parse(querystring);
+
+            //Retrieve enough docs to get all the results we need !
+            TopDocs hits = performQuery(args, searcher, myquery, args.getPageSize() * (args.getStart() + 1));
 
             // set total number of hits
-            qr.setHitCount(hits.length());
+            qr.setHitCount(hits.totalHits);
 
             // We now have a bunch of hits - snip out a 'window'
             // defined in start, count and return the handles
             // from that window
             // first, are there enough hits?
-            if (args.getStart() < hits.length())
+            if (args.getStart() < hits.totalHits)
             {
                 // get as many as we can, up to the window size
                 // how many are available after snipping off at offset 'start'?
-                int hitsRemaining = hits.length() - args.getStart();
+                int hitsRemaining = hits.totalHits - args.getStart();
 
                 int hitsToProcess = (hitsRemaining < args.getPageSize()) ? hitsRemaining
                         : args.getPageSize();
 
                 for (int i = args.getStart(); i < (args.getStart() + hitsToProcess); i++)
                 {
-                    Document d = hits.doc(i);
+                    Document d = searcher.doc(hits.scoreDocs[i].doc);
 
                     String resourceId = d.get("search.resourceid");
                     String resourceType = d.get("search.resourcetype");
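The driver of this rewrite is that the Hits class, which lazily paged through results, was removed in Lucene 3.0; its replacement asks up front for the top-n results and exposes them as TopDocs/ScoreDoc, hence the precomputed cap of pageSize * (start + 1) above. A standalone sketch of the windowing idiom (index path, field name, and analyzer are illustrative):

    import java.io.File;
    import java.io.IOException;

    import org.apache.lucene.analysis.SimpleAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.queryParser.ParseException;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;

    public final class PagedSearch {
        public static void printPage(String indexPath, String queryString, int start, int pageSize)
                throws IOException, ParseException {
            IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File(indexPath)));
            try {
                QueryParser qp = new QueryParser(Version.LUCENE_33, "default",
                        new SimpleAnalyzer(Version.LUCENE_33));
                Query query = qp.parse(queryString);

                // TopDocs is eager: request enough hits to cover the window we want
                TopDocs hits = searcher.search(query, start + pageSize);

                // scoreDocs holds at most (start + pageSize) entries even if totalHits is larger
                int end = Math.min(start + pageSize, hits.scoreDocs.length);
                for (int i = start; i < end; i++) {
                    Document d = searcher.doc(hits.scoreDocs[i].doc);
                    System.out.println(d.get("handle"));
                }
            } finally {
                searcher.close();
            }
        }
    }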
@@ -187,15 +163,15 @@ public class DSQuery
                     switch (Integer.parseInt( resourceType != null ? resourceType : handleType))
                     {
                     case Constants.ITEM:
-                        hitTypes.add(Integer.valueOf(Constants.ITEM));
+                        hitTypes.add(Constants.ITEM);
                         break;
 
                     case Constants.COLLECTION:
-                        hitTypes.add(Integer.valueOf(Constants.COLLECTION));
+                        hitTypes.add(Constants.COLLECTION);
                         break;
 
                     case Constants.COMMUNITY:
-                        hitTypes.add(Integer.valueOf(Constants.COMMUNITY));
+                        hitTypes.add(Constants.COMMUNITY);
                         break;
                     }
@@ -230,6 +206,38 @@ public class DSQuery
         return qr;
     }
 
+    private static TopDocs performQuery(QueryArgs args, IndexSearcher searcher, Query myquery, int max) throws IOException {
+        TopDocs hits;
+        try
+        {
+            if (args.getSortOption() == null)
+            {
+                SortField[] sortFields = new SortField[] {
+                        new SortField("search.resourcetype", SortField.INT, true),
+                        new SortField(null, SortField.SCORE, SortOption.ASCENDING.equals(args.getSortOrder()))
+                    };
+                hits = searcher.search(myquery, max, new Sort(sortFields));
+            }
+            else
+            {
+                SortField[] sortFields = new SortField[] {
+                        new SortField("search.resourcetype", SortField.INT, true),
+                        new SortField("sort_" + args.getSortOption().getName(), SortField.STRING, SortOption.DESCENDING.equals(args.getSortOrder())),
+                        SortField.FIELD_SCORE
+                    };
+                hits = searcher.search(myquery, max, new Sort(sortFields));
+            }
+        }
+        catch (Exception e)
+        {
+            // Lucene can throw an exception if it is unable to determine a sort time from the specified field
+            // Provide a fall back that just works on relevancy.
+            log.error("Unable to use speficied sort option: " + (args.getSortOption() == null ? "type/relevance": args.getSortOption().getName()));
+            hits = searcher.search(myquery, max, new Sort(SortField.FIELD_SCORE));
+        }
+        return hits;
+    }
+
     static String checkEmptyQuery(String myquery)
     {
         if (myquery == null || myquery.equals("()") || myquery.equals(""))
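Note that the SortField constructors in performQuery above now carry an explicit type (SortField.INT, SortField.STRING): Lucene 3.x deprecates auto-detected sort types, and a wrong or missing type surfaces as exactly the kind of runtime failure the catch-all fallback guards against. A small sketch of a typed, multi-key sort (the sort_title field is hypothetical):

    import org.apache.lucene.search.Sort;
    import org.apache.lucene.search.SortField;

    public final class TypedSorts {
        // Resource type descending, then title ascending, then relevance as a tie-breaker
        public static Sort typeThenTitle() {
            return new Sort(new SortField[] {
                    new SortField("search.resourcetype", SortField.INT, true),
                    new SortField("sort_title", SortField.STRING, false),
                    SortField.FIELD_SCORE
            });
        }
    }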
@@ -359,7 +367,7 @@ public class DSQuery
         {
             String thisHandle = (String) i.next();
             Integer thisType = (Integer) j.next();
-            String type = Constants.typeText[thisType.intValue()];
+            String type = Constants.typeText[thisType];
 
             // also look up type
             System.out.println(type + "\t" + thisHandle);
@@ -421,7 +429,10 @@ public class DSQuery
         // If we have already opened a searcher, check to see if the index has been updated
         // If it has, we need to close the existing searcher - we will open a new one later
-        if (searcher != null && lastModified != IndexReader.getCurrentVersion(indexDir))
+        Directory searchDir = FSDirectory.open(new File(indexDir));
+
+        if (searcher != null && lastModified != IndexReader.getCurrentVersion(searchDir))
         {
             try
             {
@@ -445,17 +456,18 @@ public class DSQuery
         if (searcher == null)
         {
             // So, open a new searcher
-            lastModified = IndexReader.getCurrentVersion(indexDir);
+            lastModified = IndexReader.getCurrentVersion(searchDir);
             String osName = System.getProperty("os.name");
             if (osName != null && osName.toLowerCase().contains("windows"))
             {
-                searcher = new IndexSearcher(indexDir){
+                searcher = new IndexSearcher(searchDir){
                     /*
                      * TODO: Has Lucene fixed this bug yet?
                      * Lucene doesn't release read locks in
                      * windows properly on finalize. Our hack
                      * extend IndexSearcher to force close().
                      */
+                    @Override
                     protected void finalize() throws Throwable {
                         this.close();
                         super.finalize();
@@ -464,7 +476,7 @@ public class DSQuery
             }
             else
            {
-                searcher = new IndexSearcher(indexDir);
+                searcher = new IndexSearcher(searchDir);
             }
         }
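These last hunks complete the Directory migration: IndexReader.getCurrentVersion and the IndexSearcher constructor no longer accept a filesystem path in Lucene 3.x, so getSearcher resolves the path through FSDirectory.open once and reuses it. A condensed sketch of the same cache-and-reopen logic (the class and method names are illustrative, not DSpace code):

    import java.io.File;
    import java.io.IOException;

    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public final class SearcherCache {
        private static IndexSearcher searcher;
        private static long lastVersion = -1;

        // Reopen the cached searcher only when the index version has advanced,
        // mirroring the getCurrentVersion check in getSearcher above
        public static synchronized IndexSearcher get(String indexPath) throws IOException {
            Directory dir = FSDirectory.open(new File(indexPath));
            long current = IndexReader.getCurrentVersion(dir);
            if (searcher == null || current != lastVersion) {
                if (searcher != null) {
                    searcher.close();
                }
                searcher = new IndexSearcher(dir);
                lastVersion = current;
            }
            return searcher;
        }
    }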