[DS-980] Upgraded solr & lucene to version 3.3.0

git-svn-id: http://scm.dspace.org/svn/repo/dspace/trunk@6545 9c30dcfa-912a-0410-8fc2-9e0234be79fd
Tim Donohue
2011-08-12 19:49:34 +00:00
parent cda73317df
commit e40a2c9441
9 changed files with 1016 additions and 1028 deletions

View File

@@ -16,6 +16,7 @@ import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.util.Version;
import org.dspace.core.ConfigurationManager;
/**
@@ -47,7 +48,7 @@ public class DSAnalyzer extends Analyzer
/*
* Stop table
*/
protected static final Set stopSet = StopFilter.makeStopSet(STOP_WORDS);
protected static final Set stopSet = StopFilter.makeStopSet(Version.LUCENE_33,STOP_WORDS);
/*
* Create a token stream for this analyzer.
@@ -59,7 +60,7 @@ public class DSAnalyzer extends Analyzer
result = new StandardFilter(result);
result = new LowerCaseFilter(result);
result = new StopFilter(result, stopSet);
result = new StopFilter(Version.LUCENE_33, result, stopSet);
result = new PorterStemFilter(result);
return result;
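
The change above is the standard Lucene 3.x migration for analyzers: `StopFilter.makeStopSet` and the `StopFilter` constructor both gained a leading `Version` argument so filters can preserve or drop legacy behavior. A minimal self-contained sketch of the pattern (class name and placeholder stop-word list are illustrative, not DSpace's):

```java
import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

public class VersionedStemmingAnalyzer extends Analyzer {
    // Placeholder list; DSAnalyzer uses its own STOP_WORDS constant
    private static final String[] STOP_WORDS = { "a", "and", "of", "the", "to" };

    // Lucene 3.x: makeStopSet takes the match Version as its first argument
    private static final Set<?> STOP_SET =
            StopFilter.makeStopSet(Version.LUCENE_33, STOP_WORDS);

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream result = new StandardTokenizer(Version.LUCENE_33, reader);
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        // StopFilter likewise gained a Version parameter in 3.x
        result = new StopFilter(Version.LUCENE_33, result, STOP_SET);
        result = new PorterStemFilter(result);
        return result;
    }
}
```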

View File

@@ -39,8 +39,12 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.dspace.content.Bitstream;
import org.dspace.content.Bundle;
import org.dspace.content.Collection;
@@ -208,22 +212,23 @@ public class DSIndexer
/*
* Create the index directory if it doesn't already exist.
*/
if (!IndexReader.indexExists(indexDirectory))
{
try
{
if (!IndexReader.indexExists(FSDirectory.open(new File(indexDirectory))))
{
if (!new File(indexDirectory).mkdirs())
{
log.error("Unable to create index directory: " + indexDirectory);
}
openIndex(true).close();
}
}
catch (IOException e)
{
throw new IllegalStateException("Could not create search index: " + e.getMessage(),e);
}
}
}
public static void setBatchProcessingMode(boolean mode)
{
@@ -902,8 +907,15 @@ public class DSIndexer
private static IndexWriter openIndex(boolean wipeExisting)
throws IOException
{
Directory dir = FSDirectory.open(new File(indexDirectory));
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_33, getAnalyzer());
if(wipeExisting){
iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
}else{
iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
}
IndexWriter writer = new IndexWriter(indexDirectory, getAnalyzer(), wipeExisting);
IndexWriter writer = new IndexWriter(dir, iwc);
/* Set maximum number of terms to index if present in dspace.cfg */
if (maxfieldlength == -1)
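
Both hunks above follow the same Lucene 3.x pattern: index paths become `Directory` instances via `FSDirectory.open`, and the removed `IndexWriter(String, Analyzer, boolean)` constructor is replaced by a `Directory` plus an `IndexWriterConfig`, with the old `create` boolean expressed as an `OpenMode`. A minimal sketch (the analyzer is a stand-in for `DSIndexer.getAnalyzer()`):

```java
import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class OpenIndexSketch {
    static IndexWriter openIndex(String indexPath, boolean wipeExisting) throws IOException {
        // Paths are now wrapped in a Directory instead of being passed as Strings
        Directory dir = FSDirectory.open(new File(indexPath));

        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_33); // stand-in analyzer
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_33, analyzer);

        // OpenMode.CREATE wipes the index (old "create = true");
        // CREATE_OR_APPEND keeps existing content (old "create = false")
        iwc.setOpenMode(wipeExisting
                ? IndexWriterConfig.OpenMode.CREATE
                : IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

        return new IndexWriter(dir, iwc);
    }
}
```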
@@ -982,8 +994,8 @@ public class DSIndexer
if (name != null)
{
doc.add(new Field("name", name, Field.Store.NO, Field.Index.TOKENIZED));
doc.add(new Field("default", name, Field.Store.NO, Field.Index.TOKENIZED));
doc.add(new Field("name", name, Field.Store.NO, Field.Index.ANALYZED));
doc.add(new Field("default", name, Field.Store.NO, Field.Index.ANALYZED));
}
return doc;
@@ -1008,8 +1020,8 @@ public class DSIndexer
if (name != null)
{
doc.add(new Field("name", name, Field.Store.NO, Field.Index.TOKENIZED));
doc.add(new Field("default", name, Field.Store.NO, Field.Index.TOKENIZED));
doc.add(new Field("name", name, Field.Store.NO, Field.Index.ANALYZED));
doc.add(new Field("default", name, Field.Store.NO, Field.Index.ANALYZED));
}
return doc;
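
This rename recurs through the rest of the file: Lucene 3.x replaced `Field.Index.TOKENIZED` with `Field.Index.ANALYZED` and `Field.Index.UN_TOKENIZED` with `Field.Index.NOT_ANALYZED`; the indexing behavior is unchanged, only the constants were renamed. A short sketch of both variants (field names are illustrative):

```java
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public class FieldRenameSketch {
    static Document build(String title, String handle) {
        Document doc = new Document();
        // ANALYZED (was TOKENIZED): run through the analyzer, searchable term by term
        doc.add(new Field("title", title, Field.Store.NO, Field.Index.ANALYZED));
        // NOT_ANALYZED (was UN_TOKENIZED): indexed as one exact token, e.g. identifiers
        doc.add(new Field("handle", handle, Field.Store.YES, Field.Index.NOT_ANALYZED));
        return doc;
    }
}
```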
@@ -1062,12 +1074,12 @@ public class DSIndexer
doc.add( new Field(indexConfigArr[i].indexName,
DateTools.dateToString(d, DateTools.Resolution.SECOND),
Field.Store.NO,
Field.Index.UN_TOKENIZED));
Field.Index.NOT_ANALYZED));
doc.add( new Field(indexConfigArr[i].indexName + ".year",
DateTools.dateToString(d, DateTools.Resolution.YEAR),
Field.Store.NO,
Field.Index.UN_TOKENIZED));
Field.Index.NOT_ANALYZED));
}
}
else if ("date".equalsIgnoreCase(indexConfigArr[i].type))
@@ -1078,12 +1090,12 @@ public class DSIndexer
doc.add( new Field(indexConfigArr[i].indexName,
DateTools.dateToString(d, DateTools.Resolution.DAY),
Field.Store.NO,
Field.Index.UN_TOKENIZED));
Field.Index.NOT_ANALYZED));
doc.add( new Field(indexConfigArr[i].indexName + ".year",
DateTools.dateToString(d, DateTools.Resolution.YEAR),
Field.Store.NO,
Field.Index.UN_TOKENIZED));
Field.Index.NOT_ANALYZED));
}
}
else
@@ -1099,7 +1111,7 @@ public class DSIndexer
doc.add( new Field(indexConfigArr[i].indexName+"_authority",
mydc[j].authority,
Field.Store.NO,
Field.Index.UN_TOKENIZED));
Field.Index.NOT_ANALYZED));
boolean valueAlreadyIndexed = false;
if (variants != null)
@@ -1110,7 +1122,7 @@ public class DSIndexer
doc.add( new Field(indexConfigArr[i].indexName,
var,
Field.Store.NO,
Field.Index.TOKENIZED));
Field.Index.ANALYZED));
if (var.equals(mydc[j].value))
{
valueAlreadyIndexed = true;
@@ -1121,7 +1133,7 @@ public class DSIndexer
doc.add( new Field("default",
var,
Field.Store.NO,
Field.Index.TOKENIZED));
Field.Index.ANALYZED));
}
}
}
@@ -1132,7 +1144,7 @@ public class DSIndexer
doc.add( new Field(indexConfigArr[i].indexName,
mydc[j].value,
Field.Store.NO,
Field.Index.TOKENIZED));
Field.Index.ANALYZED));
}
}
else
@@ -1141,11 +1153,11 @@ public class DSIndexer
doc.add( new Field(indexConfigArr[i].indexName,
mydc[j].value,
Field.Store.NO,
Field.Index.TOKENIZED));
Field.Index.ANALYZED));
}
}
doc.add( new Field("default", mydc[j].value, Field.Store.NO, Field.Index.TOKENIZED));
doc.add( new Field("default", mydc[j].value, Field.Store.NO, Field.Index.ANALYZED));
}
}
}
@@ -1164,7 +1176,7 @@ public class DSIndexer
if (dcv.length > 0)
{
String value = OrderFormat.makeSortString(dcv[0].value, dcv[0].language, so.getType());
doc.add( new Field("sort_" + so.getName(), value, Field.Store.NO, Field.Index.UN_TOKENIZED) );
doc.add( new Field("sort_" + so.getName(), value, Field.Store.NO, Field.Index.NOT_ANALYZED) );
}
}
}
@@ -1230,15 +1242,15 @@ public class DSIndexer
// want to be able to check when last updated
// (not tokenized, but it is indexed)
doc.add(new Field(LAST_INDEXED_FIELD, Long.toString(System.currentTimeMillis()), Field.Store.YES, Field.Index.UN_TOKENIZED));
doc.add(new Field(DOCUMENT_STATUS_FIELD, "archived", Field.Store.YES, Field.Index.UN_TOKENIZED));
doc.add(new Field(LAST_INDEXED_FIELD, Long.toString(System.currentTimeMillis()), Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field(DOCUMENT_STATUS_FIELD, "archived", Field.Store.YES, Field.Index.NOT_ANALYZED));
// KEPT FOR BACKWARDS COMPATIBILITY
// do location, type, handle first
doc.add(new Field("type", Integer.toString(type), Field.Store.YES, Field.Index.NO));
// New fields to weaken the dependence on handles, and allow for faster list display
doc.add(new Field("search.resourcetype", Integer.toString(type), Field.Store.YES, Field.Index.UN_TOKENIZED));
doc.add(new Field("search.resourcetype", Integer.toString(type), Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field("search.resourceid", Integer.toString(id), Field.Store.YES, Field.Index.NO));
// want to be able to search for handle, so use keyword
@@ -1246,20 +1258,20 @@ public class DSIndexer
if (handle != null)
{
// ??? not sure what the "handletext" field is but it was there in writeItemIndex ???
doc.add(new Field("handletext", handle, Field.Store.YES, Field.Index.TOKENIZED));
doc.add(new Field("handletext", handle, Field.Store.YES, Field.Index.ANALYZED));
// want to be able to search for handle, so use keyword
// (not tokenized, but it is indexed)
doc.add(new Field("handle", handle, Field.Store.YES, Field.Index.UN_TOKENIZED));
doc.add(new Field("handle", handle, Field.Store.YES, Field.Index.NOT_ANALYZED));
// add to full text index
doc.add(new Field("default", handle, Field.Store.NO, Field.Index.TOKENIZED));
doc.add(new Field("default", handle, Field.Store.NO, Field.Index.ANALYZED));
}
if(location != null)
{
doc.add(new Field("location", location, Field.Store.NO, Field.Index.TOKENIZED));
doc.add(new Field("default", location, Field.Store.NO, Field.Index.TOKENIZED));
doc.add(new Field("location", location, Field.Store.NO, Field.Index.ANALYZED));
doc.add(new Field("default", location, Field.Store.NO, Field.Index.ANALYZED));
}
return doc;
@@ -1271,8 +1283,8 @@ public class DSIndexer
// want to be able to check when last updated
// (not tokenized, but it is indexed)
doc.add(new Field(LAST_INDEXED_FIELD, Long.toString(System.currentTimeMillis()), Field.Store.YES, Field.Index.UN_TOKENIZED));
doc.add(new Field(DOCUMENT_STATUS_FIELD, "deleted", Field.Store.YES, Field.Index.UN_TOKENIZED));
doc.add(new Field(LAST_INDEXED_FIELD, Long.toString(System.currentTimeMillis()), Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field(DOCUMENT_STATUS_FIELD, "deleted", Field.Store.YES, Field.Index.NOT_ANALYZED));
// Do not add any other fields, as we don't want to be able to find it - just check the last indexed time
@@ -1285,8 +1297,8 @@ public class DSIndexer
// want to be able to check when last updated
// (not tokenized, but it is indexed)
doc.add(new Field(LAST_INDEXED_FIELD, Long.toString(System.currentTimeMillis()), Field.Store.YES, Field.Index.UN_TOKENIZED));
doc.add(new Field(DOCUMENT_STATUS_FIELD, "withdrawn", Field.Store.YES, Field.Index.UN_TOKENIZED));
doc.add(new Field(LAST_INDEXED_FIELD, Long.toString(System.currentTimeMillis()), Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field(DOCUMENT_STATUS_FIELD, "withdrawn", Field.Store.YES, Field.Index.NOT_ANALYZED));
// Do not add any other fields, as we don't want to be able to find it - just check the last indexed time

View File

@@ -13,6 +13,7 @@ import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.util.Version;
/**
* Custom Lucene Analyzer that combines the standard filter, lowercase filter
@@ -32,7 +33,7 @@ public class DSNonStemmingAnalyzer extends DSAnalyzer
result = new StandardFilter(result);
result = new LowerCaseFilter(result);
result = new StopFilter(result, stopSet);
result = new StopFilter(Version.LUCENE_33, result, stopSet);
return result;
}

View File

@@ -7,6 +7,7 @@
*/
package org.dspace.search;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
@@ -19,12 +20,14 @@ import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.TokenMgrError;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.dspace.content.Collection;
import org.dspace.content.Community;
import org.dspace.core.ConfigurationManager;
@@ -113,9 +116,9 @@ public class DSQuery
try
{
// grab a searcher, and do the search
Searcher searcher = getSearcher(c);
IndexSearcher searcher = getSearcher(c);
QueryParser qp = new QueryParser("default", DSIndexer.getAnalyzer());
QueryParser qp = new QueryParser(Version.LUCENE_33, "default", DSIndexer.getAnalyzer());
log.debug("Final query string: " + querystring);
if (operator == null || operator.equals("OR"))
@@ -128,55 +131,28 @@ public class DSQuery
}
Query myquery = qp.parse(querystring);
Hits hits = null;
try
{
if (args.getSortOption() == null)
{
SortField[] sortFields = new SortField[] {
new SortField("search.resourcetype", true),
new SortField(null, SortField.SCORE, SortOption.ASCENDING.equals(args.getSortOrder()))
};
hits = searcher.search(myquery, new Sort(sortFields));
}
else
{
SortField[] sortFields = new SortField[] {
new SortField("search.resourcetype", true),
new SortField("sort_" + args.getSortOption().getName(), SortOption.DESCENDING.equals(args.getSortOrder())),
SortField.FIELD_SCORE
};
hits = searcher.search(myquery, new Sort(sortFields));
}
}
catch (Exception e)
{
// Lucene can throw an exception if it is unable to determine a sort type from the specified field
// Provide a fallback that just works on relevancy.
log.error("Unable to use specified sort option: " + (args.getSortOption() == null ? "type/relevance" : args.getSortOption().getName()));
hits = searcher.search(myquery, new Sort(SortField.FIELD_SCORE));
}
// Retrieve enough docs to get all the results we need!
TopDocs hits = performQuery(args, searcher, myquery, args.getPageSize() * (args.getStart() + 1));
// set total number of hits
qr.setHitCount(hits.length());
qr.setHitCount(hits.totalHits);
// We now have a bunch of hits - snip out a 'window'
// defined in start, count and return the handles
// from that window
// first, are there enough hits?
if (args.getStart() < hits.length())
if (args.getStart() < hits.totalHits)
{
// get as many as we can, up to the window size
// how many are available after snipping off at offset 'start'?
int hitsRemaining = hits.length() - args.getStart();
int hitsRemaining = hits.totalHits - args.getStart();
int hitsToProcess = (hitsRemaining < args.getPageSize()) ? hitsRemaining
: args.getPageSize();
for (int i = args.getStart(); i < (args.getStart() + hitsToProcess); i++)
{
Document d = hits.doc(i);
Document d = searcher.doc(hits.scoreDocs[i].doc);
String resourceId = d.get("search.resourceid");
String resourceType = d.get("search.resourcetype");
@@ -187,15 +163,15 @@ public class DSQuery
switch (Integer.parseInt( resourceType != null ? resourceType : handleType))
{
case Constants.ITEM:
hitTypes.add(Integer.valueOf(Constants.ITEM));
hitTypes.add(Constants.ITEM);
break;
case Constants.COLLECTION:
hitTypes.add(Integer.valueOf(Constants.COLLECTION));
hitTypes.add(Constants.COLLECTION);
break;
case Constants.COMMUNITY:
hitTypes.add(Integer.valueOf(Constants.COMMUNITY));
hitTypes.add(Constants.COMMUNITY);
break;
}
@@ -230,6 +206,38 @@ public class DSQuery
return qr;
}
private static TopDocs performQuery(QueryArgs args, IndexSearcher searcher, Query myquery, int max) throws IOException {
TopDocs hits;
try
{
if (args.getSortOption() == null)
{
SortField[] sortFields = new SortField[] {
new SortField("search.resourcetype", SortField.INT, true),
new SortField(null, SortField.SCORE, SortOption.ASCENDING.equals(args.getSortOrder()))
};
hits = searcher.search(myquery, max, new Sort(sortFields));
}
else
{
SortField[] sortFields = new SortField[] {
new SortField("search.resourcetype", SortField.INT, true),
new SortField("sort_" + args.getSortOption().getName(), SortField.STRING, SortOption.DESCENDING.equals(args.getSortOrder())),
SortField.FIELD_SCORE
};
hits = searcher.search(myquery, max, new Sort(sortFields));
}
}
catch (Exception e)
{
// Lucene can throw an exception if it is unable to determine a sort type from the specified field
// Provide a fallback that just works on relevancy.
log.error("Unable to use specified sort option: " + (args.getSortOption() == null ? "type/relevance" : args.getSortOption().getName()));
hits = searcher.search(myquery, max, new Sort(SortField.FIELD_SCORE));
}
return hits;
}
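
The extracted `performQuery` covers the other big 3.x break: the `Hits` class was removed, so searches now request an explicit maximum and get back a `TopDocs`, and stored documents are fetched through the searcher. Note also that 3.x `SortField` constructors want an explicit type (`SortField.INT`, `SortField.STRING`), which is why the sort fields above gained one. A minimal sketch of the parse-search-page flow (analyzer and field names are placeholders):

```java
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.Version;

public class TopDocsSketch {
    static void search(IndexSearcher searcher, String querystring,
                       int start, int pageSize) throws Exception {
        // QueryParser also takes a match Version in 3.x
        QueryParser qp = new QueryParser(Version.LUCENE_33, "default",
                new StandardAnalyzer(Version.LUCENE_33));
        Query query = qp.parse(querystring);

        // Hits paged lazily; TopDocs needs the maximum up front
        TopDocs hits = searcher.search(query, start + pageSize);

        // hits.totalHits replaces hits.length()
        int end = Math.min(start + pageSize, hits.scoreDocs.length);
        for (int i = start; i < end; i++) {
            // hits.doc(i) becomes searcher.doc(scoreDocs[i].doc)
            Document d = searcher.doc(hits.scoreDocs[i].doc);
            System.out.println(d.get("search.resourceid"));
        }
    }
}
```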
static String checkEmptyQuery(String myquery)
{
if (myquery == null || myquery.equals("()") || myquery.equals(""))
@@ -359,7 +367,7 @@ public class DSQuery
{
String thisHandle = (String) i.next();
Integer thisType = (Integer) j.next();
String type = Constants.typeText[thisType.intValue()];
String type = Constants.typeText[thisType];
// also look up type
System.out.println(type + "\t" + thisHandle);
@@ -421,7 +429,10 @@ public class DSQuery
// If we have already opened a searcher, check to see if the index has been updated
// If it has, we need to close the existing searcher - we will open a new one later
if (searcher != null && lastModified != IndexReader.getCurrentVersion(indexDir))
Directory searchDir = FSDirectory.open(new File(indexDir));
if (searcher != null && lastModified != IndexReader.getCurrentVersion(searchDir))
{
try
{
@@ -445,17 +456,18 @@ public class DSQuery
if (searcher == null)
{
// So, open a new searcher
lastModified = IndexReader.getCurrentVersion(indexDir);
lastModified = IndexReader.getCurrentVersion(searchDir);
String osName = System.getProperty("os.name");
if (osName != null && osName.toLowerCase().contains("windows"))
{
searcher = new IndexSearcher(indexDir){
searcher = new IndexSearcher(searchDir){
/*
* TODO: Has Lucene fixed this bug yet?
* Lucene doesn't release read locks in
* windows properly on finalize. Our hack
* extend IndexSearcher to force close().
*/
@Override
protected void finalize() throws Throwable {
this.close();
super.finalize();
@@ -464,7 +476,7 @@ public class DSQuery
}
else
{
searcher = new IndexSearcher(indexDir);
searcher = new IndexSearcher(searchDir);
}
}
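
Similarly, `IndexSearcher` no longer accepts a path `String`; both the searcher and `IndexReader.getCurrentVersion` now operate on a `Directory`. A rough sketch of the open-or-refresh logic above, minus DSpace's Windows finalizer workaround:

```java
import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class SearcherCacheSketch {
    private IndexSearcher searcher;
    private long lastModified = -1;

    synchronized IndexSearcher getSearcher(String indexDir) throws IOException {
        Directory searchDir = FSDirectory.open(new File(indexDir));

        // getCurrentVersion now takes a Directory; close stale searchers on change
        if (searcher != null && lastModified != IndexReader.getCurrentVersion(searchDir)) {
            searcher.close();
            searcher = null;
        }
        if (searcher == null) {
            lastModified = IndexReader.getCurrentVersion(searchDir);
            searcher = new IndexSearcher(searchDir);
        }
        return searcher;
    }
}
```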

View File

@@ -28,7 +28,7 @@
<dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-solrj</artifactId>
<version>1.4.1</version>
<version>3.3.0</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>

View File

@@ -81,7 +81,7 @@
<dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-solrj</artifactId>
<version>1.4.1</version>
<version>3.3.0</version>
</dependency>
<dependency>
<groupId>org.dspace.dependencies</groupId>

View File

@@ -46,7 +46,7 @@
<dependency>
<groupId>org.dspace</groupId>
<artifactId>dspace-solr</artifactId>
<version>1.4.1.0</version>
<version>3.3.0.0</version>
<classifier>skinny</classifier>
<type>war</type>
</dependency>
@@ -54,7 +54,7 @@
<dependency>
<groupId>org.dspace</groupId>
<artifactId>dspace-solr</artifactId>
<version>1.4.1.0</version>
<version>3.3.0.0</version>
<classifier>classes</classifier>
<type>jar</type>
</dependency>

View File

@@ -45,14 +45,16 @@
that avoids logging every request
-->
<schema name="example" version="1.2">
<schema name="example" version="1.4">
<!-- attribute "name" is the name of this schema and is only used for display purposes.
Applications should change this to reflect the nature of the search collection.
version="1.2" is Solr's version number for the schema syntax and semantics. It should
version="1.4" is Solr's version number for the schema syntax and semantics. It should
not normally be changed by applications.
1.0: multiValued attribute did not exist, all fields are multiValued by nature
1.1: multiValued attribute introduced, false by default
1.2: omitTermFreqAndPositions attribute introduced, true by default except for text fields.
1.3: removed optional field compress feature
1.4: default auto-phrase (QueryParser feature) to off
-->
<types>
@@ -271,16 +273,7 @@
</analyzer>
</fieldType>
<!--
Setup simple analysis for spell checking
-->
<fieldType name="textSpell" class="solr.StrField" positionIncrementGap="100" >
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
<!-- A general unstemmed text field - good if one does not know the language of the field -->
<fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
@@ -410,36 +403,6 @@
any data added to them will be ignored outright. -->
<fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
<!-- This is an example of using the KeywordTokenizer along
with various TokenFilterFactories to produce a sortable field
that does not include some properties of the source text
-->
<fieldType name="handleIdentifier" class="solr.StrField" sortMissingLast="true" omitNorms="true">
<analyzer>
<!-- KeywordTokenizer does no actual tokenizing, so the entire
input string is preserved as a single token
-->
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory" />
<!-- The TrimFilter removes any leading or trailing whitespace -->
<filter class="solr.TrimFilterFactory" />
<!-- The PatternReplaceFilter gives you the flexibility to use
Java Regular expression to replace any sequence of characters
matching a pattern with an arbitrary replacement string,
which may include back refrences to portions of the orriginal
string matched by the pattern.
See the Java Regular Expression documentation for more
infomation on pattern and replacement string syntax.
http://java.sun.com/j2se/1.5.0/docs/api/java/util/regex/package-summary.html
-->
<filter class="solr.PatternReplaceFilterFactory" pattern="http://hdl.handle.net/" replacement="" replace="all"/>
</analyzer>
</fieldType>
<!-- This is an example of using the KeywordTokenizer along
with various TokenFilterFactories to produce a sortable field
@@ -580,7 +543,6 @@
results by manufacturer. copied from "manu" via copyField -->
<!--<field name="manu_exact" type="string" indexed="true" stored="false"/>-->
<!--<field name="spell" type="textSpell" indexed="true" stored="true" multiValued="true"/>-->
<!-- Dynamic field definitions. If a field name is not found, dynamicFields
will be used if the name matches any of the patterns.
RESTRICTION: the glob-like pattern in the name attribute must have

View File

@@ -347,12 +347,12 @@
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>2.9.3</version>
<version>3.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers</artifactId>
<version>2.9.3</version>
<version>3.3.0</version>
</dependency>
<dependency>
<groupId>org.dspace</groupId>