[DS-980] Upgraded solr & lucene to version 3.3.0

git-svn-id: http://scm.dspace.org/svn/repo/dspace/trunk@6545 9c30dcfa-912a-0410-8fc2-9e0234be79fd
Author: Tim Donohue
Date:   2011-08-12 19:49:34 +00:00
parent cda73317df
commit e40a2c9441
9 changed files with 1016 additions and 1028 deletions

File: org/dspace/search/DSAnalyzer.java

@@ -16,6 +16,7 @@ import org.apache.lucene.analysis.PorterStemFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.util.Version;
 import org.dspace.core.ConfigurationManager;
 
 /**
@@ -47,7 +48,7 @@ public class DSAnalyzer extends Analyzer
     /*
      * Stop table
      */
-    protected static final Set stopSet = StopFilter.makeStopSet(STOP_WORDS);
+    protected static final Set stopSet = StopFilter.makeStopSet(Version.LUCENE_33,STOP_WORDS);
 
     /*
      * Create a token stream for this analyzer.
@@ -59,7 +60,7 @@ public class DSAnalyzer extends Analyzer
         result = new StandardFilter(result);
         result = new LowerCaseFilter(result);
-        result = new StopFilter(result, stopSet);
+        result = new StopFilter(Version.LUCENE_33, result, stopSet);
         result = new PorterStemFilter(result);
 
         return result;
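This hunk is the core Lucene 3.x analysis migration: StopFilter (like most 3.x analysis components) now takes a Version constant so tokenization behavior stays pinned across library upgrades. A minimal self-contained sketch of the same pattern outside DSpace (the class name and stop words are illustrative, not part of this commit):

    import java.io.Reader;
    import java.util.Set;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.StopFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.util.Version;

    public class VersionedAnalyzer extends Analyzer {
        // makeStopSet now takes the match version as its first argument
        private static final Set<?> STOP_SET =
                StopFilter.makeStopSet(Version.LUCENE_33, "a", "an", "the");

        @Override
        public TokenStream tokenStream(String fieldName, Reader reader) {
            TokenStream result = new StandardTokenizer(Version.LUCENE_33, reader);
            result = new LowerCaseFilter(result);
            // StopFilter likewise gains a leading Version parameter in 3.x
            result = new StopFilter(Version.LUCENE_33, result, STOP_SET);
            return result;
        }
    }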

File: org/dspace/search/DSIndexer.java

@@ -39,8 +39,12 @@ import org.apache.lucene.document.Field;
 import org.apache.lucene.document.DateTools;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.Version;
 import org.dspace.content.Bitstream;
 import org.dspace.content.Bundle;
 import org.dspace.content.Collection;
@@ -208,21 +212,22 @@ public class DSIndexer
         /*
          * Create the index directory if it doesn't already exist.
         */
-        if (!IndexReader.indexExists(indexDirectory))
-        {
-            try
-            {
-                if (!new File(indexDirectory).mkdirs())
-                {
-                    log.error("Unable to create index directory: " + indexDirectory);
-                }
-                openIndex(true).close();
-            }
-            catch (IOException e)
-            {
-                throw new IllegalStateException("Could not create search index: " + e.getMessage(),e);
-            }
-        }
+        try
+        {
+            if (!IndexReader.indexExists(FSDirectory.open(new File(indexDirectory))))
+            {
+                if (!new File(indexDirectory).mkdirs())
+                {
+                    log.error("Unable to create index directory: " + indexDirectory);
+                }
+                openIndex(true).close();
+            }
+        }
+        catch (IOException e)
+        {
+            throw new IllegalStateException("Could not create search index: " + e.getMessage(),e);
+        }
     }
 
     public static void setBatchProcessingMode(boolean mode)
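The restructured block above follows from an API change: in Lucene 3.x, IndexReader.indexExists takes a Directory rather than a path String and declares IOException, so the existence check itself must sit inside the try. A hedged standalone sketch of the check (class, method, and path handling are illustrative):

    import java.io.File;
    import java.io.IOException;

    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public final class IndexCheck {
        // Returns true if a Lucene index already exists at the given path
        public static boolean indexExists(String path) {
            try {
                // FSDirectory.open picks a suitable Directory implementation per platform
                Directory dir = FSDirectory.open(new File(path));
                return IndexReader.indexExists(dir);
            } catch (IOException e) {
                return false; // an unreadable location is treated as "no index yet"
            }
        }
    }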
@@ -902,8 +907,15 @@ public class DSIndexer
     private static IndexWriter openIndex(boolean wipeExisting)
             throws IOException
     {
-        IndexWriter writer = new IndexWriter(indexDirectory, getAnalyzer(), wipeExisting);
+        Directory dir = FSDirectory.open(new File(indexDirectory));
+        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_33, getAnalyzer());
+        if(wipeExisting){
+            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
+        }else{
+            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
+        }
+        IndexWriter writer = new IndexWriter(dir, iwc);
 
         /* Set maximum number of terms to index if present in dspace.cfg */
         if (maxfieldlength == -1)
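The new openIndex body above is the standard replacement for the old path-and-boolean IndexWriter constructors, which are gone in Lucene 3.x: writer settings move into an IndexWriterConfig (introduced in 3.1), and the boolean "create" flag becomes an OpenMode. A compact sketch of the same construction (the analyzer choice and class name are illustrative):

    import java.io.File;
    import java.io.IOException;

    import org.apache.lucene.analysis.SimpleAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;

    public final class WriterFactory {
        public static IndexWriter open(String path, boolean wipeExisting) throws IOException {
            Directory dir = FSDirectory.open(new File(path));
            IndexWriterConfig iwc =
                    new IndexWriterConfig(Version.LUCENE_33, new SimpleAnalyzer(Version.LUCENE_33));
            // CREATE truncates any existing index; CREATE_OR_APPEND opens or creates as needed
            iwc.setOpenMode(wipeExisting
                    ? IndexWriterConfig.OpenMode.CREATE
                    : IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
            return new IndexWriter(dir, iwc);
        }
    }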
@@ -982,8 +994,8 @@ public class DSIndexer
         if (name != null)
         {
-            doc.add(new Field("name", name, Field.Store.NO, Field.Index.TOKENIZED));
-            doc.add(new Field("default", name, Field.Store.NO, Field.Index.TOKENIZED));
+            doc.add(new Field("name", name, Field.Store.NO, Field.Index.ANALYZED));
+            doc.add(new Field("default", name, Field.Store.NO, Field.Index.ANALYZED));
         }
 
         return doc;
@@ -1008,8 +1020,8 @@ public class DSIndexer
         if (name != null)
         {
-            doc.add(new Field("name", name, Field.Store.NO, Field.Index.TOKENIZED));
-            doc.add(new Field("default", name, Field.Store.NO, Field.Index.TOKENIZED));
+            doc.add(new Field("name", name, Field.Store.NO, Field.Index.ANALYZED));
+            doc.add(new Field("default", name, Field.Store.NO, Field.Index.ANALYZED));
         }
 
         return doc;
@@ -1062,12 +1074,12 @@ public class DSIndexer
                     doc.add( new Field(indexConfigArr[i].indexName,
                                        DateTools.dateToString(d, DateTools.Resolution.SECOND),
                                        Field.Store.NO,
-                                       Field.Index.UN_TOKENIZED));
+                                       Field.Index.NOT_ANALYZED));
                     doc.add( new Field(indexConfigArr[i].indexName + ".year",
                                        DateTools.dateToString(d, DateTools.Resolution.YEAR),
                                        Field.Store.NO,
-                                       Field.Index.UN_TOKENIZED));
+                                       Field.Index.NOT_ANALYZED));
                 }
             }
             else if ("date".equalsIgnoreCase(indexConfigArr[i].type))
@@ -1078,12 +1090,12 @@ public class DSIndexer
                     doc.add( new Field(indexConfigArr[i].indexName,
                                        DateTools.dateToString(d, DateTools.Resolution.DAY),
                                        Field.Store.NO,
-                                       Field.Index.UN_TOKENIZED));
+                                       Field.Index.NOT_ANALYZED));
                    doc.add( new Field(indexConfigArr[i].indexName + ".year",
                                        DateTools.dateToString(d, DateTools.Resolution.YEAR),
                                        Field.Store.NO,
-                                       Field.Index.UN_TOKENIZED));
+                                       Field.Index.NOT_ANALYZED));
                 }
             }
             else
@@ -1099,7 +1111,7 @@ public class DSIndexer
                         doc.add( new Field(indexConfigArr[i].indexName+"_authority",
                                            mydc[j].authority,
                                            Field.Store.NO,
-                                           Field.Index.UN_TOKENIZED));
+                                           Field.Index.NOT_ANALYZED));
 
                         boolean valueAlreadyIndexed = false;
                         if (variants != null)
@@ -1110,7 +1122,7 @@ public class DSIndexer
                                 doc.add( new Field(indexConfigArr[i].indexName,
                                                    var,
                                                    Field.Store.NO,
-                                                   Field.Index.TOKENIZED));
+                                                   Field.Index.ANALYZED));
                                 if (var.equals(mydc[j].value))
                                 {
                                     valueAlreadyIndexed = true;
@@ -1121,7 +1133,7 @@ public class DSIndexer
                             doc.add( new Field("default",
                                                var,
                                                Field.Store.NO,
-                                               Field.Index.TOKENIZED));
+                                               Field.Index.ANALYZED));
                         }
                     }
                 }
@@ -1132,7 +1144,7 @@ public class DSIndexer
                         doc.add( new Field(indexConfigArr[i].indexName,
                                            mydc[j].value,
                                            Field.Store.NO,
-                                           Field.Index.TOKENIZED));
+                                           Field.Index.ANALYZED));
                     }
                 }
                 else
@@ -1141,11 +1153,11 @@ public class DSIndexer
                         doc.add( new Field(indexConfigArr[i].indexName,
                                            mydc[j].value,
                                            Field.Store.NO,
-                                           Field.Index.TOKENIZED));
+                                           Field.Index.ANALYZED));
                     }
                 }
 
-                doc.add( new Field("default", mydc[j].value, Field.Store.NO, Field.Index.TOKENIZED));
+                doc.add( new Field("default", mydc[j].value, Field.Store.NO, Field.Index.ANALYZED));
             }
         }
     }
@@ -1164,7 +1176,7 @@ public class DSIndexer
             if (dcv.length > 0)
             {
                 String value = OrderFormat.makeSortString(dcv[0].value, dcv[0].language, so.getType());
-                doc.add( new Field("sort_" + so.getName(), value, Field.Store.NO, Field.Index.UN_TOKENIZED) );
+                doc.add( new Field("sort_" + so.getName(), value, Field.Store.NO, Field.Index.NOT_ANALYZED) );
             }
         }
     }
@@ -1230,15 +1242,15 @@ public class DSIndexer
         // want to be able to check when last updated
         // (not tokenized, but it is indexed)
-        doc.add(new Field(LAST_INDEXED_FIELD, Long.toString(System.currentTimeMillis()), Field.Store.YES, Field.Index.UN_TOKENIZED));
-        doc.add(new Field(DOCUMENT_STATUS_FIELD, "archived", Field.Store.YES, Field.Index.UN_TOKENIZED));
+        doc.add(new Field(LAST_INDEXED_FIELD, Long.toString(System.currentTimeMillis()), Field.Store.YES, Field.Index.NOT_ANALYZED));
+        doc.add(new Field(DOCUMENT_STATUS_FIELD, "archived", Field.Store.YES, Field.Index.NOT_ANALYZED));
 
         // KEPT FOR BACKWARDS COMPATIBILITY
         // do location, type, handle first
         doc.add(new Field("type", Integer.toString(type), Field.Store.YES, Field.Index.NO));
 
         // New fields to weaken the dependence on handles, and allow for faster list display
-        doc.add(new Field("search.resourcetype", Integer.toString(type), Field.Store.YES, Field.Index.UN_TOKENIZED));
+        doc.add(new Field("search.resourcetype", Integer.toString(type), Field.Store.YES, Field.Index.NOT_ANALYZED));
         doc.add(new Field("search.resourceid", Integer.toString(id), Field.Store.YES, Field.Index.NO));
 
         // want to be able to search for handle, so use keyword
@@ -1246,20 +1258,20 @@ public class DSIndexer
         if (handle != null)
         {
             // ??? not sure what the "handletext" field is but it was there in writeItemIndex ???
-            doc.add(new Field("handletext", handle, Field.Store.YES, Field.Index.TOKENIZED));
+            doc.add(new Field("handletext", handle, Field.Store.YES, Field.Index.ANALYZED));
 
             // want to be able to search for handle, so use keyword
             // (not tokenized, but it is indexed)
-            doc.add(new Field("handle", handle, Field.Store.YES, Field.Index.UN_TOKENIZED));
+            doc.add(new Field("handle", handle, Field.Store.YES, Field.Index.NOT_ANALYZED));
 
             // add to full text index
-            doc.add(new Field("default", handle, Field.Store.NO, Field.Index.TOKENIZED));
+            doc.add(new Field("default", handle, Field.Store.NO, Field.Index.ANALYZED));
         }
 
         if(location != null)
         {
-            doc.add(new Field("location", location, Field.Store.NO, Field.Index.TOKENIZED));
-            doc.add(new Field("default", location, Field.Store.NO, Field.Index.TOKENIZED));
+            doc.add(new Field("location", location, Field.Store.NO, Field.Index.ANALYZED));
+            doc.add(new Field("default", location, Field.Store.NO, Field.Index.ANALYZED));
         }
 
         return doc;
@@ -1271,8 +1283,8 @@ public class DSIndexer
         // want to be able to check when last updated
         // (not tokenized, but it is indexed)
-        doc.add(new Field(LAST_INDEXED_FIELD, Long.toString(System.currentTimeMillis()), Field.Store.YES, Field.Index.UN_TOKENIZED));
-        doc.add(new Field(DOCUMENT_STATUS_FIELD, "deleted", Field.Store.YES, Field.Index.UN_TOKENIZED));
+        doc.add(new Field(LAST_INDEXED_FIELD, Long.toString(System.currentTimeMillis()), Field.Store.YES, Field.Index.NOT_ANALYZED));
+        doc.add(new Field(DOCUMENT_STATUS_FIELD, "deleted", Field.Store.YES, Field.Index.NOT_ANALYZED));
 
         // Do not add any other fields, as we don't want to be able to find it - just check the last indexed time
@@ -1285,8 +1297,8 @@ public class DSIndexer
         // want to be able to check when last updated
         // (not tokenized, but it is indexed)
-        doc.add(new Field(LAST_INDEXED_FIELD, Long.toString(System.currentTimeMillis()), Field.Store.YES, Field.Index.UN_TOKENIZED));
-        doc.add(new Field(DOCUMENT_STATUS_FIELD, "withdrawn", Field.Store.YES, Field.Index.UN_TOKENIZED));
+        doc.add(new Field(LAST_INDEXED_FIELD, Long.toString(System.currentTimeMillis()), Field.Store.YES, Field.Index.NOT_ANALYZED));
+        doc.add(new Field(DOCUMENT_STATUS_FIELD, "withdrawn", Field.Store.YES, Field.Index.NOT_ANALYZED));
 
         // Do not add any other fields, as we don't want to be able to find it - just check the last indexed time
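The bulk of this file's changes are mechanical renames: Field.Index.TOKENIZED became Field.Index.ANALYZED and Field.Index.UN_TOKENIZED became Field.Index.NOT_ANALYZED (renamed around Lucene 2.4, with the old names dropped by 3.0); indexing semantics are unchanged. For reference, a minimal sketch using the new names (field names and values are illustrative):

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;

    public final class FieldNaming {
        public static Document example() {
            Document doc = new Document();
            // NOT_ANALYZED (formerly UN_TOKENIZED): indexed as one exact term, good for IDs
            doc.add(new Field("handle", "123456789/1", Field.Store.YES, Field.Index.NOT_ANALYZED));
            // ANALYZED (formerly TOKENIZED): run through the analyzer, good for full text
            doc.add(new Field("default", "some full text to search", Field.Store.NO, Field.Index.ANALYZED));
            return doc;
        }
    }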

File: org/dspace/search/DSNonStemmingAnalyzer.java

@@ -13,6 +13,7 @@ import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.util.Version;
 
 /**
  * Custom Lucene Analyzer that combines the standard filter, lowercase filter
@@ -32,7 +33,7 @@ public class DSNonStemmingAnalyzer extends DSAnalyzer
         result = new StandardFilter(result);
         result = new LowerCaseFilter(result);
-        result = new StopFilter(result, stopSet);
+        result = new StopFilter(Version.LUCENE_33, result, stopSet);
 
         return result;
     }

File: org/dspace/search/DSQuery.java

@@ -7,6 +7,7 @@
  */
 package org.dspace.search;
 
+import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Iterator;
@@ -19,12 +20,14 @@ import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.queryParser.QueryParser;
 import org.apache.lucene.queryParser.TokenMgrError;
 import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.Hits;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.search.Searcher;
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.Version;
 import org.dspace.content.Collection;
 import org.dspace.content.Community;
 import org.dspace.core.ConfigurationManager;
@@ -113,9 +116,9 @@ public class DSQuery
         try
         {
             // grab a searcher, and do the search
-            Searcher searcher = getSearcher(c);
+            IndexSearcher searcher = getSearcher(c);
 
-            QueryParser qp = new QueryParser("default", DSIndexer.getAnalyzer());
+            QueryParser qp = new QueryParser(Version.LUCENE_33, "default", DSIndexer.getAnalyzer());
             log.debug("Final query string: " + querystring);
 
             if (operator == null || operator.equals("OR"))
@@ -126,57 +129,30 @@ public class DSQuery
             {
                 qp.setDefaultOperator(QueryParser.AND_OPERATOR);
             }
 
-            Query myquery = qp.parse(querystring);
-            Hits hits = null;
-
-            try
-            {
-                if (args.getSortOption() == null)
-                {
-                    SortField[] sortFields = new SortField[] {
-                            new SortField("search.resourcetype", true),
-                            new SortField(null, SortField.SCORE, SortOption.ASCENDING.equals(args.getSortOrder()))
-                        };
-                    hits = searcher.search(myquery, new Sort(sortFields));
-                }
-                else
-                {
-                    SortField[] sortFields = new SortField[] {
-                            new SortField("search.resourcetype", true),
-                            new SortField("sort_" + args.getSortOption().getName(), SortOption.DESCENDING.equals(args.getSortOrder())),
-                            SortField.FIELD_SCORE
-                        };
-                    hits = searcher.search(myquery, new Sort(sortFields));
-                }
-            }
-            catch (Exception e)
-            {
-                // Lucene can throw an exception if it is unable to determine a sort time from the specified field
-                // Provide a fall back that just works on relevancy.
-                log.error("Unable to use speficied sort option: " + (args.getSortOption() == null ? "type/relevance": args.getSortOption().getName()));
-                hits = searcher.search(myquery, new Sort(SortField.FIELD_SCORE));
-            }
+            Query myquery = qp.parse(querystring);
+
+            //Retrieve enough docs to get all the results we need !
+            TopDocs hits = performQuery(args, searcher, myquery, args.getPageSize() * (args.getStart() + 1));
 
             // set total number of hits
-            qr.setHitCount(hits.length());
+            qr.setHitCount(hits.totalHits);
 
             // We now have a bunch of hits - snip out a 'window'
             // defined in start, count and return the handles
             // from that window
             // first, are there enough hits?
-            if (args.getStart() < hits.length())
+            if (args.getStart() < hits.totalHits)
             {
                 // get as many as we can, up to the window size
                 // how many are available after snipping off at offset 'start'?
-                int hitsRemaining = hits.length() - args.getStart();
+                int hitsRemaining = hits.totalHits - args.getStart();
 
                 int hitsToProcess = (hitsRemaining < args.getPageSize()) ? hitsRemaining
                         : args.getPageSize();
 
                 for (int i = args.getStart(); i < (args.getStart() + hitsToProcess); i++)
                 {
-                    Document d = hits.doc(i);
+                    Document d = searcher.doc(hits.scoreDocs[i].doc);
 
                     String resourceId = d.get("search.resourceid");
                     String resourceType = d.get("search.resourcetype");
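The driver of this rewrite is that the Hits class, which lazily paged through results, was removed in Lucene 3.0; its replacement asks up front for the top-n results and exposes them as TopDocs/ScoreDoc, hence the precomputed cap of pageSize * (start + 1) above. A standalone sketch of the windowing idiom (index path, field name, and analyzer are illustrative):

    import java.io.File;
    import java.io.IOException;

    import org.apache.lucene.analysis.SimpleAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.queryParser.ParseException;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;

    public final class PagedSearch {
        public static void printPage(String indexPath, String queryString, int start, int pageSize)
                throws IOException, ParseException {
            IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File(indexPath)));
            try {
                QueryParser qp = new QueryParser(Version.LUCENE_33, "default",
                        new SimpleAnalyzer(Version.LUCENE_33));
                Query query = qp.parse(queryString);

                // TopDocs is eager: request enough hits to cover the window we want
                TopDocs hits = searcher.search(query, start + pageSize);

                // scoreDocs holds at most (start + pageSize) entries even if totalHits is larger
                int end = Math.min(start + pageSize, hits.scoreDocs.length);
                for (int i = start; i < end; i++) {
                    Document d = searcher.doc(hits.scoreDocs[i].doc);
                    System.out.println(d.get("handle"));
                }
            } finally {
                searcher.close();
            }
        }
    }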
@@ -187,15 +163,15 @@ public class DSQuery
                     switch (Integer.parseInt( resourceType != null ? resourceType : handleType))
                     {
                     case Constants.ITEM:
-                        hitTypes.add(Integer.valueOf(Constants.ITEM));
+                        hitTypes.add(Constants.ITEM);
                         break;
 
                     case Constants.COLLECTION:
-                        hitTypes.add(Integer.valueOf(Constants.COLLECTION));
+                        hitTypes.add(Constants.COLLECTION);
                         break;
 
                     case Constants.COMMUNITY:
-                        hitTypes.add(Integer.valueOf(Constants.COMMUNITY));
+                        hitTypes.add(Constants.COMMUNITY);
                         break;
                     }
@@ -230,6 +206,38 @@ public class DSQuery
         return qr;
     }
 
+    private static TopDocs performQuery(QueryArgs args, IndexSearcher searcher, Query myquery, int max) throws IOException {
+        TopDocs hits;
+        try
+        {
+            if (args.getSortOption() == null)
+            {
+                SortField[] sortFields = new SortField[] {
+                        new SortField("search.resourcetype", SortField.INT, true),
+                        new SortField(null, SortField.SCORE, SortOption.ASCENDING.equals(args.getSortOrder()))
+                    };
+                hits = searcher.search(myquery, max, new Sort(sortFields));
+            }
+            else
+            {
+                SortField[] sortFields = new SortField[] {
+                        new SortField("search.resourcetype", SortField.INT, true),
+                        new SortField("sort_" + args.getSortOption().getName(), SortField.STRING, SortOption.DESCENDING.equals(args.getSortOrder())),
+                        SortField.FIELD_SCORE
+                    };
+                hits = searcher.search(myquery, max, new Sort(sortFields));
+            }
+        }
+        catch (Exception e)
+        {
+            // Lucene can throw an exception if it is unable to determine a sort time from the specified field
+            // Provide a fall back that just works on relevancy.
+            log.error("Unable to use speficied sort option: " + (args.getSortOption() == null ? "type/relevance": args.getSortOption().getName()));
+            hits = searcher.search(myquery, max, new Sort(SortField.FIELD_SCORE));
+        }
+        return hits;
+    }
+
     static String checkEmptyQuery(String myquery)
     {
         if (myquery == null || myquery.equals("()") || myquery.equals(""))
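Note that the SortField constructors in performQuery above now carry an explicit type (SortField.INT, SortField.STRING): Lucene 3.x deprecates auto-detected sort types, and a wrong or missing type surfaces as exactly the kind of runtime failure the catch-all fallback guards against. A small sketch of a typed, multi-key sort (the sort_title field is hypothetical):

    import org.apache.lucene.search.Sort;
    import org.apache.lucene.search.SortField;

    public final class TypedSorts {
        // Resource type descending, then title ascending, then relevance as a tie-breaker
        public static Sort typeThenTitle() {
            return new Sort(new SortField[] {
                    new SortField("search.resourcetype", SortField.INT, true),
                    new SortField("sort_title", SortField.STRING, false),
                    SortField.FIELD_SCORE
            });
        }
    }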
@@ -359,7 +367,7 @@ public class DSQuery
         {
             String thisHandle = (String) i.next();
             Integer thisType = (Integer) j.next();
-            String type = Constants.typeText[thisType.intValue()];
+            String type = Constants.typeText[thisType];
 
             // also look up type
             System.out.println(type + "\t" + thisHandle);
@@ -421,7 +429,10 @@ public class DSQuery
         // If we have already opened a searcher, check to see if the index has been updated
         // If it has, we need to close the existing searcher - we will open a new one later
-        if (searcher != null && lastModified != IndexReader.getCurrentVersion(indexDir))
+        Directory searchDir = FSDirectory.open(new File(indexDir));
+
+        if (searcher != null && lastModified != IndexReader.getCurrentVersion(searchDir))
         {
             try
             {
@@ -445,17 +456,18 @@ public class DSQuery
         if (searcher == null)
         {
             // So, open a new searcher
-            lastModified = IndexReader.getCurrentVersion(indexDir);
+            lastModified = IndexReader.getCurrentVersion(searchDir);
             String osName = System.getProperty("os.name");
             if (osName != null && osName.toLowerCase().contains("windows"))
             {
-                searcher = new IndexSearcher(indexDir){
+                searcher = new IndexSearcher(searchDir){
                     /*
                      * TODO: Has Lucene fixed this bug yet?
                      * Lucene doesn't release read locks in
                      * windows properly on finalize. Our hack
                      * extend IndexSearcher to force close().
                      */
+                    @Override
                     protected void finalize() throws Throwable {
                         this.close();
                         super.finalize();
@@ -464,7 +476,7 @@ public class DSQuery
             }
             else
            {
-                searcher = new IndexSearcher(indexDir);
+                searcher = new IndexSearcher(searchDir);
             }
         }
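These last hunks complete the Directory migration: IndexReader.getCurrentVersion and the IndexSearcher constructor no longer accept a filesystem path in Lucene 3.x, so getSearcher resolves the path through FSDirectory.open once and reuses it. A condensed sketch of the same cache-and-reopen logic (the class and method names are illustrative, not DSpace code):

    import java.io.File;
    import java.io.IOException;

    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public final class SearcherCache {
        private static IndexSearcher searcher;
        private static long lastVersion = -1;

        // Reopen the cached searcher only when the index version has advanced,
        // mirroring the getCurrentVersion check in getSearcher above
        public static synchronized IndexSearcher get(String indexPath) throws IOException {
            Directory dir = FSDirectory.open(new File(indexPath));
            long current = IndexReader.getCurrentVersion(dir);
            if (searcher == null || current != lastVersion) {
                if (searcher != null) {
                    searcher.close();
                }
                searcher = new IndexSearcher(dir);
                lastVersion = current;
            }
            return searcher;
        }
    }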