[DS-440] Adjust SolrLogger and the rest of the Statistics system to support processing multiple statistics files. Preempt logging of spider IPs, and prune spider IPs from Solr with utility methods.
git-svn-id: http://scm.dspace.org/svn/repo/dspace/trunk@4745 9c30dcfa-912a-0410-8fc2-9e0234be79fd
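Below is a hedged usage sketch of the spider-maintenance utilities this commit introduces on SolrLogger (markRobotsByIP, markRobotByUserAgent and deleteRobotsByIsBotFlag all appear in the diff). The wrapper class, the "msnbot" agent string, and the idea of running the calls as one sequence are illustrative assumptions, not part of the commit.

import org.dspace.statistics.SolrLogger;

public class SpiderCleanupSketch {
    public static void main(String[] args) {
        // Retroactively flag hits from known spider IPs as bots (sets isBot:true).
        SolrLogger.markRobotsByIP();

        // Flag hits from a specific user agent; "msnbot" is just an example value.
        SolrLogger.markRobotByUserAgent("msnbot");

        // Prune everything flagged as a bot from the statistics core.
        SolrLogger.deleteRobotsByIsBotFlag();
    }
}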
@@ -10,18 +10,8 @@
 */
package org.dspace.statistics;

import java.io.File;
import java.io.IOException;
import java.sql.SQLException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Vector;

import com.maxmind.geoip.Location;
import com.maxmind.geoip.LookupService;
import org.apache.commons.lang.time.DateFormatUtils;
import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrQuery;
@@ -33,21 +23,20 @@ import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.MapSolrParams;
import org.dspace.content.Bitstream;
import org.dspace.content.Bundle;
import org.dspace.content.*;
import org.dspace.content.Collection;
import org.dspace.content.Community;
import org.dspace.content.DCValue;
import org.dspace.content.DSpaceObject;
import org.dspace.content.Item;
import org.dspace.core.ConfigurationManager;
import org.dspace.eperson.EPerson;
import org.dspace.statistics.util.DnsLookup;
import org.dspace.statistics.util.LocationUtils;
import org.dspace.statistics.util.SpiderDetector;

import com.maxmind.geoip.Location;
import com.maxmind.geoip.LookupService;
import javax.servlet.http.HttpServletRequest;
import java.io.IOException;
import java.sql.SQLException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;

/**
 * Static SolrLogger used to hold HttpSolrClient connection pool to issue
@@ -55,6 +44,7 @@ import com.maxmind.geoip.LookupService;
 *
 * @author ben at atmire.com
 * @author kevinvandevelde at atmire.com
 * @author mdiggory at atmire.com
 */
public class SolrLogger
{
@@ -69,8 +59,6 @@ public class SolrLogger

    private static final LookupService locationService;

    private static final Vector<String> spiderIps;

    private static final boolean useProxies;

    private static Map metadataStorageInfo;
@@ -80,7 +68,6 @@ public class SolrLogger
        log.info("solr.spidersfile:" + ConfigurationManager.getProperty("solr.spidersfile"));
        log.info("solr.log.server:" + ConfigurationManager.getProperty("solr.log.server"));
        log.info("solr.dbfile:" + ConfigurationManager.getProperty("solr.dbfile"));
        log.info("spiders file:" + ConfigurationManager.getProperty("solr.spidersfile"));

        CommonsHttpSolrServer server = null;

@@ -99,18 +86,7 @@ public class SolrLogger
        solr = server;

        // Read in the file so we don't have to do it all the time
        Vector<String> spiderIpsLoc;
        String filePath = ConfigurationManager.getProperty("solr.spidersfile");
        try
        {
            spiderIpsLoc = SpiderDetector.readIpAddresses(new File(filePath));
        }
        catch (Exception e)
        {
            spiderIpsLoc = new Vector<String>();
            e.printStackTrace(); // Should never happen
        }
        spiderIps = spiderIpsLoc;
        //spiderIps = SpiderDetector.getSpiderIpAddresses();

        LookupService service = null;
        // Get the db file for the location
@@ -154,16 +130,47 @@ public class SolrLogger
        }
    }
    public static void post(DSpaceObject dspaceObject, String ip,
    public static void post(DSpaceObject dspaceObject, HttpServletRequest request,
            EPerson currentUser)
    {
        if (solr == null || locationService == null)
            return;

        boolean isSpiderBot = SpiderDetector.isSpider(request);

        try
        {
            if(isSpiderBot &&
                    !ConfigurationManager.getBooleanProperty("solr.statistics.logBots",true))
            {
                return;
            }

            SolrInputDocument doc1 = new SolrInputDocument();
            // Save our basic info that we already have

            String ip = request.getRemoteAddr();

            if(isUseProxies() && request.getHeader("X-Forwarded-For") != null)
            {
                /* This header is a comma delimited list */
                for(String xfip : request.getHeader("X-Forwarded-For").split(","))
                {
                    /* The proxy itself will sometimes populate this header with the
                       same value as the remote address. Ordering in the spec is vague;
                       we'll just take the last entry not equal to the proxy.
                    */
                    if(!request.getHeader("X-Forwarded-For").contains(ip))
                    {
                        ip = xfip.trim();
                    }
                }
            }

            doc1.addField("ip", ip);

            doc1.addField("id", dspaceObject.getID());
            doc1.addField("type", dspaceObject.getType());
            // Save the current time
@@ -203,7 +210,12 @@ public class SolrLogger
            doc1.addField("city", location.city);
            doc1.addField("latitude", location.latitude);
            doc1.addField("longitude", location.longitude);
            doc1.addField("isBot",isSpiderBot);

            if(request.getHeader("User-Agent") != null)
                doc1.addField("userAgent", request.getHeader("User-Agent"));
        }

        if (dspaceObject instanceof Item)
        {
            Item item = (Item) dspaceObject;
@@ -226,6 +238,7 @@ public class SolrLogger
            }
        }

        storeParents(doc1, dspaceObject);

        solr.add(doc1);
@@ -355,6 +368,136 @@ public class SolrLogger
        return currentValsStored;
    }
    public static class ResultProcessor
    {

        public void execute(String query) throws SolrServerException, IOException {
            Map<String, String> params = new HashMap<String, String>();
            params.put("q", query);
            params.put("rows", "10");
            MapSolrParams solrParams = new MapSolrParams(params);
            QueryResponse response = solr.query(solrParams);

            long numbFound = response.getResults().getNumFound();

            // process the first batch
            process(response.getResults());

            // Run over the rest
            for (int i = 10; i < numbFound; i += 10)
            {
                params.put("start", String.valueOf(i));
                solrParams = new MapSolrParams(params);
                response = solr.query(solrParams);
                process(response.getResults());
            }

        }

        public void commit() throws IOException, SolrServerException {
            solr.commit();
        }

        /**
         * Override to manage pages of documents
         * @param docs
         */
        public void process(List<SolrDocument> docs) throws IOException, SolrServerException {
            for(SolrDocument doc : docs){
                process(doc);
            }
        }

        /**
         * Override to manage individual documents
         * @param doc
         */
        public void process(SolrDocument doc) throws IOException, SolrServerException {

        }
    }
    public static void markRobotsByIP()
    {
        for(String ip : SpiderDetector.getSpiderIpAddresses()){

            try {

                /* Result Processor to alter records so they are identified as bots */
                ResultProcessor processor = new ResultProcessor(){
                    public void process(SolrDocument doc) throws IOException, SolrServerException {
                        doc.removeFields("isBot");
                        doc.addField("isBot", true);
                        SolrInputDocument newInput = ClientUtils.toSolrInputDocument(doc);
                        solr.add(newInput);
                    }
                };

                /* query for ip, exclude results previously set as bots. */
                processor.execute("ip:"+ip+ "* AND NOT isBot:true");

                solr.commit();

            } catch (Exception e) {
                log.error(e.getMessage(),e);
            }

        }
    }

    public static void markRobotByUserAgent(String agent){
        try {

            /* Result Processor to alter records so they are identified as bots */
            ResultProcessor processor = new ResultProcessor(){
                public void process(SolrDocument doc) throws IOException, SolrServerException {
                    doc.removeFields("isBot");
                    doc.addField("isBot", true);
                    SolrInputDocument newInput = ClientUtils.toSolrInputDocument(doc);
                    solr.add(newInput);
                }
            };

            /* query for the user agent, exclude results previously set as bots. */
            processor.execute("userAgent:"+agent+ " AND NOT isBot:true");

            solr.commit();
        } catch (Exception e) {
            log.error(e.getMessage(),e);
        }
    }

    public static void deleteRobotsByIsBotFlag()
    {
        try {
            solr.deleteByQuery("isBot:true");
        } catch (Exception e) {
            log.error(e.getMessage(),e);
        }
    }

    public static void deleteIP(String ip)
    {
        try {
            solr.deleteByQuery("ip:"+ip + "*");
        } catch (Exception e) {
            log.error(e.getMessage(),e);
        }
    }

    public static void deleteRobotsByIP()
    {
        for(String ip : SpiderDetector.getSpiderIpAddresses()){
            deleteIP(ip);
        }
    }

    /*
     * //TODO: below are not used public static void
     * update(String query, boolean addField, String fieldName, Object
@@ -372,25 +515,18 @@ public class SolrLogger
        // We need to get our documents
        // QueryResponse queryResponse = solr.query()//query(query, null, -1,
        // null, null, null);
        Map<String, String> params = new HashMap<String, String>();
        params.put("q", query);
        params.put("rows", "10");
        MapSolrParams solrParams = new MapSolrParams(params);
        QueryResponse response = solr.query(solrParams);

        long numbFound = response.getResults().getNumFound();
        List<SolrDocument> docsToUpdate = new ArrayList<SolrDocument>();
        docsToUpdate.addAll(response.getResults());
        final List<SolrDocument> docsToUpdate = new ArrayList<SolrDocument>();

        // Run over the rest
        for (int i = 10; i < numbFound; i += 10)
        {
            params.put("start", String.valueOf(i));
            solrParams = new MapSolrParams(params);
            response = solr.query(solrParams);
            docsToUpdate.addAll(response.getResults());
        }
        // We have all the docs, delete the ones we don't need
        ResultProcessor processor = new ResultProcessor(){
            public void process(List<SolrDocument> docs) throws IOException, SolrServerException {
                docsToUpdate.addAll(docs);
            }
        };

        processor.execute(query);

        // We have all the docs, delete the ones we don't need
        solr.deleteByQuery(query);

        // Add the new (updated) ones
@@ -680,8 +816,17 @@ public class SolrLogger

        // A filter is used instead of a regular query to improve
        // performance and ensure the search result ordering will
        // not be influenced
        solrQuery.addFilterQuery(getIgnoreSpiders());
        // not be influenced

        // Choose to filter by the legacy spider IP list (it may get too long to properly filter all IPs)
        if(ConfigurationManager.getBooleanProperty("solr.statistics.query.filter.spiderIp",false))
            solrQuery.addFilterQuery(getIgnoreSpiderIPs());

        // Choose to filter by the isBot field; may be overridden in future
        // to allow views on stats based on bots.
        if(ConfigurationManager.getBooleanProperty("solr.statistics.query.filter.isBot",true))
            solrQuery.addFilterQuery("-isBot:true");

        if (filterQuery != null)
            solrQuery.addFilterQuery(filterQuery);

@@ -699,20 +844,32 @@ public class SolrLogger
        return response;
    }

    /** String of IPs and ranges in the IPTable, as a Solr query */
    private static String filterQuery = null;

    /**
     * Returns in a query string all the ip addresses that should be ignored
     *
     * Returns in a filterQuery string all the ip addresses that should be ignored
     *
     * @return a string query with ip addresses
     */
    private static String getIgnoreSpiders()
    {
        String query = "";
        for (int i = 0; i < spiderIps.size(); i++)
        {
            String ip = spiderIps.elementAt(i);
    public static String getIgnoreSpiderIPs() {
        if (filterQuery == null) {
            String query = "";
            boolean first = true;
            for (String ip : SpiderDetector.getSpiderIpAddresses()) {
                if (first) {
                    query += " AND ";
                    first = false;
                }

                query += (i != 0 ? " AND " : "") + "NOT(ip: " + ip + ")";
                query += " NOT(ip: " + ip + ")";
            }
            filterQuery = query;
        }
        return query;

        return filterQuery;

    }

}
@@ -13,6 +13,7 @@ package org.dspace.statistics;
import org.apache.log4j.Logger;
import org.dspace.eperson.EPerson;
import org.dspace.services.model.Event;
import org.dspace.statistics.util.SpiderDetector;
import org.dspace.usage.AbstractUsageEventListener;
import org.dspace.usage.UsageEvent;

@@ -33,21 +34,12 @@ public class SolrLoggerUsageEventListener extends AbstractUsageEventListener {
        {
            try{

                UsageEvent ue = (UsageEvent)event;
                UsageEvent ue = (UsageEvent)event;

                String ip = null;

                if(SolrLogger.isUseProxies())
                    ip = ue.getRequest().getHeader("X-Forwarded-For");

                if(ip == null || ip.equals(""))
                    ip = ue.getRequest().getRemoteAddr();
                EPerson currentUser = ue.getContext() == null ? null : ue.getContext().getCurrentUser();

                EPerson currentUser = ue.getContext() == null ? null : ue.getContext().getCurrentUser();
                SolrLogger.post(ue.getObject(), ue.getRequest(), currentUser);

                SolrLogger.post(ue.getObject(), ip, currentUser);

            }
            catch(Exception e)
            {
@@ -0,0 +1,97 @@
/**
 * $Id: $
 * $URL: $
 * *************************************************************************
 * Copyright (c) 2002-2009, DuraSpace. All rights reserved
 * Licensed under the DuraSpace Foundation License.
 *
 * A copy of the DuraSpace License has been included in this
 * distribution and is available at: http://scm.dspace.org/svn/repo/licenses/LICENSE.txt
 */
package org.dspace.statistics.util;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;

import java.io.*;
import java.util.HashSet;

/**
 * @author Mark Diggory (mdiggory at atmire.com)
 * @author kevinvandevelde at atmire.com
 * @author ben at atmire.com
 */
public class ApacheLogRobotsProcessor {

    /**
     * Creates a file containing spiders based on an Apache logfile
     * by analyzing requesters of the robots.txt file
     *
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        // create an options object and populate it
        CommandLineParser parser = new PosixParser();

        Options options = new Options();
        options.addOption("l", "logfile", true, "type: Input log file");
        options.addOption("s", "spiderfile", true, "type: Spider ip file");

        CommandLine line = parser.parse(options, args);

        String logFileLoc;
        String spiderIpPath;
        if (line.hasOption("l"))
            logFileLoc = line.getOptionValue("l");
        else {
            System.out.println("We need our log file");
            return;
        }
        if (line.hasOption("s"))
            spiderIpPath = line.getOptionValue("s");
        else {
            System.out.println("We need a spider ip output file");
            return;
        }

        File spiderIpFile = new File(spiderIpPath);

        //Get the ips already added in our file
        HashSet<String> logSpiders = new HashSet<String>();
        if (spiderIpFile.exists())
            logSpiders = SpiderDetector.readIpAddresses(spiderIpFile);

        //First read in our log file line per line
        BufferedReader in = new BufferedReader(new FileReader(logFileLoc));
        String logLine;
        while ((logLine = in.readLine()) != null) {
            //Currently we only check whether robots.txt is present in the line
            if (logLine.contains("robots.txt")) {
                //We got a robots.txt request, so we got a bot
                String ip = logLine.substring(0, logLine.indexOf("-")).trim();
                //Only add each ip address once; one occurrence is enough
                logSpiders.add(ip);
            }
        }
        in.close();

        //Last but not least, add the ips to our file
        BufferedWriter output = new BufferedWriter(new FileWriter(spiderIpFile));

        //Write the new ips
        for (String ip : logSpiders) {
            System.out.println("Adding new ip: " + ip);
            //Write each new ip on a separate line
            output.write(ip + "\n");
        }

        output.flush();
        output.close();
    }
}
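A minimal sketch of driving the processor above from Java rather than the command line; the wrapper class and the log and output paths are hypothetical examples, not part of the commit.

import org.dspace.statistics.util.ApacheLogRobotsProcessor;

public class ApacheLogRobotsProcessorSketch {
    public static void main(String[] args) throws Exception {
        // Scan an Apache access log for robots.txt requesters and append
        // their IPs to the spider file (merged with any IPs already there).
        ApacheLogRobotsProcessor.main(new String[]{
                "-l", "/var/log/apache2/access.log", // hypothetical input log
                "-s", "/dspace/config/spiders.txt"   // hypothetical output file
        });
    }
}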
@@ -0,0 +1,184 @@
/**
 * $Id: $
 * $URL: $
 * *************************************************************************
 * Copyright (c) 2002-2009, DuraSpace. All rights reserved
 * Licensed under the DuraSpace Foundation License.
 *
 * A copy of the DuraSpace License has been included in this
 * distribution and is available at: http://scm.dspace.org/svn/repo/licenses/LICENSE.txt
 */
package org.dspace.statistics.util;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * A sparse v4 IPTable implementation that uses nested HashMaps
 * to optimize IP address matching over ranges of IP addresses.
 *
 * @author: mdiggory at atmire.com
 */
public class IPTable {

    /* A lookup tree for IP addresses and subnet ranges */
    private HashMap<String, HashMap<String, HashMap<String, HashSet<String>>>> map =
            new HashMap<String, HashMap<String, HashMap<String, HashSet<String>>>>();

    /**
     * Can be a full v4 IP, subnet or range string
     *
     * @param ip
     */
    public void add(String ip) throws IPFormatException {

        String[] start;

        String[] end;

        String[] range = ip.split("-");

        if (range.length >= 2) {

            start = range[0].trim().split("/")[0].split("\\.");
            end = range[1].trim().split("/")[0].split("\\.");

            if (start.length != 4 || end.length != 4)
                throw new IPFormatException(ip + " - Ranges need to be full IPv4 Addresses");

            if (!(start[0].equals(end[0]) && start[1].equals(end[1]) && start[2].equals(end[2]))) {
                throw new IPFormatException(ip + " - Ranges can only be across the last subnet x.y.z.0 - x.y.z.254");
            }

        } else {
            //need to ignore CIDR notation for the moment.
            //ip = ip.split("\\/")[0];

            String[] subnets = ip.split("\\.");

            if (subnets.length < 3) {
                throw new IPFormatException(ip + " - require at least three subnet places (e.g. 255.255.255.0)");
            }

            start = subnets;
            end = subnets;
        }

        if (start.length >= 3) {

            HashMap<String, HashMap<String, HashSet<String>>> first = map.get(start[0]);

            if (first == null) {
                first = new HashMap<String, HashMap<String, HashSet<String>>>();
                map.put(start[0], first);
            }

            HashMap<String, HashSet<String>> second = first.get(start[1]);

            if (second == null) {
                second = new HashMap<String, HashSet<String>>();
                first.put(start[1], second);
            }

            HashSet<String> third = second.get(start[2]);

            if (third == null) {
                third = new HashSet<String>();
                second.put(start[2], third);
            }

            //now populate the fourth place (* or value 0-254);

            if (start.length == 3) {
                third.add("*");
            }

            if (third.contains("*")) {
                return;
            }

            if (start.length >= 4) {
                int s = Integer.valueOf(start[3]);
                int e = Integer.valueOf(end[3]);
                for (int i = s; i <= e; i++) {
                    third.add(String.valueOf(i));
                }
            }
        }
    }

    public boolean contains(String ip) throws IPFormatException {

        String[] subnets = ip.split("\\.");

        if (subnets.length != 4)
            throw new IPFormatException("needs to be a single IP Address");

        HashMap<String, HashMap<String, HashSet<String>>> first = map.get(subnets[0]);

        if (first == null) return false;

        HashMap<String, HashSet<String>> second = first.get(subnets[1]);

        if (second == null) return false;

        HashSet<String> third = second.get(subnets[2]);

        if (third == null) return false;

        return third.contains(subnets[3]) || third.contains("*");

    }

    /**
     * @return
     */
    public Set<String> toSet() {
        HashSet<String> set = new HashSet<String>();

        for (Map.Entry<String, HashMap<String, HashMap<String, HashSet<String>>>> first : map.entrySet()) {
            String firstString = first.getKey();
            HashMap<String, HashMap<String, HashSet<String>>> secondMap = first.getValue();

            for (Map.Entry<String, HashMap<String, HashSet<String>>> second : secondMap.entrySet()) {
                String secondString = second.getKey();
                HashMap<String, HashSet<String>> thirdMap = second.getValue();

                for (Map.Entry<String, HashSet<String>> third : thirdMap.entrySet()) {
                    String thirdString = third.getKey();
                    HashSet<String> fourthSet = third.getValue();

                    if (fourthSet.contains("*")) {
                        set.add(firstString + "." + secondString + "." + thirdString);
                    } else {
                        for (String fourth : fourthSet) {
                            set.add(firstString + "." + secondString + "." + thirdString + "." + fourth);
                        }
                    }

                }
            }
        }

        return set;
    }

    /**
     * Exception Class to deal with IPFormat errors.
     */
    public class IPFormatException extends Exception {
        public IPFormatException(String s) {
            super(s);
        }
    }

}
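A minimal sketch exercising the IPTable semantics defined above (full address, three-place subnet wildcard, and last-place range); the wrapper class and sample addresses are made up for illustration.

import org.dspace.statistics.util.IPTable;

public class IPTableSketch {
    public static void main(String[] args) throws Exception {
        IPTable table = new IPTable();

        table.add("66.249.66.1");          // a single full IPv4 address
        table.add("192.168.2");            // three subnet places: matches 192.168.2.*
        table.add("10.0.0.5 - 10.0.0.20"); // a range within the last subnet place

        System.out.println(table.contains("192.168.2.254")); // true (wildcard)
        System.out.println(table.contains("10.0.0.15"));     // true (in range)
        System.out.println(table.contains("10.0.1.15"));     // false
    }
}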
@@ -10,123 +10,170 @@
 */
package org.dspace.statistics.util;

import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.CommandLine;
import org.apache.log4j.Logger;
import org.dspace.core.ConfigurationManager;
import org.dspace.statistics.SolrLogger;

import java.io.*;
import java.util.Vector;
import javax.servlet.http.HttpServletRequest;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

/**
 * SpiderDetector is used to find IPs that are spiders...
 * In future someone may add UserAgents and Host Domains
 * to the detection criteria here.
 *
 * @author kevinvandevelde at atmire.com
 * @author ben at atmire.com
 * @author Mark Diggory (mdiggory at atmire.com)
 */
public class SpiderDetector {

    private static Logger log = Logger.getLogger(SpiderDetector.class);

    /**
     * Creates a file containing spiders based on an apache logfile
     * by analyzing users of the robots.txt file
     * @param args
     * @throws Exception
     * Sparse HashTable structure to hold IP address ranges.
     */
    public static void main(String[] args) throws Exception{
        // create an options object and populate it
        CommandLineParser parser = new PosixParser();

        Options options = new Options();
        options.addOption("l", "logfile", true, "type: Input log file");
        options.addOption("s", "spiderfile", true, "type: Spider ip file");

        CommandLine line = parser.parse(options, args);

        String logFileLoc;
        String spiderIpPath;
        if(line.hasOption("l"))
            logFileLoc = line.getOptionValue("l");
        else{
            System.out.println("We need our log file");
            return;
        }
        if(line.hasOption("s"))
            spiderIpPath = line.getOptionValue("s");
        else{
            System.out.println("We need a spider ip output file");
            return;
        }

        //First read in our log file line per line
        BufferedReader in = new BufferedReader(new FileReader(logFileLoc));
        Vector<String> spiders = new Vector<String>();
        String logLine;
        while ((logLine = in.readLine()) != null){
            //Currently we only check whether robots.txt is present in the line
            if(logLine.contains("robots.txt")){
                //We got a robots.txt request, so we got a bot
                String ip = logLine.substring(0, logLine.indexOf("-")).trim();
                //Only add each ip address once; one occurrence is enough
                if(!spiders.contains(ip))
                    spiders.add(ip);
            }
        }
        in.close();

        //Get the output file
        File spiderIpFile = new File(spiderIpPath);
        //Get the ips already added in our file
        Vector<String> oldSpiderIds = new Vector<String>();
        if(spiderIpFile.exists())
            oldSpiderIds = readIpAddresses(spiderIpFile);

        Vector<String> newSpiderIds = new Vector<String>();

        //Now run over all these spiders & add them to our overview file,
        //but only if they are not already present
        for (int i = 0; i < spiders.size(); i++) {
            String spiderIp = spiders.elementAt(i);
            if(!oldSpiderIds.contains(spiderIp))
                newSpiderIds.add(spiderIp);
        }

        //Last but not least, add the ips to our file
        BufferedWriter output = new BufferedWriter(new FileWriter(spiderIpFile));
        //First write the old ips back so we don't lose any
        for (int i = 0; i < oldSpiderIds.size(); i++) {
            String ip = oldSpiderIds.elementAt(i);
            output.write(ip + "\n");
        }

        //Second, write the new ips
        for (int i = 0; i < newSpiderIds.size(); i++) {
            String ip = newSpiderIds.elementAt(i);
            System.out.println("Adding new ip: " + ip);
            //Write each new ip on a separate line
            output.write(ip + "\n");
        }

        output.flush();
        output.close();
    }

    private static IPTable table = null;

    /**
     * Reads the ip addresses out of a file & returns them in a vector
     * Utility method which reads the ip addresses out of a file & returns them in a Set
     *
     * @param spiderIpFile the location of our spider file
     * @return a vector full of ips
     * @throws IOException should not happen since we check the file before we use it
     */
    public static Vector<String> readIpAddresses(File spiderIpFile) throws IOException {
        Vector<String> ips = new Vector<String>();
        if(!spiderIpFile.exists())
    public static HashSet<String> readIpAddresses(File spiderIpFile) throws IOException {
        HashSet<String> ips = new HashSet<String>();

        if (!spiderIpFile.exists() || !spiderIpFile.isFile())
            return ips;

        //Read our file & get all the ips
        BufferedReader in = new BufferedReader(new FileReader(spiderIpFile));
        String ip;
        while((ip = in.readLine()) != null){
            ips.add(ip);
        String line;
        while ((line = in.readLine()) != null) {
            if (!line.startsWith("#")) {
                line = line.trim();

                if (!line.equals("") && !Character.isDigit(line.charAt(0))) {
                    // is a hostname
                    // add this functionality later...
                } else if (!line.equals("")) {
                    ips.add(line);
                    // is a full v4 ip (too tired to deal with v6)...
                }
            } else {
                //   ua.add(line.replaceFirst("#","").replaceFirst("UA","").trim());
                // ... add this functionality later
            }
        }
        in.close();
        return ips;
    }

    /**
     * Get an immutable Set representing all the spider addresses here
     *
     * @return
     */
    public static Set<String> getSpiderIpAddresses() {

        loadSpiderIpAddresses();
        return table.toSet();
    }

    /*
        private loader to populate the table from files.
     */
    private static void loadSpiderIpAddresses() {

        if (table == null) {
            table = new IPTable();

            String filePath = ConfigurationManager.getProperty("dspace.dir");

            try {
                File spidersDir = new File(filePath, "config/spiders");

                if (spidersDir.exists() && spidersDir.isDirectory()) {
                    for (File file : spidersDir.listFiles()) {
                        for (String ip : readIpAddresses(file)) {
                            table.add(ip);
                        }
                        log.info("Loaded Spider IP file: " + file);
                    }
                } else {
                    log.info("No spider file loaded");
                }

            }
            catch (Exception e) {
                log.error("Error Loading Spiders:" + e.getMessage(), e);
            }

        }

    }

    /**
     * Static service method for testing spiders against existing spider files.
     * <p/>
     * In the future this will be extended to support User Agent and
     * domain name detection.
     * <p/>
     * In future the spiders HashSet may be optimized as a byte offset array to
     * improve performance and memory footprint further.
     *
     * @param request
     * @return true|false if the request was detected to be from a spider
     */
    public static boolean isSpider(HttpServletRequest request) {

        if (SolrLogger.isUseProxies() && request.getHeader("X-Forwarded-For") != null) {
            /* This header is a comma delimited list */
            for (String xfip : request.getHeader("X-Forwarded-For").split(",")) {
                if (isSpider(xfip))
                    return true;
            }
        }

        return isSpider(request.getRemoteAddr());

    }

    /**
     * Check whether an individual IP is a spider.
     *
     * @param ip
     * @return if is spider IP
     */
    public static boolean isSpider(String ip) {

        if (table == null) {
            SpiderDetector.loadSpiderIpAddresses();
        }

        try {
            if (table.contains(ip)) {
                return true;
            }
        } catch (Exception e) {
            return false;
        }

        return false;

    }

}
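A short sketch of the SpiderDetector entry points added here; the wrapper class and the address are arbitrary examples, and it assumes spider IP files have been placed under [dspace.dir]/config/spiders.

import java.util.Set;
import org.dspace.statistics.util.SpiderDetector;

public class SpiderDetectorSketch {
    public static void main(String[] args) {
        // Single-address check; lazily loads the IP lists on first call.
        boolean bot = SpiderDetector.isSpider("66.249.66.1"); // example address only

        // Flattened view of every loaded spider address and wildcarded subnet.
        Set<String> known = SpiderDetector.getSpiderIpAddresses();
        System.out.println("spider entries: " + known.size() + ", sample flagged: " + bot);
    }
}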
@@ -2009,8 +2009,30 @@ harvester.unknownSchema = fail

##### Usage Logging #####
solr.log.server = ${dspace.baseUrl}/solr/statistics
solr.spidersfile = ${dspace.dir}/config/spiders.txt
solr.dbfile = ${dspace.dir}/config/GeoLiteCity.dat
useProxies = true

statistics.item.authorization.admin=true
# If enabled, the statistics system will look for an X-Forwarded-For header;
# if it finds one, it will use it for the user IP address.
# It is enabled by default.
# useProxies = true

# Control if the statistics pages should be only shown to authorized users
statistics.item.authorization.admin=true

# control solr statistics querying to filter out spider IPs
# false by default
# solr.statistics.query.filter.spiderIp = false

# control solr statistics querying to look at the "isBot" field to determine
# if a record is a bot. true by default.
# solr.statistics.query.filter.isBot = true

# URLs to download IP addresses of search engine spiders from
solr.spiderips.urls = http://iplists.com/google.txt, \
                      http://iplists.com/inktomi.txt, \
                      http://iplists.com/lycos.txt, \
                      http://iplists.com/infoseek.txt, \
                      http://iplists.com/altavista.txt, \
                      http://iplists.com/excite.txt, \
                      http://iplists.com/misc.txt, \
                      http://iplists.com/non_engines.txt
@@ -301,4 +301,12 @@
        </step>
    </command>

    <command>
        <name>update-spider-ips</name>
        <description>Update the list of known search engine IP addresses</description>
        <step>
            <class>org.dspace.statistics.util.DownloadSpiderIPs</class>
        </step>
    </command>

</commands>
@@ -289,7 +289,10 @@
    <field name="owningComm" type="integer" indexed="true" stored="true" required="false" multiValued="true" />
    <field name="owningColl" type="integer" indexed="true" stored="true" required="false" multiValued="true" />
    <field name="owningItem" type="integer" indexed="true" stored="true" required="false" multiValued="true" />
    <field name="dns" type="string" indexed="true" stored="true" required="false"/>
    <field name="userAgent" type="string" indexed="true" stored="true" required="false"/>
    <field name="isBot" type="boolean" indexed="true" stored="true" required="false"/>

</fields>