[DS-440] Adjust SolrLogger and the rest of the Statistics system to support processing multiple statistics files. Preempt logging spider IPs and prune spider IPs from Solr with utility methods.

git-svn-id: http://scm.dspace.org/svn/repo/dspace/trunk@4745 9c30dcfa-912a-0410-8fc2-9e0234be79fd
Mark Diggory
2010-02-07 17:44:17 +00:00
parent a5beae59c2
commit 73edd7a585
9 changed files with 686 additions and 176 deletions

View File

@@ -10,18 +10,8 @@
*/
package org.dspace.statistics;
import org.apache.commons.lang.time.DateFormatUtils;
import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrQuery;
@@ -33,21 +23,20 @@ import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.MapSolrParams;
import org.dspace.content.*;
import org.dspace.content.Collection;
import org.dspace.core.ConfigurationManager;
import org.dspace.eperson.EPerson;
import org.dspace.statistics.util.DnsLookup;
import org.dspace.statistics.util.LocationUtils;
import org.dspace.statistics.util.SpiderDetector;
import com.maxmind.geoip.Location;
import com.maxmind.geoip.LookupService;
import javax.servlet.http.HttpServletRequest;
import java.io.IOException;
import java.sql.SQLException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
/**
* Static SolrLogger used to hold HttpSolrClient connection pool to issue
@@ -55,6 +44,7 @@ import com.maxmind.geoip.LookupService;
*
* @author ben at atmire.com
* @author kevinvandevelde at atmire.com
* @author mdiggory at atmire.com
*/
public class SolrLogger
{
@@ -69,8 +59,6 @@ public class SolrLogger
private static final LookupService locationService;
private static final boolean useProxies;
private static Map metadataStorageInfo;
@@ -80,7 +68,6 @@ public class SolrLogger
log.info("solr.spidersfile:" + ConfigurationManager.getProperty("solr.spidersfile"));
log.info("solr.log.server:" + ConfigurationManager.getProperty("solr.log.server"));
log.info("solr.dbfile:" + ConfigurationManager.getProperty("solr.dbfile"));
log.info("spiders file:" + ConfigurationManager.getProperty("solr.spidersfile"));
CommonsHttpSolrServer server = null;
@@ -99,18 +86,7 @@ public class SolrLogger
solr = server;
//spiderIps = SpiderDetector.getSpiderIpAddresses();
LookupService service = null;
// Get the db file for the location
@@ -154,16 +130,47 @@ public class SolrLogger
}
}
public static void post(DSpaceObject dspaceObject, HttpServletRequest request,
EPerson currentUser)
{
if (solr == null || locationService == null)
return;
boolean isSpiderBot = SpiderDetector.isSpider(request);
try
{
if(isSpiderBot &&
!ConfigurationManager.getBooleanProperty("solr.statistics.logBots",true))
{
return;
}
SolrInputDocument doc1 = new SolrInputDocument();
// Save our basic info that we already have
String ip = request.getRemoteAddr();
if(isUseProxies() && request.getHeader("X-Forwarded-For") != null)
{
/* This header is a comma delimited list */
for(String xfip : request.getHeader("X-Forwarded-For").split(","))
{
/* The proxy itself will sometimes populate this header with the same value as
   the remote address. Ordering in the spec is vague, so we just take the last
   entry that is not equal to the proxy's address.
*/
if(!xfip.trim().equals(request.getRemoteAddr()))
{
    ip = xfip.trim();
}
}
}
doc1.addField("ip", ip);
doc1.addField("id", dspaceObject.getID());
doc1.addField("type", dspaceObject.getType());
// Save the current time
@@ -203,7 +210,12 @@ public class SolrLogger
doc1.addField("city", location.city);
doc1.addField("latitude", location.latitude);
doc1.addField("longitude", location.longitude);
doc1.addField("isBot",isSpiderBot);
if(request.getHeader("User-Agent") != null)
doc1.addField("userAgent", request.getHeader("User-Agent"));
}
if (dspaceObject instanceof Item)
{
Item item = (Item) dspaceObject;
@@ -226,6 +238,7 @@ public class SolrLogger
}
}
storeParents(doc1, dspaceObject);
solr.add(doc1);
@@ -355,6 +368,136 @@ public class SolrLogger
return currentValsStored;
}
public static class ResultProcessor
{
public void execute(String query) throws SolrServerException, IOException {
Map<String, String> params = new HashMap<String, String>();
params.put("q", query);
params.put("rows", "10");
MapSolrParams solrParams = new MapSolrParams(params);
QueryResponse response = solr.query(solrParams);
long numbFound = response.getResults().getNumFound();
// process the first batch
process(response.getResults());
// Run over the rest
for (int i = 10; i < numbFound; i += 10)
{
params.put("start", String.valueOf(i));
solrParams = new MapSolrParams(params);
response = solr.query(solrParams);
process(response.getResults());
}
}
public void commit() throws IOException, SolrServerException {
solr.commit();
}
/**
* Override to manage pages of documents
* @param docs
*/
public void process(List<SolrDocument> docs) throws IOException, SolrServerException {
for(SolrDocument doc : docs){
process(doc);
}
}
/**
* Override to manage individual documents
* @param doc
*/
public void process(SolrDocument doc) throws IOException, SolrServerException {
}
}
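
A minimal sketch of how this pager is meant to be consumed (the query string is a hypothetical example; real callers follow in the mark/delete utilities below): subclass ResultProcessor, override one of the process methods, and call execute.

    ResultProcessor collector = new ResultProcessor() {
        public void process(SolrDocument doc) throws IOException, SolrServerException {
            // inspect each matching statistics record, fetched 10 at a time
            System.out.println(doc.getFieldValue("ip"));
        }
    };
    collector.execute("ip:127.0.0.1*"); // hypothetical query; throws IOException/SolrServerException
    collector.commit();
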
public static void markRobotsByIP()
{
for(String ip : SpiderDetector.getSpiderIpAddresses()){
try {
/* Result Process to alter record to be identified as a bot */
ResultProcessor processor = new ResultProcessor(){
public void process(SolrDocument doc) throws IOException, SolrServerException {
doc.removeFields("isBot");
doc.addField("isBot", true);
SolrInputDocument newInput = ClientUtils.toSolrInputDocument(doc);
solr.add(newInput);
}
};
/* query for ip, exclude results previously set as bots. */
processor.execute("ip:"+ip+ "* AND NOT isBot:true");
solr.commit();
} catch (Exception e) {
log.error(e.getMessage(),e);
}
}
}
public static void markRobotByUserAgent(String agent){
try {
/* Result Process to alter record to be identified as a bot */
ResultProcessor processor = new ResultProcessor(){
public void process(SolrDocument doc) throws IOException, SolrServerException {
doc.removeFields("isBot");
doc.addField("isBot", true);
SolrInputDocument newInput = ClientUtils.toSolrInputDocument(doc);
solr.add(newInput);
}
};
/* query for user agent, exclude results previously set as bots. */
processor.execute("userAgent:"+agent+ " AND NOT isBot:true");
solr.commit();
} catch (Exception e) {
log.error(e.getMessage(),e);
}
}
public static void deleteRobotsByIsBotFlag()
{
try {
solr.deleteByQuery("isBot:true");
} catch (Exception e) {
log.error(e.getMessage(),e);
}
}
public static void deleteIP(String ip)
{
try {
solr.deleteByQuery("ip:"+ip + "*");
} catch (Exception e) {
log.error(e.getMessage(),e);
}
}
public static void deleteRobotsByIP()
{
for(String ip : SpiderDetector.getSpiderIpAddresses()){
deleteIP(ip);
}
}
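
Taken together, these utilities support two pruning strategies against an existing statistics index; a minimal maintenance sketch (the user-agent string is only an example):

    // Strategy 1: flag spider traffic, then drop everything marked as a bot
    SolrLogger.markRobotsByIP();
    SolrLogger.markRobotByUserAgent("msnbot"); // example agent string
    SolrLogger.deleteRobotsByIsBotFlag();

    // Strategy 2: delete hits from known spider IPs directly, skipping the marking pass
    SolrLogger.deleteRobotsByIP();
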
/*
* //TODO: below are not used public static void
* update(String query, boolean addField, String fieldName, Object
@@ -372,25 +515,18 @@ public class SolrLogger
// We need to get our documents
// QueryResponse queryResponse = solr.query()//query(query, null, -1,
// null, null, null);
final List<SolrDocument> docsToUpdate = new ArrayList<SolrDocument>();
ResultProcessor processor = new ResultProcessor(){
    public void process(List<SolrDocument> docs) throws IOException, SolrServerException {
        docsToUpdate.addAll(docs);
    }
};
processor.execute(query);
// We have all the docs, delete the ones we don't need
solr.deleteByQuery(query);
// Add the new (updated) ones
@@ -680,8 +816,17 @@ public class SolrLogger
// A filter is used instead of a regular query to improve
// performance and ensure the search result ordering will
// not be influenced
// Choose to filter by the legacy spider IP list (it may get too long to properly filter all IPs)
if(ConfigurationManager.getBooleanProperty("solr.statistics.query.filter.spiderIp",false))
solrQuery.addFilterQuery(getIgnoreSpiderIPs());
// Choose to filter by the isBot field; may be overridden in future
// to allow views on stats based on bots.
if(ConfigurationManager.getBooleanProperty("solr.statistics.query.filter.isBot",true))
solrQuery.addFilterQuery("-isBot:true");
if (filterQuery != null)
solrQuery.addFilterQuery(filterQuery);
@@ -699,20 +844,32 @@ public class SolrLogger
return response;
}
/** String of IP and Ranges in IPTable as a Solr Query */
private static String filterQuery = null;
/**
 * Returns in a filterQuery string all the ip addresses that should be ignored
 *
 * @return a string query with ip addresses
 */
public static String getIgnoreSpiderIPs() {
if (filterQuery == null) {
String query = "";
boolean first = true;
for (String ip : SpiderDetector.getSpiderIpAddresses()) {
if (!first) {
    query += " AND ";
}
first = false;
query += "NOT(ip: " + ip + ")";
}
filterQuery = query;
}
return filterQuery;
}
}
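
For illustration, with two spider addresses loaded (hypothetical values), getIgnoreSpiderIPs() caches and returns a filter of the form:

    NOT(ip: 66.249.66.1) AND NOT(ip: 72.30.65.7)
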

View File

@@ -13,6 +13,7 @@ package org.dspace.statistics;
import org.apache.log4j.Logger;
import org.dspace.eperson.EPerson;
import org.dspace.services.model.Event;
import org.dspace.statistics.util.SpiderDetector;
import org.dspace.usage.AbstractUsageEventListener;
import org.dspace.usage.UsageEvent;
@@ -33,21 +34,12 @@ public class SolrLoggerUsageEventListener extends AbstractUsageEventListener {
{
try{
UsageEvent ue = (UsageEvent)event;
EPerson currentUser = ue.getContext() == null ? null : ue.getContext().getCurrentUser();
SolrLogger.post(ue.getObject(), ue.getRequest(), currentUser);
}
catch(Exception e)
{

View File

@@ -0,0 +1,97 @@
/**
* $Id: $
* $URL: $
* *************************************************************************
* Copyright (c) 2002-2009, DuraSpace. All rights reserved
* Licensed under the DuraSpace Foundation License.
*
* A copy of the DuraSpace License has been included in this
* distribution and is available at: http://scm.dspace.org/svn/repo/licenses/LICENSE.txt
*/
package org.dspace.statistics.util;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import java.io.*;
import java.util.HashSet;
/**
* @author Mark Diggory (mdiggory at atmire.com)
* @author kevinvandevelde at atmire.com
* @author ben at atmire.com
*/
public class ApacheLogRobotsProcessor {
/**
* Creates a file containing spiders based on an Apache logfile
* by analyzing requests for the robots.txt file
*
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
// create an options object and populate it
CommandLineParser parser = new PosixParser();
Options options = new Options();
options.addOption("l", "logfile", true, "type: Input log file");
options.addOption("s", "spiderfile", true, "type: Spider ip file");
CommandLine line = parser.parse(options, args);
String logFileLoc;
String spiderIpPath;
if (line.hasOption("l"))
logFileLoc = line.getOptionValue("l");
else {
System.out.println("We need our log file");
return;
}
if (line.hasOption("s"))
spiderIpPath = line.getOptionValue("s");
else {
System.out.println("We need a spider ip output file");
return;
}
File spiderIpFile = new File(spiderIpPath);
//Get the ip's already added in our file
HashSet<String> logSpiders = new HashSet<String>();
if (spiderIpFile.exists())
logSpiders = SpiderDetector.readIpAddresses(spiderIpFile);
//First read in our log file line per line
BufferedReader in = new BufferedReader(new FileReader(logFileLoc));
String logLine;
while ((logLine = in.readLine()) != null) {
//Currently we only check whether robots.txt was requested in this line
if (logLine.contains("robots.txt")) {
    //A request for robots.txt indicates a bot
    String ip = logLine.substring(0, logLine.indexOf("-")).trim();
    //The HashSet takes care of duplicates, so each ip is only stored once
logSpiders.add(ip);
}
}
in.close();
//Last but not least add the ips to our file
BufferedWriter output = new BufferedWriter(new FileWriter(spiderIpFile));
//Write out every ip, both previously known and newly found
for (String ip : logSpiders) {
System.out.println("Adding new ip: " + ip);
//Write each ip on a separate line
output.write(ip + "\n");
}
output.flush();
output.close();
}
}
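
The substring extraction above assumes NCSA common log format, where the client address is the first token and precedes the first "-" separator; a hypothetical input line and its result:

    String logLine = "66.249.66.1 - - [07/Feb/2010:12:00:00 +0000] \"GET /robots.txt HTTP/1.1\" 200 24";
    String ip = logLine.substring(0, logLine.indexOf("-")).trim(); // yields "66.249.66.1"
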

View File

@@ -0,0 +1,184 @@
/**
* $Id: $
* $URL: $
* *************************************************************************
* Copyright (c) 2002-2009, DuraSpace. All rights reserved
* Licensed under the DuraSpace Foundation License.
*
* A copy of the DuraSpace License has been included in this
* distribution and is available at: http://scm.dspace.org/svn/repo/licenses/LICENSE.txt
*/
package org.dspace.statistics.util;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/**
* A sparse v4 IPTable implementation that uses nested HashMaps
* to optimize IP address matching over ranges of IP addresses.
*
* @author mdiggory at atmire.com
*/
public class IPTable {
/* A lookup tree for IP Addresses and SubnetRanges */
private HashMap<String, HashMap<String, HashMap<String, HashSet<String>>>> map =
new HashMap<String, HashMap<String, HashMap<String, HashSet<String>>>>();
/**
 * Add an address to the table.
 *
 * @param ip a full v4 IP, a subnet prefix, or a range string
 */
public void add(String ip) throws IPFormatException {
String[] start;
String[] end;
String[] range = ip.split("-");
if (range.length >= 2) {
start = range[0].trim().split("/")[0].split("\\.");
end = range[1].trim().split("/")[0].split("\\.");
if (start.length != 4 || end.length != 4)
throw new IPFormatException(ip + " - Ranges need to be full IPv4 Addresses");
if (!(start[0].equals(end[0]) && start[1].equals(end[1]) && start[2].equals(end[2]))) {
throw new IPFormatException(ip + " - Ranges can only be across the last subnet x.y.z.0 - x.y.z.254");
}
} else {
//need to ignore CIDR notation for the moment.
//ip = ip.split("\\/")[0];
String[] subnets = ip.split("\\.");
if (subnets.length < 3) {
throw new IPFormatException(ip + " - require at least three subnet places (255.255.255.0");
}
start = subnets;
end = subnets;
}
if (start.length >= 3) {
HashMap<String, HashMap<String, HashSet<String>>> first = map.get(start[0]);
if (first == null) {
first = new HashMap<String, HashMap<String, HashSet<String>>>();
map.put(start[0], first);
}
HashMap<String, HashSet<String>> second = first.get(start[1]);
if (second == null) {
second = new HashMap<String, HashSet<String>>();
first.put(start[1], second);
}
HashSet<String> third = second.get(start[2]);
if (third == null) {
third = new HashSet<String>();
second.put(start[2], third);
}
//now populate fourth place (* or value 0-254);
if (start.length == 3) {
third.add("*");
}
if (third.contains("*")) {
return;
}
if (start.length >= 4) {
int s = Integer.valueOf(start[3]);
int e = Integer.valueOf(end[3]);
for (int i = s; i <= e; i++) {
third.add(String.valueOf(i));
}
}
}
}
public boolean contains(String ip) throws IPFormatException {
String[] subnets = ip.split("\\.");
if (subnets.length != 4)
throw new IPFormatException("needs to be single IP Address");
HashMap<String, HashMap<String, HashSet<String>>> first = map.get(subnets[0]);
if (first == null) return false;
HashMap<String, HashSet<String>> second = first.get(subnets[1]);
if (second == null) return false;
HashSet<String> third = second.get(subnets[2]);
if (third == null) return false;
return third.contains(subnets[3]) || third.contains("*");
}
/**
 * @return a Set of all IPs and three-octet subnet prefixes held in the table
 */
public Set<String> toSet() {
HashSet<String> set = new HashSet<String>();
for (Map.Entry<String, HashMap<String, HashMap<String, HashSet<String>>>> first : map.entrySet()) {
String firstString = first.getKey();
HashMap<String, HashMap<String, HashSet<String>>> secondMap = first.getValue();
for (Map.Entry<String, HashMap<String, HashSet<String>>> second : secondMap.entrySet()) {
String secondString = second.getKey();
HashMap<String, HashSet<String>> thirdMap = second.getValue();
for (Map.Entry<String, HashSet<String>> third : thirdMap.entrySet()) {
String thirdString = third.getKey();
HashSet<String> fourthSet = third.getValue();
if (fourthSet.contains("*")) {
set.add(firstString + "." + secondString + "." + thirdString);
} else {
for (String fourth : fourthSet) {
set.add(firstString + "." + secondString + "." + thirdString + "." + fourth);
}
}
}
}
}
return set;
}
/**
* Exception Class to deal with IPFormat errors.
*/
public class IPFormatException extends Exception {
public IPFormatException(String s) {
super(s);
}
}
}
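
A minimal sketch of the table's contract (addresses are examples; add and contains both throw IPTable.IPFormatException on malformed input):

    IPTable table = new IPTable();
    table.add("192.168.1");            // three octets: wildcard entry for the whole 192.168.1.* subnet
    table.add("10.0.0.10-10.0.0.20");  // range, allowed across the last octet only
    table.contains("192.168.1.44");    // true, via the "*" entry
    table.contains("10.0.0.15");       // true, the range is expanded into the fourth-level set
    table.contains("10.0.0.99");       // false
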

View File

@@ -10,123 +10,170 @@
*/
package org.dspace.statistics.util;
import org.apache.log4j.Logger;
import org.dspace.core.ConfigurationManager;
import org.dspace.statistics.SolrLogger;
import javax.servlet.http.HttpServletRequest;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
/**
* SpiderDetector is used to find IPs that are spiders.
* In the future, UserAgents and Host Domains may be added
* to the detection criteria here.
*
* @author kevinvandevelde at atmire.com
* @author ben at atmire.com
* @author Mark Diggory (mdiggory at atmire.com)
*/
public class SpiderDetector {
private static Logger log = Logger.getLogger(SpiderDetector.class);
/**
 * Sparse HashTable structure to hold IP Address Ranges.
 */
private static IPTable table = null;
/**
 * Utility method which reads the ip addresses out of a file & returns them in a Set
 *
 * @param spiderIpFile the location of our spider file
 * @return a Set of ips
 * @throws IOException should not happen since we check the file before we use it
 */
public static HashSet<String> readIpAddresses(File spiderIpFile) throws IOException {
HashSet<String> ips = new HashSet<String>();
if (!spiderIpFile.exists() || !spiderIpFile.isFile())
return ips;
//Read our file & get all the ips
BufferedReader in = new BufferedReader(new FileReader(spiderIpFile));
String line;
while ((line = in.readLine()) != null) {
if (!line.startsWith("#")) {
line = line.trim();
if (!line.equals("") && !Character.isDigit(line.charAt(0))) {
// is a hostname
// add this functionality later...
} else if (!line.equals("")) {
ips.add(line);
// is full v4 ip (too tired to deal with v6)...
}
} else {
// ua.add(line.replaceFirst("#","").replaceFirst("UA","").trim());
// ... add this functionality later
}
}
in.close();
return ips;
}
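
The parsing above implies a simple one-entry-per-line file format; a hypothetical file under [dspace.dir]/config/spiders (hostname lines are recognized but ignored for now):

    # comment lines are skipped
    66.249.66.1
    66.249.64.0-66.249.64.255
    crawler.example.com
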
/**
* Get a Set representing all the Spider Addresses currently loaded
*
* @return a Set of known spider IP addresses
*/
public static Set<String> getSpiderIpAddresses() {
loadSpiderIpAddresses();
return table.toSet();
}
/*
private loader to populate the table from files.
*/
private static void loadSpiderIpAddresses() {
if (table == null) {
table = new IPTable();
String filePath = ConfigurationManager.getProperty("dspace.dir");
try {
File spidersDir = new File(filePath, "config/spiders");
if (spidersDir.exists() && spidersDir.isDirectory()) {
for (File file : spidersDir.listFiles()) {
for (String ip : readIpAddresses(file)) {
table.add(ip);
}
log.info("Loaded Spider IP file: " + file);
}
} else {
log.info("No spider file loaded");
}
}
catch (Exception e) {
log.error("Error Loading Spiders:" + e.getMessage(), e);
}
}
}
/**
* Static Service Method for testing spiders against existing spider files.
* <p/>
* In the future this will be extended to support User Agent and
* domain Name detection.
* <p/>
* In future spiders HashSet may be optimized as byte offset array to
* improve performance and memory footprint further.
*
* @param request
* @return true|false if the request was detected to be from a spider
*/
public static boolean isSpider(HttpServletRequest request) {
if (SolrLogger.isUseProxies() && request.getHeader("X-Forwarded-For") != null) {
/* This header is a comma delimited list */
for (String xfip : request.getHeader("X-Forwarded-For").split(",")) {
if (isSpider(xfip))
return true;
}
}
return isSpider(request.getRemoteAddr());
}
/**
* Check individual IP is a spider.
*
* @param ip
* @return true if the given IP is a known spider
*/
public static boolean isSpider(String ip) {
if (table == null) {
SpiderDetector.loadSpiderIpAddresses();
}
try {
if (table.contains(ip)) {
return true;
}
} catch (Exception e) {
return false;
}
return false;
}
}
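
A sketch of the string overload for callers outside a servlet context (the address is an example and assumes some loaded spider file lists it):

    if (SpiderDetector.isSpider("66.249.66.1")) {
        // skip logging, or flag the record as a bot
    }
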

View File

@@ -2009,8 +2009,30 @@ harvester.unknownSchema = fail
##### Usage Logging #####
solr.log.server = ${dspace.baseUrl}/solr/statistics
solr.spidersfile = ${dspace.dir}/config/spiders.txt
solr.dbfile = ${dspace.dir}/config/GeoLiteCity.dat
# If enabled, the statistics system will look for an X-Forwarded-For header;
# if it finds one, it will use it for the user's IP address.
# It is enabled by default.
# useProxies = true
# Control if the statistics pages should only be shown to authorized users
statistics.item.authorization.admin=true
# Control Solr statistics querying to filter out spider IPs
# false by default
# solr.statistics.query.filter.spiderIp = false
# Control Solr statistics querying to look at the "isBot" field to determine
# if a record is a bot. true by default.
# solr.statistics.query.filter.isBot = true
# URLs to download IP addresses of search engine spiders from
solr.spiderips.urls = http://iplists.com/google.txt, \
http://iplists.com/inktomi.txt, \
http://iplists.com/lycos.txt, \
http://iplists.com/infoseek.txt, \
http://iplists.com/altavista.txt, \
http://iplists.com/excite.txt, \
http://iplists.com/misc.txt, \
http://iplists.com/non_engines.txt

View File

@@ -301,4 +301,12 @@
</step>
</command>
<command>
<name>update-spider-ips</name>
<description>Update the list of known search engine IP addresses</description>
<step>
<class>org.dspace.statistics.util.DownloadSpiderIPs</class>
</step>
</command>
</commands>

View File

@@ -289,7 +289,10 @@
<field name="owningComm" type="integer" indexed="true" stored="true" required="false" multiValued="true" />
<field name="owningColl" type="integer" indexed="true" stored="true" required="false" multiValued="true" />
<field name="owningItem" type="integer" indexed="true" stored="true" required="false" multiValued="true" />
<field name="dns" type="string" indexed="true" stored="true" required="false"/>
<field name="dns" type="string" indexed="true" stored="true" required="false"/>
<field name="userAgent" type="string" indexed="true" stored="true" required="false"/>
<field name="isBot" type="boolean" indexed="true" stored="true" required="false"/>
</fields>