[DS-440] Adjust SolrLogger and the rest of the Statistics system to support processing multiple statistics files. Preempt logging of spider IPs, and prune spider IPs from Solr with utility methods.
git-svn-id: http://scm.dspace.org/svn/repo/dspace/trunk@4745 9c30dcfa-912a-0410-8fc2-9e0234be79fd
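Below is a hedged usage sketch of the spider-maintenance utilities this commit introduces on SolrLogger (markRobotsByIP, markRobotByUserAgent and deleteRobotsByIsBotFlag all appear in the diff). The wrapper class, the "msnbot" agent string, and the idea of running the calls as one sequence are illustrative assumptions, not part of the commit.

import org.dspace.statistics.SolrLogger;

public class SpiderCleanupSketch {
    public static void main(String[] args) {
        // Retroactively flag hits from known spider IPs as bots (sets isBot:true).
        SolrLogger.markRobotsByIP();

        // Flag hits from a specific user agent; "msnbot" is just an example value.
        SolrLogger.markRobotByUserAgent("msnbot");

        // Prune everything flagged as a bot from the statistics core.
        SolrLogger.deleteRobotsByIsBotFlag();
    }
}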
@@ -10,18 +10,8 @@
 */
package org.dspace.statistics;

import java.io.File;
import java.io.IOException;
import java.sql.SQLException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Vector;

import com.maxmind.geoip.Location;
import com.maxmind.geoip.LookupService;
import org.apache.commons.lang.time.DateFormatUtils;
import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrQuery;
@@ -33,21 +23,20 @@ import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.MapSolrParams;
import org.dspace.content.Bitstream;
import org.dspace.content.Bundle;
import org.dspace.content.*;
import org.dspace.content.Collection;
import org.dspace.content.Community;
import org.dspace.content.DCValue;
import org.dspace.content.DSpaceObject;
import org.dspace.content.Item;
import org.dspace.core.ConfigurationManager;
import org.dspace.eperson.EPerson;
import org.dspace.statistics.util.DnsLookup;
import org.dspace.statistics.util.LocationUtils;
import org.dspace.statistics.util.SpiderDetector;

import com.maxmind.geoip.Location;
import com.maxmind.geoip.LookupService;
import javax.servlet.http.HttpServletRequest;
import java.io.IOException;
import java.sql.SQLException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;

/**
 * Static SolrLogger used to hold HttpSolrClient connection pool to issue
@@ -55,6 +44,7 @@ import com.maxmind.geoip.LookupService;
 *
 * @author ben at atmire.com
 * @author kevinvandevelde at atmire.com
 * @author mdiggory at atmire.com
 */
public class SolrLogger
{
@@ -69,8 +59,6 @@ public class SolrLogger

    private static final LookupService locationService;

    private static final Vector<String> spiderIps;

    private static final boolean useProxies;

    private static Map metadataStorageInfo;
@@ -80,7 +68,6 @@ public class SolrLogger
        log.info("solr.spidersfile:" + ConfigurationManager.getProperty("solr.spidersfile"));
        log.info("solr.log.server:" + ConfigurationManager.getProperty("solr.log.server"));
        log.info("solr.dbfile:" + ConfigurationManager.getProperty("solr.dbfile"));
        log.info("spiders file:" + ConfigurationManager.getProperty("solr.spidersfile"));

        CommonsHttpSolrServer server = null;

@@ -99,18 +86,7 @@ public class SolrLogger
        solr = server;

        // Read in the file so we don't have to do it all the time
        Vector<String> spiderIpsLoc;
        String filePath = ConfigurationManager.getProperty("solr.spidersfile");
        try
        {
            spiderIpsLoc = SpiderDetector.readIpAddresses(new File(filePath));
        }
        catch (Exception e)
        {
            spiderIpsLoc = new Vector<String>();
            e.printStackTrace(); // Should never happen
        }
        spiderIps = spiderIpsLoc;
        //spiderIps = SpiderDetector.getSpiderIpAddresses();

        LookupService service = null;
        // Get the db file for the location
@@ -154,16 +130,47 @@ public class SolrLogger
        }
    }
    public static void post(DSpaceObject dspaceObject, String ip,
    public static void post(DSpaceObject dspaceObject, HttpServletRequest request,
            EPerson currentUser)
    {
        if (solr == null || locationService == null)
            return;

        boolean isSpiderBot = SpiderDetector.isSpider(request);

        try
        {
            if(isSpiderBot &&
                    !ConfigurationManager.getBooleanProperty("solr.statistics.logBots",true))
            {
                return;
            }

            SolrInputDocument doc1 = new SolrInputDocument();
            // Save our basic info that we already have

            String ip = request.getRemoteAddr();

            if(isUseProxies() && request.getHeader("X-Forwarded-For") != null)
            {
                /* This header is a comma delimited list */
                for(String xfip : request.getHeader("X-Forwarded-For").split(","))
                {
                    /* The proxy itself will sometimes populate this header with the
                       same value as the remote address. Ordering in the spec is vague;
                       we'll just take the last entry not equal to the proxy.
                    */
                    if(!request.getHeader("X-Forwarded-For").contains(ip))
                    {
                        ip = xfip.trim();
                    }
                }
            }

            doc1.addField("ip", ip);

            doc1.addField("id", dspaceObject.getID());
            doc1.addField("type", dspaceObject.getType());
            // Save the current time
@@ -203,7 +210,12 @@ public class SolrLogger
            doc1.addField("city", location.city);
            doc1.addField("latitude", location.latitude);
            doc1.addField("longitude", location.longitude);
            doc1.addField("isBot",isSpiderBot);

            if(request.getHeader("User-Agent") != null)
                doc1.addField("userAgent", request.getHeader("User-Agent"));
        }

        if (dspaceObject instanceof Item)
        {
            Item item = (Item) dspaceObject;
@@ -226,6 +238,7 @@ public class SolrLogger
            }
        }

        storeParents(doc1, dspaceObject);

        solr.add(doc1);
@@ -355,6 +368,136 @@ public class SolrLogger
        return currentValsStored;
    }
    public static class ResultProcessor
    {

        public void execute(String query) throws SolrServerException, IOException {
            Map<String, String> params = new HashMap<String, String>();
            params.put("q", query);
            params.put("rows", "10");
            MapSolrParams solrParams = new MapSolrParams(params);
            QueryResponse response = solr.query(solrParams);

            long numbFound = response.getResults().getNumFound();

            // process the first batch
            process(response.getResults());

            // Run over the rest
            for (int i = 10; i < numbFound; i += 10)
            {
                params.put("start", String.valueOf(i));
                solrParams = new MapSolrParams(params);
                response = solr.query(solrParams);
                process(response.getResults());
            }

        }

        public void commit() throws IOException, SolrServerException {
            solr.commit();
        }

        /**
         * Override to manage pages of documents
         * @param docs
         */
        public void process(List<SolrDocument> docs) throws IOException, SolrServerException {
            for(SolrDocument doc : docs){
                process(doc);
            }
        }

        /**
         * Override to manage individual documents
         * @param doc
         */
        public void process(SolrDocument doc) throws IOException, SolrServerException {

        }
    }
    public static void markRobotsByIP()
    {
        for(String ip : SpiderDetector.getSpiderIpAddresses()){

            try {

                /* Result Processor to alter records so they are identified as bots */
                ResultProcessor processor = new ResultProcessor(){
                    public void process(SolrDocument doc) throws IOException, SolrServerException {
                        doc.removeFields("isBot");
                        doc.addField("isBot", true);
                        SolrInputDocument newInput = ClientUtils.toSolrInputDocument(doc);
                        solr.add(newInput);
                    }
                };

                /* query for ip, exclude results previously set as bots. */
                processor.execute("ip:"+ip+ "* AND NOT isBot:true");

                solr.commit();

            } catch (Exception e) {
                log.error(e.getMessage(),e);
            }

        }
    }

    public static void markRobotByUserAgent(String agent){
        try {

            /* Result Processor to alter records so they are identified as bots */
            ResultProcessor processor = new ResultProcessor(){
                public void process(SolrDocument doc) throws IOException, SolrServerException {
                    doc.removeFields("isBot");
                    doc.addField("isBot", true);
                    SolrInputDocument newInput = ClientUtils.toSolrInputDocument(doc);
                    solr.add(newInput);
                }
            };

            /* query for the user agent, exclude results previously set as bots. */
            processor.execute("userAgent:"+agent+ " AND NOT isBot:true");

            solr.commit();
        } catch (Exception e) {
            log.error(e.getMessage(),e);
        }
    }

    public static void deleteRobotsByIsBotFlag()
    {
        try {
            solr.deleteByQuery("isBot:true");
        } catch (Exception e) {
            log.error(e.getMessage(),e);
        }
    }

    public static void deleteIP(String ip)
    {
        try {
            solr.deleteByQuery("ip:"+ip + "*");
        } catch (Exception e) {
            log.error(e.getMessage(),e);
        }
    }

    public static void deleteRobotsByIP()
    {
        for(String ip : SpiderDetector.getSpiderIpAddresses()){
            deleteIP(ip);
        }
    }

    /*
     * //TODO: below are not used public static void
     * update(String query, boolean addField, String fieldName, Object
@@ -372,25 +515,18 @@ public class SolrLogger
        // We need to get our documents
        // QueryResponse queryResponse = solr.query()//query(query, null, -1,
        // null, null, null);
        Map<String, String> params = new HashMap<String, String>();
        params.put("q", query);
        params.put("rows", "10");
        MapSolrParams solrParams = new MapSolrParams(params);
        QueryResponse response = solr.query(solrParams);

        long numbFound = response.getResults().getNumFound();
        List<SolrDocument> docsToUpdate = new ArrayList<SolrDocument>();
        docsToUpdate.addAll(response.getResults());
        final List<SolrDocument> docsToUpdate = new ArrayList<SolrDocument>();

        // Run over the rest
        for (int i = 10; i < numbFound; i += 10)
        {
            params.put("start", String.valueOf(i));
            solrParams = new MapSolrParams(params);
            response = solr.query(solrParams);
            docsToUpdate.addAll(response.getResults());
        }
        // We have all the docs, delete the ones we don't need
        ResultProcessor processor = new ResultProcessor(){
            public void process(List<SolrDocument> docs) throws IOException, SolrServerException {
                docsToUpdate.addAll(docs);
            }
        };

        processor.execute(query);

        // We have all the docs, delete the ones we don't need
        solr.deleteByQuery(query);

        // Add the new (updated) ones
@@ -680,8 +816,17 @@ public class SolrLogger

        // A filter is used instead of a regular query to improve
        // performance and ensure the search result ordering will
        // not be influenced
        solrQuery.addFilterQuery(getIgnoreSpiders());
        // not be influenced

        // Choose to filter by the legacy spider IP list (it may get too long to properly filter all IPs)
        if(ConfigurationManager.getBooleanProperty("solr.statistics.query.filter.spiderIp",false))
            solrQuery.addFilterQuery(getIgnoreSpiderIPs());

        // Choose to filter by the isBot field; may be overridden in future
        // to allow views on stats based on bots.
        if(ConfigurationManager.getBooleanProperty("solr.statistics.query.filter.isBot",true))
            solrQuery.addFilterQuery("-isBot:true");

        if (filterQuery != null)
            solrQuery.addFilterQuery(filterQuery);

@@ -699,20 +844,32 @@ public class SolrLogger
        return response;
    }

    /** String of IPs and ranges in the IPTable, as a Solr query */
    private static String filterQuery = null;

    /**
     * Returns in a query string all the ip addresses that should be ignored
     *
     * Returns in a filterQuery string all the ip addresses that should be ignored
     *
     * @return a string query with ip addresses
     */
    private static String getIgnoreSpiders()
    {
        String query = "";
        for (int i = 0; i < spiderIps.size(); i++)
        {
            String ip = spiderIps.elementAt(i);
    public static String getIgnoreSpiderIPs() {
        if (filterQuery == null) {
            String query = "";
            boolean first = true;
            for (String ip : SpiderDetector.getSpiderIpAddresses()) {
                if (first) {
                    query += " AND ";
                    first = false;
                }

                query += (i != 0 ? " AND " : "") + "NOT(ip: " + ip + ")";
                query += " NOT(ip: " + ip + ")";
            }
            filterQuery = query;
        }
        return query;

        return filterQuery;

    }

}
@@ -13,6 +13,7 @@ package org.dspace.statistics;
import org.apache.log4j.Logger;
import org.dspace.eperson.EPerson;
import org.dspace.services.model.Event;
import org.dspace.statistics.util.SpiderDetector;
import org.dspace.usage.AbstractUsageEventListener;
import org.dspace.usage.UsageEvent;

@@ -33,21 +34,12 @@ public class SolrLoggerUsageEventListener extends AbstractUsageEventListener {
        {
            try{

                UsageEvent ue = (UsageEvent)event;
                UsageEvent ue = (UsageEvent)event;

                String ip = null;

                if(SolrLogger.isUseProxies())
                    ip = ue.getRequest().getHeader("X-Forwarded-For");

                if(ip == null || ip.equals(""))
                    ip = ue.getRequest().getRemoteAddr();
                EPerson currentUser = ue.getContext() == null ? null : ue.getContext().getCurrentUser();

                EPerson currentUser = ue.getContext() == null ? null : ue.getContext().getCurrentUser();
                SolrLogger.post(ue.getObject(), ue.getRequest(), currentUser);

                SolrLogger.post(ue.getObject(), ip, currentUser);

            }
            catch(Exception e)
            {
@@ -0,0 +1,97 @@
/**
 * $Id: $
 * $URL: $
 * *************************************************************************
 * Copyright (c) 2002-2009, DuraSpace. All rights reserved
 * Licensed under the DuraSpace Foundation License.
 *
 * A copy of the DuraSpace License has been included in this
 * distribution and is available at: http://scm.dspace.org/svn/repo/licenses/LICENSE.txt
 */
package org.dspace.statistics.util;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;

import java.io.*;
import java.util.HashSet;

/**
 * @author Mark Diggory (mdiggory at atmire.com)
 * @author kevinvandevelde at atmire.com
 * @author ben at atmire.com
 */
public class ApacheLogRobotsProcessor {

    /**
     * Creates a file containing spiders based on an Apache logfile
     * by analyzing requesters of the robots.txt file
     *
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        // create an options object and populate it
        CommandLineParser parser = new PosixParser();

        Options options = new Options();
        options.addOption("l", "logfile", true, "type: Input log file");
        options.addOption("s", "spiderfile", true, "type: Spider ip file");

        CommandLine line = parser.parse(options, args);

        String logFileLoc;
        String spiderIpPath;
        if (line.hasOption("l"))
            logFileLoc = line.getOptionValue("l");
        else {
            System.out.println("We need our log file");
            return;
        }
        if (line.hasOption("s"))
            spiderIpPath = line.getOptionValue("s");
        else {
            System.out.println("We need a spider ip output file");
            return;
        }

        File spiderIpFile = new File(spiderIpPath);

        //Get the ips already added in our file
        HashSet<String> logSpiders = new HashSet<String>();
        if (spiderIpFile.exists())
            logSpiders = SpiderDetector.readIpAddresses(spiderIpFile);

        //First read in our log file line per line
        BufferedReader in = new BufferedReader(new FileReader(logFileLoc));
        String logLine;
        while ((logLine = in.readLine()) != null) {
            //Currently we only check whether robots.txt is present in the line
            if (logLine.contains("robots.txt")) {
                //We got a robots.txt request, so we got a bot
                String ip = logLine.substring(0, logLine.indexOf("-")).trim();
                //Only add each ip address once; one occurrence is enough
                logSpiders.add(ip);
            }
        }
        in.close();

        //Last but not least, add the ips to our file
        BufferedWriter output = new BufferedWriter(new FileWriter(spiderIpFile));

        //Write the new ips
        for (String ip : logSpiders) {
            System.out.println("Adding new ip: " + ip);
            //Write each new ip on a separate line
            output.write(ip + "\n");
        }

        output.flush();
        output.close();
    }
}
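A minimal sketch of driving the processor above from Java rather than the command line; the wrapper class and the log and output paths are hypothetical examples, not part of the commit.

import org.dspace.statistics.util.ApacheLogRobotsProcessor;

public class ApacheLogRobotsProcessorSketch {
    public static void main(String[] args) throws Exception {
        // Scan an Apache access log for robots.txt requesters and append
        // their IPs to the spider file (merged with any IPs already there).
        ApacheLogRobotsProcessor.main(new String[]{
                "-l", "/var/log/apache2/access.log", // hypothetical input log
                "-s", "/dspace/config/spiders.txt"   // hypothetical output file
        });
    }
}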
@@ -0,0 +1,184 @@
/**
 * $Id: $
 * $URL: $
 * *************************************************************************
 * Copyright (c) 2002-2009, DuraSpace. All rights reserved
 * Licensed under the DuraSpace Foundation License.
 *
 * A copy of the DuraSpace License has been included in this
 * distribution and is available at: http://scm.dspace.org/svn/repo/licenses/LICENSE.txt
 */
package org.dspace.statistics.util;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * A sparse v4 IPTable implementation that uses nested HashMaps
 * to optimize IP address matching over ranges of IP addresses.
 *
 * @author: mdiggory at atmire.com
 */
public class IPTable {

    /* A lookup tree for IP addresses and subnet ranges */
    private HashMap<String, HashMap<String, HashMap<String, HashSet<String>>>> map =
            new HashMap<String, HashMap<String, HashMap<String, HashSet<String>>>>();

    /**
     * Can be a full v4 IP, subnet or range string
     *
     * @param ip
     */
    public void add(String ip) throws IPFormatException {

        String[] start;

        String[] end;

        String[] range = ip.split("-");

        if (range.length >= 2) {

            start = range[0].trim().split("/")[0].split("\\.");
            end = range[1].trim().split("/")[0].split("\\.");

            if (start.length != 4 || end.length != 4)
                throw new IPFormatException(ip + " - Ranges need to be full IPv4 Addresses");

            if (!(start[0].equals(end[0]) && start[1].equals(end[1]) && start[2].equals(end[2]))) {
                throw new IPFormatException(ip + " - Ranges can only be across the last subnet x.y.z.0 - x.y.z.254");
            }

        } else {
            //need to ignore CIDR notation for the moment.
            //ip = ip.split("\\/")[0];

            String[] subnets = ip.split("\\.");

            if (subnets.length < 3) {
                throw new IPFormatException(ip + " - require at least three subnet places (e.g. 255.255.255.0)");
            }

            start = subnets;
            end = subnets;
        }

        if (start.length >= 3) {

            HashMap<String, HashMap<String, HashSet<String>>> first = map.get(start[0]);

            if (first == null) {
                first = new HashMap<String, HashMap<String, HashSet<String>>>();
                map.put(start[0], first);
            }

            HashMap<String, HashSet<String>> second = first.get(start[1]);

            if (second == null) {
                second = new HashMap<String, HashSet<String>>();
                first.put(start[1], second);
            }

            HashSet<String> third = second.get(start[2]);

            if (third == null) {
                third = new HashSet<String>();
                second.put(start[2], third);
            }

            //now populate the fourth place (* or value 0-254);

            if (start.length == 3) {
                third.add("*");
            }

            if (third.contains("*")) {
                return;
            }

            if (start.length >= 4) {
                int s = Integer.valueOf(start[3]);
                int e = Integer.valueOf(end[3]);
                for (int i = s; i <= e; i++) {
                    third.add(String.valueOf(i));
                }
            }
        }
    }

    public boolean contains(String ip) throws IPFormatException {

        String[] subnets = ip.split("\\.");

        if (subnets.length != 4)
            throw new IPFormatException("needs to be a single IP Address");

        HashMap<String, HashMap<String, HashSet<String>>> first = map.get(subnets[0]);

        if (first == null) return false;

        HashMap<String, HashSet<String>> second = first.get(subnets[1]);

        if (second == null) return false;

        HashSet<String> third = second.get(subnets[2]);

        if (third == null) return false;

        return third.contains(subnets[3]) || third.contains("*");

    }

    /**
     * @return
     */
    public Set<String> toSet() {
        HashSet<String> set = new HashSet<String>();

        for (Map.Entry<String, HashMap<String, HashMap<String, HashSet<String>>>> first : map.entrySet()) {
            String firstString = first.getKey();
            HashMap<String, HashMap<String, HashSet<String>>> secondMap = first.getValue();

            for (Map.Entry<String, HashMap<String, HashSet<String>>> second : secondMap.entrySet()) {
                String secondString = second.getKey();
                HashMap<String, HashSet<String>> thirdMap = second.getValue();

                for (Map.Entry<String, HashSet<String>> third : thirdMap.entrySet()) {
                    String thirdString = third.getKey();
                    HashSet<String> fourthSet = third.getValue();

                    if (fourthSet.contains("*")) {
                        set.add(firstString + "." + secondString + "." + thirdString);
                    } else {
                        for (String fourth : fourthSet) {
                            set.add(firstString + "." + secondString + "." + thirdString + "." + fourth);
                        }
                    }

                }
            }
        }

        return set;
    }

    /**
     * Exception Class to deal with IPFormat errors.
     */
    public class IPFormatException extends Exception {
        public IPFormatException(String s) {
            super(s);
        }
    }

}
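A minimal sketch exercising the IPTable semantics defined above (full address, three-place subnet wildcard, and last-place range); the wrapper class and sample addresses are made up for illustration.

import org.dspace.statistics.util.IPTable;

public class IPTableSketch {
    public static void main(String[] args) throws Exception {
        IPTable table = new IPTable();

        table.add("66.249.66.1");          // a single full IPv4 address
        table.add("192.168.2");            // three subnet places: matches 192.168.2.*
        table.add("10.0.0.5 - 10.0.0.20"); // a range within the last subnet place

        System.out.println(table.contains("192.168.2.254")); // true (wildcard)
        System.out.println(table.contains("10.0.0.15"));     // true (in range)
        System.out.println(table.contains("10.0.1.15"));     // false
    }
}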
@@ -10,123 +10,170 @@
 */
package org.dspace.statistics.util;

import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.CommandLine;
import org.apache.log4j.Logger;
import org.dspace.core.ConfigurationManager;
import org.dspace.statistics.SolrLogger;

import java.io.*;
import java.util.Vector;
import javax.servlet.http.HttpServletRequest;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

/**
 * SpiderDetector is used to find IPs that are spiders...
 * In future someone may add UserAgents and Host Domains
 * to the detection criteria here.
 *
 * @author kevinvandevelde at atmire.com
 * @author ben at atmire.com
 * @author Mark Diggory (mdiggory at atmire.com)
 */
public class SpiderDetector {

    private static Logger log = Logger.getLogger(SpiderDetector.class);

    /**
     * Creates a file containing spiders based on an apache logfile
     * by analyzing users of the robots.txt file
     * @param args
     * @throws Exception
     * Sparse HashTable structure to hold IP address ranges.
     */
    public static void main(String[] args) throws Exception{
        // create an options object and populate it
        CommandLineParser parser = new PosixParser();

        Options options = new Options();
        options.addOption("l", "logfile", true, "type: Input log file");
        options.addOption("s", "spiderfile", true, "type: Spider ip file");

        CommandLine line = parser.parse(options, args);

        String logFileLoc;
        String spiderIpPath;
        if(line.hasOption("l"))
            logFileLoc = line.getOptionValue("l");
        else{
            System.out.println("We need our log file");
            return;
        }
        if(line.hasOption("s"))
            spiderIpPath = line.getOptionValue("s");
        else{
            System.out.println("We need a spider ip output file");
            return;
        }

        //First read in our log file line per line
        BufferedReader in = new BufferedReader(new FileReader(logFileLoc));
        Vector<String> spiders = new Vector<String>();
        String logLine;
        while ((logLine = in.readLine()) != null){
            //Currently we only check whether robots.txt is present in the line
            if(logLine.contains("robots.txt")){
                //We got a robots.txt request, so we got a bot
                String ip = logLine.substring(0, logLine.indexOf("-")).trim();
                //Only add each ip address once; one occurrence is enough
                if(!spiders.contains(ip))
                    spiders.add(ip);
            }
        }
        in.close();

        //Get the output file
        File spiderIpFile = new File(spiderIpPath);
        //Get the ips already added in our file
        Vector<String> oldSpiderIds = new Vector<String>();
        if(spiderIpFile.exists())
            oldSpiderIds = readIpAddresses(spiderIpFile);

        Vector<String> newSpiderIds = new Vector<String>();

        //Now run over all these spiders & add them to our overview file,
        //but only if they are not already present
        for (int i = 0; i < spiders.size(); i++) {
            String spiderIp = spiders.elementAt(i);
            if(!oldSpiderIds.contains(spiderIp))
                newSpiderIds.add(spiderIp);
        }

        //Last but not least, add the ips to our file
        BufferedWriter output = new BufferedWriter(new FileWriter(spiderIpFile));
        //First write the old ips back so we don't lose any
        for (int i = 0; i < oldSpiderIds.size(); i++) {
            String ip = oldSpiderIds.elementAt(i);
            output.write(ip + "\n");
        }

        //Second, write the new ips
        for (int i = 0; i < newSpiderIds.size(); i++) {
            String ip = newSpiderIds.elementAt(i);
            System.out.println("Adding new ip: " + ip);
            //Write each new ip on a separate line
            output.write(ip + "\n");
        }

        output.flush();
        output.close();
    }

    private static IPTable table = null;

    /**
     * Reads the ip addresses out of a file & returns them in a vector
     * Utility method which reads the ip addresses out of a file & returns them in a Set
     *
     * @param spiderIpFile the location of our spider file
     * @return a vector full of ips
     * @throws IOException should not happen since we check the file before we use it
     */
    public static Vector<String> readIpAddresses(File spiderIpFile) throws IOException {
        Vector<String> ips = new Vector<String>();
        if(!spiderIpFile.exists())
    public static HashSet<String> readIpAddresses(File spiderIpFile) throws IOException {
        HashSet<String> ips = new HashSet<String>();

        if (!spiderIpFile.exists() || !spiderIpFile.isFile())
            return ips;

        //Read our file & get all the ips
        BufferedReader in = new BufferedReader(new FileReader(spiderIpFile));
        String ip;
        while((ip = in.readLine()) != null){
            ips.add(ip);
        String line;
        while ((line = in.readLine()) != null) {
            if (!line.startsWith("#")) {
                line = line.trim();

                if (!line.equals("") && !Character.isDigit(line.charAt(0))) {
                    // is a hostname
                    // add this functionality later...
                } else if (!line.equals("")) {
                    ips.add(line);
                    // is a full v4 ip (too tired to deal with v6)...
                }
            } else {
                //   ua.add(line.replaceFirst("#","").replaceFirst("UA","").trim());
                // ... add this functionality later
            }
        }
        in.close();
        return ips;
    }

    /**
     * Get an immutable Set representing all the spider addresses here
     *
     * @return
     */
    public static Set<String> getSpiderIpAddresses() {

        loadSpiderIpAddresses();
        return table.toSet();
    }

    /*
        private loader to populate the table from files.
     */
    private static void loadSpiderIpAddresses() {

        if (table == null) {
            table = new IPTable();

            String filePath = ConfigurationManager.getProperty("dspace.dir");

            try {
                File spidersDir = new File(filePath, "config/spiders");

                if (spidersDir.exists() && spidersDir.isDirectory()) {
                    for (File file : spidersDir.listFiles()) {
                        for (String ip : readIpAddresses(file)) {
                            table.add(ip);
                        }
                        log.info("Loaded Spider IP file: " + file);
                    }
                } else {
                    log.info("No spider file loaded");
                }

            }
            catch (Exception e) {
                log.error("Error Loading Spiders:" + e.getMessage(), e);
            }

        }

    }

    /**
     * Static service method for testing spiders against existing spider files.
     * <p/>
     * In the future this will be extended to support User Agent and
     * domain name detection.
     * <p/>
     * In future the spiders HashSet may be optimized as a byte offset array to
     * improve performance and memory footprint further.
     *
     * @param request
     * @return true|false if the request was detected to be from a spider
     */
    public static boolean isSpider(HttpServletRequest request) {

        if (SolrLogger.isUseProxies() && request.getHeader("X-Forwarded-For") != null) {
            /* This header is a comma delimited list */
            for (String xfip : request.getHeader("X-Forwarded-For").split(",")) {
                if (isSpider(xfip))
                    return true;
            }
        }

        return isSpider(request.getRemoteAddr());

    }

    /**
     * Check whether an individual IP is a spider.
     *
     * @param ip
     * @return if is spider IP
     */
    public static boolean isSpider(String ip) {

        if (table == null) {
            SpiderDetector.loadSpiderIpAddresses();
        }

        try {
            if (table.contains(ip)) {
                return true;
            }
        } catch (Exception e) {
            return false;
        }

        return false;

    }

}
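A short sketch of the SpiderDetector entry points added here; the wrapper class and the address are arbitrary examples, and it assumes spider IP files have been placed under [dspace.dir]/config/spiders.

import java.util.Set;
import org.dspace.statistics.util.SpiderDetector;

public class SpiderDetectorSketch {
    public static void main(String[] args) {
        // Single-address check; lazily loads the IP lists on first call.
        boolean bot = SpiderDetector.isSpider("66.249.66.1"); // example address only

        // Flattened view of every loaded spider address and wildcarded subnet.
        Set<String> known = SpiderDetector.getSpiderIpAddresses();
        System.out.println("spider entries: " + known.size() + ", sample flagged: " + bot);
    }
}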
@@ -2009,8 +2009,30 @@ harvester.unknownSchema = fail

##### Usage Logging #####
solr.log.server = ${dspace.baseUrl}/solr/statistics
solr.spidersfile = ${dspace.dir}/config/spiders.txt
solr.dbfile = ${dspace.dir}/config/GeoLiteCity.dat
useProxies = true

statistics.item.authorization.admin=true
# If enabled, the statistics system will look for an X-Forwarded-For header;
# if it finds one, it will use it for the user IP address.
# It is enabled by default.
# useProxies = true

# Control if the statistics pages should be only shown to authorized users
statistics.item.authorization.admin=true

# control solr statistics querying to filter out spider IPs
# false by default
# solr.statistics.query.filter.spiderIp = false

# control solr statistics querying to look at the "isBot" field to determine
# if a record is a bot. true by default.
# solr.statistics.query.filter.isBot = true

# URLs to download IP addresses of search engine spiders from
solr.spiderips.urls = http://iplists.com/google.txt, \
                      http://iplists.com/inktomi.txt, \
                      http://iplists.com/lycos.txt, \
                      http://iplists.com/infoseek.txt, \
                      http://iplists.com/altavista.txt, \
                      http://iplists.com/excite.txt, \
                      http://iplists.com/misc.txt, \
                      http://iplists.com/non_engines.txt
@@ -301,4 +301,12 @@
        </step>
    </command>

    <command>
        <name>update-spider-ips</name>
        <description>Update the list of known search engine IP addresses</description>
        <step>
            <class>org.dspace.statistics.util.DownloadSpiderIPs</class>
        </step>
    </command>

</commands>
@@ -289,7 +289,10 @@
    <field name="owningComm" type="integer" indexed="true" stored="true" required="false" multiValued="true" />
    <field name="owningColl" type="integer" indexed="true" stored="true" required="false" multiValued="true" />
    <field name="owningItem" type="integer" indexed="true" stored="true" required="false" multiValued="true" />
    <field name="dns" type="string" indexed="true" stored="true" required="false"/>
    <field name="userAgent" type="string" indexed="true" stored="true" required="false"/>
    <field name="isBot" type="boolean" indexed="true" stored="true" required="false"/>

</fields>