Add matching of the User-Agent header, for well-behaved bots with ever-expanding address pools.

While we're at it, stop skipping DNS names found in the IP address
files: each name is now resolved to an address.  Note that this is *not*
DNS *pattern* matching; it only resolves individual host names to IP
addresses at load time.
This commit is contained in:
Mark H. Wood
2013-06-20 15:16:43 -04:00
parent 8a02466391
commit 81f24f1aa0
3 changed files with 114 additions and 29 deletions

View File

@@ -80,7 +80,7 @@ public class ApacheLogRobotsProcessor {
if (spiderIpFile.exists()) if (spiderIpFile.exists())
{ {
logSpiders = SpiderDetector.readIpAddresses(spiderIpFile); logSpiders = SpiderDetector.readPatterns(spiderIpFile);
} }
else else
{ {

View File

@@ -14,12 +14,20 @@ import java.io.IOException;
/** /**
* XBill DNS resolver to retrieve hostnames for client IP addresses. * XBill DNS resolver to retrieve hostnames for client IP addresses.
* TODO: deal with IPv6 addresses.
* *
* @author kevinvandevelde at atmire.com * @author kevinvandevelde at atmire.com
* @author ben at atmire.com * @author ben at atmire.com
*/ */
public class DnsLookup { public class DnsLookup {
/**
* Resolve an IP address to a host name.
*
* @param hostIp dotted decimal IPv4 address.
* @return name if resolved, or the address.
* @throws IOException from infrastructure.
*/
public static String reverseDns(String hostIp) throws IOException { public static String reverseDns(String hostIp) throws IOException {
Resolver res = new ExtendedResolver(); Resolver res = new ExtendedResolver();
@@ -44,4 +52,45 @@ public class DnsLookup {
return answers[0].rdataToString(); return answers[0].rdataToString();
} }
} }
/**
 * Look up the IPv4 address (A record) of a host name.
 *
 * The resolver timeout, in milliseconds, is read from the
 * {@code resolver.timeout} property of the {@code usage-statistics}
 * configuration module, defaulting to 200.
 *
 * @param hostname the host name to resolve; it is interpreted relative
 *                 to the DNS root.
 * @return the textual form of the first A record in the answer section.
 * @throws IOException from infrastructure, or if the answer section is
 *                     empty or contains no A record.
 */
public static String forward(String hostname)
        throws IOException
{
    Resolver resolver = new ExtendedResolver();
    resolver.setTimeout(0, ConfigurationManager.getIntProperty(
            "usage-statistics", "resolver.timeout", 200));

    // Build and send a single "IN A" question for the requested name.
    Record question = Record.newRecord(
            Name.fromString(hostname, Name.root), Type.A, DClass.IN);
    Message reply = resolver.send(Message.newQuery(question));

    Record[] answers = reply.getSectionArray(Section.ANSWER);
    if (answers.length == 0)
    {
        throw new IOException("Unresolvable host name (empty response)");
    }

    // The answer section may hold other record types (e.g. aliases);
    // the first address record wins.
    for (Record candidate : answers)
    {
        if (candidate.getType() == Type.A)
        {
            return candidate.rdataToString();
        }
    }

    throw new IOException("Unresolvable host name (no A record)");
}
} }

View File

@@ -17,8 +17,9 @@ import java.util.List;
import java.util.Set; import java.util.Set;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletRequest;
import org.apache.log4j.Logger;
import org.dspace.core.ConfigurationManager; import org.dspace.core.ConfigurationManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/** /**
* SpiderDetector is used to find IP's that are spiders... * SpiderDetector is used to find IP's that are spiders...
@@ -31,7 +32,7 @@ import org.dspace.core.ConfigurationManager;
*/ */
public class SpiderDetector { public class SpiderDetector {
private static Logger log = Logger.getLogger(SpiderDetector.class); private static Logger log = LoggerFactory.getLogger(SpiderDetector.class);
private static Boolean useProxies; private static Boolean useProxies;
@@ -44,33 +45,31 @@ public class SpiderDetector {
private static List<Pattern> agents = new ArrayList<Pattern>(); private static List<Pattern> agents = new ArrayList<Pattern>();
/** /**
* Utility method which reads IP addresses from a file & returns them in a Set. * Utility method which reads lines from a file & returns them in a Set.
* *
* @param spiderIpFile the location of our spider file * @param patternFile the location of our spider file
* @return a vector full of IPs * @return a vector full of patterns
* @throws IOException could not happen since we check the file be4 we use it * @throws IOException could not happen since we check the file be4 we use it
*/ */
public static Set<String> readIpAddresses(File spiderIpFile) throws IOException { public static Set<String> readPatterns(File patternFile)
Set<String> ips = new HashSet<String>(); throws IOException
{
Set<String> patterns = new HashSet<String>();
if (!spiderIpFile.exists() || !spiderIpFile.isFile()) if (!patternFile.exists() || !patternFile.isFile())
{ {
return ips; return patterns;
} }
//Read our file & get all them ip's //Read our file & get all them patterns.
BufferedReader in = new BufferedReader(new FileReader(spiderIpFile)); BufferedReader in = new BufferedReader(new FileReader(patternFile));
String line; String line;
while ((line = in.readLine()) != null) { while ((line = in.readLine()) != null) {
if (!line.startsWith("#")) { if (!line.startsWith("#")) {
line = line.trim(); line = line.trim();
if (!line.equals("") && !Character.isDigit(line.charAt(0))) { if (!line.equals("")) {
// is a hostname patterns.add(line);
// add this functionality later...
} else if (!line.equals("")) {
ips.add(line);
// is full v4 ip (too tired to deal with v6)...
} }
} else { } else {
// ua.add(line.replaceFirst("#","").replaceFirst("UA","").trim()); // ua.add(line.replaceFirst("#","").replaceFirst("UA","").trim());
@@ -78,7 +77,7 @@ public class SpiderDetector {
} }
} }
in.close(); in.close();
return ips; return patterns;
} }
/** /**
@@ -124,12 +123,11 @@ public class SpiderDetector {
} }
/* /*
private loader to populate the table from files. * private loader to populate the table from files.
*/ */
private static void loadSpiderIpAddresses() { private static void loadSpiderIpAddresses() {
if (table == null) { if (table == null) {
table = new IPTable(); table = new IPTable();
@@ -140,7 +138,18 @@ public class SpiderDetector {
if (spidersDir.exists() && spidersDir.isDirectory()) { if (spidersDir.exists() && spidersDir.isDirectory()) {
for (File file : spidersDir.listFiles()) { for (File file : spidersDir.listFiles()) {
for (String ip : readIpAddresses(file)) { for (String ip : readPatterns(file)) {
log.debug("Loading {}", ip);
if (!Character.isDigit(ip.charAt(0)))
{
try {
ip = DnsLookup.forward(ip);
log.debug("Resolved to {}", ip);
} catch (IOException e) {
log.warn("Not loading {}: {}", ip, e.getMessage());
continue;
}
}
table.add(ip); table.add(ip);
} }
log.info("Loaded Spider IP file: " + file); log.info("Loaded Spider IP file: " + file);
@@ -148,18 +157,43 @@ public class SpiderDetector {
} else { } else {
log.info("No spider file loaded"); log.info("No spider file loaded");
} }
} }
catch (Exception e) { catch (Exception e) {
log.error("Error Loading Spiders:" + e.getMessage(), e); log.error("Error Loading Spiders:" + e.getMessage(), e);
} }
} }
} }
/**
 * Load agent name patterns from all files in config/spiders/agents.
 *
 * Every line returned by {@code readPatterns} for each file in that
 * directory is compiled as a regular expression and appended to the
 * shared {@code agents} list.  A file that cannot be read, or a line that
 * is not a valid regular expression, is logged and skipped so that the
 * remaining patterns are still loaded.
 */
private static void loadAgentPatterns()
{
    // isSpider() invokes this loader when the list is null and then
    // iterates over it, so there must be a list to fill afterwards even
    // if no pattern files are found.
    if (null == agents)
    {
        agents = new ArrayList<Pattern>();
    }

    String dspaceHome = ConfigurationManager.getProperty("dspace.dir");
    File agentsDir = new File(dspaceHome, "config/spiders/agents");

    // listFiles() yields null when the path is missing, is not a
    // directory, or cannot be listed because of an I/O error.
    File[] files = agentsDir.listFiles();
    if (null == files)
    {
        return;
    }

    for (File file : files)
    {
        Set<String> patterns;
        try
        {
            patterns = readPatterns(file);
        } catch (IOException ex)
        {
            log.error("Agent patterns not read from {}: {}",
                    file.getPath(), ex.getMessage());
            continue;
        }

        for (String pattern : patterns)
        {
            try
            {
                agents.add(Pattern.compile(pattern));
            } catch (IllegalArgumentException ex)
            {
                // PatternSyntaxException is an IllegalArgumentException;
                // one bad line must not discard the remaining patterns.
                log.error("Invalid agent pattern '{}' in {}: {}",
                        pattern, file.getPath(), ex.getMessage());
            }
        }
    }
}
/* TODO Load host name patterns from all files in config/spiders/dns. */
/** /**
* Static Service Method for testing spiders against existing spider files. * Static Service Method for testing spiders against existing spider files.
@@ -172,12 +206,15 @@ public class SpiderDetector {
* @param request * @param request
* @return true|false if the request was detected to be from a spider * @return true|false if the request was detected to be from a spider
*/ */
public static boolean isSpider(HttpServletRequest request) { public static boolean isSpider(HttpServletRequest request)
{
// See if any agent patterns match // See if any agent patterns match
String agent = request.getHeader("User-Agent"); String agent = request.getHeader("User-Agent");
if ((null != agent) && (null != agents)) if (null != agent)
{ {
if (null == agents)
loadAgentPatterns();
for (Pattern candidate : agents) for (Pattern candidate : agents)
{ {
if (candidate.matcher(agent).find()) if (candidate.matcher(agent).find())
@@ -199,7 +236,6 @@ public class SpiderDetector {
} }
return isSpider(request.getRemoteAddr()); return isSpider(request.getRemoteAddr());
} }
/** /**