mirror of
https://github.com/DSpace/DSpace.git
synced 2025-10-17 23:13:10 +00:00
Add matching of the User-Agent header, for well-behaved bots with ever-expanding address pools. While we're at it, stop ignoring DNS host names found in the IP address files: each host name is now resolved to an IP address when the file is loaded. Note that this is *not* DNS *pattern* matching; it only resolves individual host names to IP addresses at load time.
This commit is contained in:
@@ -80,7 +80,7 @@ public class ApacheLogRobotsProcessor {
|
|||||||
|
|
||||||
if (spiderIpFile.exists())
|
if (spiderIpFile.exists())
|
||||||
{
|
{
|
||||||
logSpiders = SpiderDetector.readIpAddresses(spiderIpFile);
|
logSpiders = SpiderDetector.readPatterns(spiderIpFile);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@@ -14,12 +14,20 @@ import java.io.IOException;
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* XBill DNS resolver to retrieve hostnames for client IP addresses.
|
* XBill DNS resolver to retrieve hostnames for client IP addresses.
|
||||||
|
* TODO: deal with IPv6 addresses.
|
||||||
*
|
*
|
||||||
* @author kevinvandevelde at atmire.com
|
* @author kevinvandevelde at atmire.com
|
||||||
* @author ben at atmire.com
|
* @author ben at atmire.com
|
||||||
*/
|
*/
|
||||||
public class DnsLookup {
|
public class DnsLookup {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Resolve an IP address to a host name.
|
||||||
|
*
|
||||||
|
* @param hostIp dotted decimal IPv4 address.
|
||||||
|
* @return name if resolved, or the address.
|
||||||
|
* @throws IOException from infrastructure.
|
||||||
|
*/
|
||||||
public static String reverseDns(String hostIp) throws IOException {
|
public static String reverseDns(String hostIp) throws IOException {
|
||||||
Resolver res = new ExtendedResolver();
|
Resolver res = new ExtendedResolver();
|
||||||
|
|
||||||
@@ -44,4 +52,45 @@ public class DnsLookup {
|
|||||||
return answers[0].rdataToString();
|
return answers[0].rdataToString();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Resolve a host name to an IPv4 address.
|
||||||
|
* @throws IOException from infrastructure or no resolution.
|
||||||
|
*/
|
||||||
|
public static String forward(String hostname)
|
||||||
|
throws IOException
|
||||||
|
{
|
||||||
|
Resolver res = new ExtendedResolver();
|
||||||
|
int timeout = ConfigurationManager.getIntProperty("usage-statistics",
|
||||||
|
"resolver.timeout", 200);
|
||||||
|
res.setTimeout(0, timeout);
|
||||||
|
|
||||||
|
Name name = Name.fromString(hostname, Name.root);
|
||||||
|
Record rec = Record.newRecord(name, Type.A, DClass.IN);
|
||||||
|
Message query = Message.newQuery(rec);
|
||||||
|
Message response = res.send(query);
|
||||||
|
|
||||||
|
Record[] answers = response.getSectionArray(Section.ANSWER);
|
||||||
|
if (answers.length == 0)
|
||||||
|
{
|
||||||
|
throw new IOException("Unresolvable host name (empty response)");
|
||||||
|
}
|
||||||
|
|
||||||
|
String resolution = null;
|
||||||
|
for (Record answer : answers)
|
||||||
|
{
|
||||||
|
if (answer.getType() == Type.A)
|
||||||
|
{
|
||||||
|
resolution = answer.rdataToString();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (null == resolution)
|
||||||
|
{
|
||||||
|
throw new IOException("Unresolvable host name (no A record)");
|
||||||
|
}
|
||||||
|
|
||||||
|
return resolution;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@@ -17,8 +17,9 @@ import java.util.List;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
import javax.servlet.http.HttpServletRequest;
|
import javax.servlet.http.HttpServletRequest;
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
import org.dspace.core.ConfigurationManager;
|
import org.dspace.core.ConfigurationManager;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* SpiderDetector is used to find IP's that are spiders...
|
* SpiderDetector is used to find IP's that are spiders...
|
||||||
@@ -31,7 +32,7 @@ import org.dspace.core.ConfigurationManager;
|
|||||||
*/
|
*/
|
||||||
public class SpiderDetector {
|
public class SpiderDetector {
|
||||||
|
|
||||||
private static Logger log = Logger.getLogger(SpiderDetector.class);
|
private static Logger log = LoggerFactory.getLogger(SpiderDetector.class);
|
||||||
|
|
||||||
private static Boolean useProxies;
|
private static Boolean useProxies;
|
||||||
|
|
||||||
@@ -44,33 +45,31 @@ public class SpiderDetector {
|
|||||||
private static List<Pattern> agents = new ArrayList<Pattern>();
|
private static List<Pattern> agents = new ArrayList<Pattern>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Utility method which reads IP addresses from a file & returns them in a Set.
|
* Utility method which reads lines from a file & returns them in a Set.
|
||||||
*
|
*
|
||||||
* @param spiderIpFile the location of our spider file
|
* @param patternFile the location of our spider file
|
||||||
* @return a vector full of IPs
|
* @return a vector full of patterns
|
||||||
* @throws IOException could not happen since we check the file be4 we use it
|
* @throws IOException could not happen since we check the file be4 we use it
|
||||||
*/
|
*/
|
||||||
public static Set<String> readIpAddresses(File spiderIpFile) throws IOException {
|
public static Set<String> readPatterns(File patternFile)
|
||||||
Set<String> ips = new HashSet<String>();
|
throws IOException
|
||||||
|
{
|
||||||
|
Set<String> patterns = new HashSet<String>();
|
||||||
|
|
||||||
if (!spiderIpFile.exists() || !spiderIpFile.isFile())
|
if (!patternFile.exists() || !patternFile.isFile())
|
||||||
{
|
{
|
||||||
return ips;
|
return patterns;
|
||||||
}
|
}
|
||||||
|
|
||||||
//Read our file & get all them ip's
|
//Read our file & get all them patterns.
|
||||||
BufferedReader in = new BufferedReader(new FileReader(spiderIpFile));
|
BufferedReader in = new BufferedReader(new FileReader(patternFile));
|
||||||
String line;
|
String line;
|
||||||
while ((line = in.readLine()) != null) {
|
while ((line = in.readLine()) != null) {
|
||||||
if (!line.startsWith("#")) {
|
if (!line.startsWith("#")) {
|
||||||
line = line.trim();
|
line = line.trim();
|
||||||
|
|
||||||
if (!line.equals("") && !Character.isDigit(line.charAt(0))) {
|
if (!line.equals("")) {
|
||||||
// is a hostname
|
patterns.add(line);
|
||||||
// add this functionality later...
|
|
||||||
} else if (!line.equals("")) {
|
|
||||||
ips.add(line);
|
|
||||||
// is full v4 ip (too tired to deal with v6)...
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// ua.add(line.replaceFirst("#","").replaceFirst("UA","").trim());
|
// ua.add(line.replaceFirst("#","").replaceFirst("UA","").trim());
|
||||||
@@ -78,7 +77,7 @@ public class SpiderDetector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
in.close();
|
in.close();
|
||||||
return ips;
|
return patterns;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -124,12 +123,11 @@ public class SpiderDetector {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
private loader to populate the table from files.
|
* private loader to populate the table from files.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
private static void loadSpiderIpAddresses() {
|
private static void loadSpiderIpAddresses() {
|
||||||
|
|
||||||
|
|
||||||
if (table == null) {
|
if (table == null) {
|
||||||
table = new IPTable();
|
table = new IPTable();
|
||||||
|
|
||||||
@@ -140,7 +138,18 @@ public class SpiderDetector {
|
|||||||
|
|
||||||
if (spidersDir.exists() && spidersDir.isDirectory()) {
|
if (spidersDir.exists() && spidersDir.isDirectory()) {
|
||||||
for (File file : spidersDir.listFiles()) {
|
for (File file : spidersDir.listFiles()) {
|
||||||
for (String ip : readIpAddresses(file)) {
|
for (String ip : readPatterns(file)) {
|
||||||
|
log.debug("Loading {}", ip);
|
||||||
|
if (!Character.isDigit(ip.charAt(0)))
|
||||||
|
{
|
||||||
|
try {
|
||||||
|
ip = DnsLookup.forward(ip);
|
||||||
|
log.debug("Resolved to {}", ip);
|
||||||
|
} catch (IOException e) {
|
||||||
|
log.warn("Not loading {}: {}", ip, e.getMessage());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
table.add(ip);
|
table.add(ip);
|
||||||
}
|
}
|
||||||
log.info("Loaded Spider IP file: " + file);
|
log.info("Loaded Spider IP file: " + file);
|
||||||
@@ -148,18 +157,43 @@ public class SpiderDetector {
|
|||||||
} else {
|
} else {
|
||||||
log.info("No spider file loaded");
|
log.info("No spider file loaded");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
catch (Exception e) {
|
catch (Exception e) {
|
||||||
log.error("Error Loading Spiders:" + e.getMessage(), e);
|
log.error("Error Loading Spiders:" + e.getMessage(), e);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Load agent name patterns from all files in config/spiders/agents. */
|
||||||
|
private static void loadAgentPatterns()
|
||||||
|
{
|
||||||
|
String dspaceHome = ConfigurationManager.getProperty("dspace.dir");
|
||||||
|
File agentsDir = new File(dspaceHome, "config/spiders/agents");
|
||||||
|
if (agentsDir.exists() && agentsDir.isDirectory())
|
||||||
|
{
|
||||||
|
for (File file : agentsDir.listFiles())
|
||||||
|
{
|
||||||
|
Set<String> patterns;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
patterns = readPatterns(file);
|
||||||
|
} catch (IOException ex)
|
||||||
|
{
|
||||||
|
log.error("Agent patterns not read from {}: {}",
|
||||||
|
file.getPath(), ex.getMessage());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
for (String pattern : patterns)
|
||||||
|
{
|
||||||
|
agents.add(Pattern.compile(pattern));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* TODO Load host name patterns from all files in config/spiders/dns. */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Static Service Method for testing spiders against existing spider files.
|
* Static Service Method for testing spiders against existing spider files.
|
||||||
@@ -172,12 +206,15 @@ public class SpiderDetector {
|
|||||||
* @param request
|
* @param request
|
||||||
* @return true|false if the request was detected to be from a spider
|
* @return true|false if the request was detected to be from a spider
|
||||||
*/
|
*/
|
||||||
public static boolean isSpider(HttpServletRequest request) {
|
public static boolean isSpider(HttpServletRequest request)
|
||||||
|
{
|
||||||
// See if any agent patterns match
|
// See if any agent patterns match
|
||||||
String agent = request.getHeader("User-Agent");
|
String agent = request.getHeader("User-Agent");
|
||||||
if ((null != agent) && (null != agents))
|
if (null != agent)
|
||||||
{
|
{
|
||||||
|
if (null == agents)
|
||||||
|
loadAgentPatterns();
|
||||||
|
|
||||||
for (Pattern candidate : agents)
|
for (Pattern candidate : agents)
|
||||||
{
|
{
|
||||||
if (candidate.matcher(agent).find())
|
if (candidate.matcher(agent).find())
|
||||||
@@ -199,7 +236,6 @@ public class SpiderDetector {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return isSpider(request.getRemoteAddr());
|
return isSpider(request.getRemoteAddr());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
Reference in New Issue
Block a user