[DS-2462] query.filter.spiderIp is redundant, incomplete, scales poorly

Remove IP-only, agent-only usage grooming.  Use SpiderDetector to make
all decisions based on the full array of detectors.
This commit is contained in:
Mark H. Wood
2020-07-03 11:25:09 -04:00
committed by Mark H. Wood
parent fbd3d60223
commit 55d4a0dca2
4 changed files with 38 additions and 90 deletions

View File

@@ -772,83 +772,40 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
}
}
@Override
public void markRobotsByIP() {
    for (String spiderIp : SpiderDetector.getSpiderIpAddresses()) {
        // Wildcard match on the listed address, limited to records not yet flagged as bots.
        final String unflaggedHits = "ip:" + spiderIp + "* AND -isBot:true";
        try {
            new ResultProcessor() {
                /** Re-add one matching record with its isBot field forced to true. */
                @Override
                public void process(SolrInputDocument hit) throws IOException, SolrServerException {
                    hit.removeField("isBot");
                    hit.addField("isBot", true);
                    solr.add(hit);
                    log.info("Marked " + hit.getFieldValue("ip") + " as bot");
                }
            }.execute(unflaggedHits);
            solr.commit();
        } catch (Exception failure) {
            // A failure for one address is logged; the loop continues with the next address.
            log.error(failure.getMessage(), failure);
        }
    }
}
// NOTE(review): this span appears to interleave the removed
// markRobotByUserAgent(String) with its replacement markRobots(); the
// diff markers are not visible here, so the line order is left untouched.
@Override
public void markRobotByUserAgent(String agent) {
try {
/* Result Process to alter record to be identified as a bot */
ResultProcessor processor = new ResultProcessor() {
@Override
public void process(SolrInputDocument doc) throws IOException, SolrServerException {
/**
* Run every record not already flagged 'isBot:true' through
* SpiderDetector.isSpider(...), flag the ones it reports as robots,
* then commit. Failures are logged rather than rethrown.
*/
public void markRobots() {
ResultProcessor processor = new ResultProcessor() {
@Override
public void process(SolrInputDocument doc)
throws IOException, SolrServerException {
// getFieldValue() yields null for a field the record does not carry,
// whereas getField(name).getValue() would throw NullPointerException
// on such a record (e.g. one stored without a "dns" value).
String clientIP = (String) doc.getFieldValue("ip");
String hostname = (String) doc.getFieldValue("dns");
String agent = (String) doc.getFieldValue("userAgent");
// NOTE(review): assumes isSpider() tolerates null arguments, as the
// explicit null passed for its second parameter suggests — confirm.
if (SpiderDetector.isSpider(clientIP, null, hostname, agent)) {
doc.removeField("isBot");
doc.addField("isBot", true);
solr.add(doc);
log.info("Marked {} / {} / {} as a robot in record {}.",
clientIP, hostname, agent,
doc.getFieldValue("uid"));
}
};
/* query for user agent, exclude results previously set as bots. */
processor.execute("userAgent:" + agent + " AND -isBot:true");
}
};
try {
// Visit only records that are not yet flagged.
processor.execute("-isBot:true");
solr.commit();
} catch (Exception e) {
log.error(e.getMessage(), e);
} catch (SolrServerException | IOException ex) {
log.error("Failed while marking robot accesses.", ex);
}
}
// NOTE(review): two method headers appear back to back below; this looks
// like a rename (deleteRobotsByIsBotFlag -> deleteRobots) shown without
// diff markers — confirm against the actual file.
@Override
public void deleteRobotsByIsBotFlag() {
public void deleteRobots() {
try {
// Remove every record whose isBot field is true.
solr.deleteByQuery("isBot:true");
} catch (Exception e) {
// Failures are logged, not propagated to the caller.
log.error(e.getMessage(), e);
}
}
@Override
public void deleteIP(String ip) {
    // Trailing wildcard: the query matches every record whose "ip" value starts with the given text.
    final String byIpPrefix = "ip:" + ip + "*";
    try {
        solr.deleteByQuery(byIpPrefix);
    } catch (Exception failure) {
        // Failures are logged, not propagated to the caller.
        log.error(failure.getMessage(), failure);
    }
}
// Calls deleteIP(ip) once for each address returned by
// SpiderDetector.getSpiderIpAddresses().
@Override
public void deleteRobotsByIP() {
for (String ip : SpiderDetector.getSpiderIpAddresses()) {
deleteIP(ip);
// NOTE(review): no matching try is visible in this method; the catch
// clause below presumably belongs to the rewritten deleteRobots() and
// appears here only because diff markers were lost — confirm.
} catch (IOException | SolrServerException e) {
log.error("Failed while deleting robot accesses.", e);
}
}
@@ -1117,7 +1074,7 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
String facetQuery = facetQueries.get(i);
solrQuery.addFacetQuery(facetQuery);
}
if (0 < facetQueries.size()) {
if (!facetQueries.isEmpty()) {
solrQuery.setFacet(true);
}
}
@@ -1135,12 +1092,6 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
// performance and ensure the search result ordering will
// not be influenced
// Choose to filter by the legacy spider IP list (may get too long to properly filter all IPs)
if (defaultFilterQueries && configurationService.getBooleanProperty(
"solr-statistics.query.filter.spiderIp", false)) {
solrQuery.addFilterQuery(getIgnoreSpiderIPs());
}
// Choose to filter by isBot field, may be overridden in future
// to allow views on stats based on bots.
if (defaultFilterQueries && configurationService.getBooleanProperty(
@@ -1156,7 +1107,7 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
if (defaultFilterQueries && bundles != null && bundles.length > 0) {
/**
* The code below creates a query that will allow only records which do not have a bundlename
* The code below creates a query that will allow only records which do not have a bundle name
* (items, collections, ...) or bitstreams that have a configured bundle name
*/
StringBuilder bundleQuery = new StringBuilder();
@@ -1702,6 +1653,7 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
statisticYearCoresInit = true;
}
@Override
public Object anonymizeIp(String ip) throws UnknownHostException {
InetAddress address = InetAddress.getByName(ip);
if (address instanceof Inet4Address) {

View File

@@ -111,15 +111,18 @@ public interface SolrLoggerService {
List oldFieldVals, String field)
throws IOException;
/**
* Mark as 'isBot:true' every not-yet-marked document whose 'ip' begins
* with an address listed by SpiderDetector.getSpiderIpAddresses()
* (behaviour as implemented in SolrLoggerServiceImpl).
*/
public void markRobotsByIP();
/**
* Scan the entire 'statistics' collection for documents that should be
* marked 'isBot:true' according to
* {@link org.dspace.statistics.util.SpiderDetector#isSpider(java.lang.String,
* java.lang.String, java.lang.String, java.lang.String)}.
*/
public void markRobots();
/**
* Mark as 'isBot:true' every not-yet-marked document matched by a
* 'userAgent' query for the given value (behaviour as implemented in
* SolrLoggerServiceImpl).
*
* @param agent user agent text; SolrLoggerServiceImpl inserts it into the
* query as given.
*/
public void markRobotByUserAgent(String agent);
/**
* Delete all documents having 'isBot:true'.
*/
public void deleteRobotsByIsBotFlag();
/**
* Delete all documents whose 'ip' begins with the given text (behaviour as
* implemented in SolrLoggerServiceImpl).
*
* @param ip address, or leading part of one, to match.
*/
public void deleteIP(String ip);
/**
* Call {@link #deleteIP(String)} for each address listed by
* SpiderDetector.getSpiderIpAddresses() (behaviour as implemented in
* SolrLoggerServiceImpl).
*/
public void deleteRobotsByIP();
/**
* Delete all 'statistics' documents having 'isBot:true'.
*/
public void deleteRobots();
/*
* update(String query, boolean addField, String fieldName, Object

View File

@@ -66,7 +66,6 @@ public class StatisticsClient {
options.addOption("m", "mark-spiders", false, "Update isBot Flag in Solr");
options.addOption("f", "delete-spiders-by-flag", false, "Delete Spiders in Solr By isBot Flag");
options.addOption("i", "delete-spiders-by-ip", false, "Delete Spiders in Solr By IP Address");
options.addOption("b", "reindex-bitstreams", false, "Reindex the bitstreams to ensure we have the bundle name");
options.addOption("e", "export", false,
"Export SOLR view statistics data to usage-statistics-intermediate-format");
@@ -87,11 +86,9 @@ public class StatisticsClient {
if (line.hasOption("u")) {
StatisticsClient.updateSpiderFiles();
} else if (line.hasOption('m')) {
solrLoggerService.markRobotsByIP();
solrLoggerService.markRobots();
} else if (line.hasOption('f')) {
solrLoggerService.deleteRobotsByIsBotFlag();
} else if (line.hasOption('i')) {
solrLoggerService.deleteRobotsByIP();
solrLoggerService.deleteRobots();
} else if (line.hasOption('b')) {
solrLoggerService.reindexBitstreamHits(line.hasOption('r'));
} else if (line.hasOption('e')) {
@@ -104,7 +101,7 @@ public class StatisticsClient {
}
/**
* Method to update Spiders in config directory.
* Method to update Spiders in configuration directory.
*/
private static void updateSpiderFiles() {
try {

View File

@@ -19,10 +19,6 @@ solr-statistics.query.filter.bundles=ORIGINAL
# create new Solr cores when sharding the statistics data.
solr-statistics.configset = statistics
# control solr statistics querying to filter out spider IPs
# false by default
#solr-statistics.query.filter.spiderIp = false
# control solr statistics querying to look at "isBot" field to determine
# if record is a bot. true by default.
#solr-statistics.query.filter.isBot = true