mirror of
https://github.com/DSpace/DSpace.git
synced 2025-10-07 01:54:22 +00:00
[DS-2462] query.filter.spiderIp is redundant, incomplete, scales poorly
Remove IP-only, agent-only usage grooming. Use SpiderDetector to make all decisions based on the full array of detectors.
This commit is contained in:

committed by
Mark H. Wood

parent
fbd3d60223
commit
55d4a0dca2
@@ -772,83 +772,40 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void markRobotsByIP() {
|
||||
for (String ip : SpiderDetector.getSpiderIpAddresses()) {
|
||||
|
||||
try {
|
||||
|
||||
/* Result Process to alter record to be identified as a bot */
|
||||
ResultProcessor processor = new ResultProcessor() {
|
||||
@Override
|
||||
public void process(SolrInputDocument doc) throws IOException, SolrServerException {
|
||||
doc.removeField("isBot");
|
||||
doc.addField("isBot", true);
|
||||
solr.add(doc);
|
||||
log.info("Marked " + doc.getFieldValue("ip") + " as bot");
|
||||
}
|
||||
};
|
||||
|
||||
/* query for ip, exclude results previously set as bots. */
|
||||
processor.execute("ip:" + ip + "* AND -isBot:true");
|
||||
|
||||
solr.commit();
|
||||
|
||||
} catch (Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void markRobotByUserAgent(String agent) {
|
||||
try {
|
||||
|
||||
/* Result Process to alter record to be identified as a bot */
|
||||
ResultProcessor processor = new ResultProcessor() {
|
||||
@Override
|
||||
public void process(SolrInputDocument doc) throws IOException, SolrServerException {
|
||||
public void markRobots() {
|
||||
ResultProcessor processor = new ResultProcessor() {
|
||||
@Override
|
||||
public void process(SolrInputDocument doc)
|
||||
throws IOException, SolrServerException {
|
||||
String clientIP = (String) doc.getField("ip").getValue();
|
||||
String hostname = (String) doc.getField("dns").getValue();
|
||||
String agent = (String) doc.getField("userAgent").getValue();
|
||||
if (SpiderDetector.isSpider(clientIP, null, hostname, agent)) {
|
||||
doc.removeField("isBot");
|
||||
doc.addField("isBot", true);
|
||||
solr.add(doc);
|
||||
log.info("Marked {} / {} / {} as a robot in record {}.",
|
||||
clientIP, hostname, agent,
|
||||
doc.getField("uid").getValue());
|
||||
}
|
||||
};
|
||||
|
||||
/* query for user agent, exclude results previously set as bots. */
|
||||
processor.execute("userAgent:" + agent + " AND -isBot:true");
|
||||
}
|
||||
};
|
||||
|
||||
try {
|
||||
processor.execute("-isBot:true");
|
||||
solr.commit();
|
||||
} catch (Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
} catch (SolrServerException | IOException ex) {
|
||||
log.error("Failed while marking robot accesses.", ex);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void deleteRobotsByIsBotFlag() {
|
||||
public void deleteRobots() {
|
||||
try {
|
||||
solr.deleteByQuery("isBot:true");
|
||||
} catch (Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void deleteIP(String ip) {
|
||||
try {
|
||||
solr.deleteByQuery("ip:" + ip + "*");
|
||||
} catch (Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void deleteRobotsByIP() {
|
||||
for (String ip : SpiderDetector.getSpiderIpAddresses()) {
|
||||
deleteIP(ip);
|
||||
} catch (IOException | SolrServerException e) {
|
||||
log.error("Failed while deleting robot accesses.", e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1117,7 +1074,7 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
|
||||
String facetQuery = facetQueries.get(i);
|
||||
solrQuery.addFacetQuery(facetQuery);
|
||||
}
|
||||
if (0 < facetQueries.size()) {
|
||||
if (!facetQueries.isEmpty()) {
|
||||
solrQuery.setFacet(true);
|
||||
}
|
||||
}
|
||||
@@ -1135,12 +1092,6 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
|
||||
// performance and ensure the search result ordering will
|
||||
// not be influenced
|
||||
|
||||
// Choose to filter by the legacy spider IP list (may get too long to properly filter all IPs)
|
||||
if (defaultFilterQueries && configurationService.getBooleanProperty(
|
||||
"solr-statistics.query.filter.spiderIp", false)) {
|
||||
solrQuery.addFilterQuery(getIgnoreSpiderIPs());
|
||||
}
|
||||
|
||||
// Choose to filter by isBot field, may be overridden in future
|
||||
// to allow views on stats based on bots.
|
||||
if (defaultFilterQueries && configurationService.getBooleanProperty(
|
||||
@@ -1156,7 +1107,7 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
|
||||
if (defaultFilterQueries && bundles != null && bundles.length > 0) {
|
||||
|
||||
/**
|
||||
* The code below creates a query that will allow only records which do not have a bundlename
|
||||
* The code below creates a query that will allow only records which do not have a bundle name
|
||||
* (items, collections, ...) or bitstreams that have a configured bundle name
|
||||
*/
|
||||
StringBuilder bundleQuery = new StringBuilder();
|
||||
@@ -1702,6 +1653,7 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
|
||||
statisticYearCoresInit = true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object anonymizeIp(String ip) throws UnknownHostException {
|
||||
InetAddress address = InetAddress.getByName(ip);
|
||||
if (address instanceof Inet4Address) {
|
||||
|
@@ -111,15 +111,18 @@ public interface SolrLoggerService {
|
||||
List oldFieldVals, String field)
|
||||
throws IOException;
|
||||
|
||||
public void markRobotsByIP();
|
||||
/**
|
||||
* Scan the entire 'statistics' collection for documents that should be
|
||||
* marked 'isBot:true' according to
|
||||
* {@link org.dspace.statistics.util.SpiderDetector#isSpider(java.lang.String,
|
||||
* java.lang.String, java.lang.String, java.lang.String)}.
|
||||
*/
|
||||
public void markRobots();
|
||||
|
||||
public void markRobotByUserAgent(String agent);
|
||||
|
||||
public void deleteRobotsByIsBotFlag();
|
||||
|
||||
public void deleteIP(String ip);
|
||||
|
||||
public void deleteRobotsByIP();
|
||||
/**
|
||||
* Delete all 'statistics' documents having 'isBot:true'.
|
||||
*/
|
||||
public void deleteRobots();
|
||||
|
||||
/*
|
||||
* update(String query, boolean addField, String fieldName, Object
|
||||
|
@@ -66,7 +66,6 @@ public class StatisticsClient {
|
||||
|
||||
options.addOption("m", "mark-spiders", false, "Update isBot Flag in Solr");
|
||||
options.addOption("f", "delete-spiders-by-flag", false, "Delete Spiders in Solr By isBot Flag");
|
||||
options.addOption("i", "delete-spiders-by-ip", false, "Delete Spiders in Solr By IP Address");
|
||||
options.addOption("b", "reindex-bitstreams", false, "Reindex the bitstreams to ensure we have the bundle name");
|
||||
options.addOption("e", "export", false,
|
||||
"Export SOLR view statistics data to usage-statistics-intermediate-format");
|
||||
@@ -87,11 +86,9 @@ public class StatisticsClient {
|
||||
if (line.hasOption("u")) {
|
||||
StatisticsClient.updateSpiderFiles();
|
||||
} else if (line.hasOption('m')) {
|
||||
solrLoggerService.markRobotsByIP();
|
||||
solrLoggerService.markRobots();
|
||||
} else if (line.hasOption('f')) {
|
||||
solrLoggerService.deleteRobotsByIsBotFlag();
|
||||
} else if (line.hasOption('i')) {
|
||||
solrLoggerService.deleteRobotsByIP();
|
||||
solrLoggerService.deleteRobots();
|
||||
} else if (line.hasOption('b')) {
|
||||
solrLoggerService.reindexBitstreamHits(line.hasOption('r'));
|
||||
} else if (line.hasOption('e')) {
|
||||
@@ -104,7 +101,7 @@ public class StatisticsClient {
|
||||
}
|
||||
|
||||
/**
|
||||
* Method to update Spiders in config directory.
|
||||
* Method to update Spiders in configuration directory.
|
||||
*/
|
||||
private static void updateSpiderFiles() {
|
||||
try {
|
||||
|
@@ -19,10 +19,6 @@ solr-statistics.query.filter.bundles=ORIGINAL
|
||||
# create new Solr cores when sharding the statistics data.
|
||||
solr-statistics.configset = statistics
|
||||
|
||||
# control solr statistics querying to filter out spider IPs
|
||||
# false by default
|
||||
#solr-statistics.query.filter.spiderIp = false
|
||||
|
||||
# control solr statistics querying to look at "isBot" field to determine
|
||||
# if record is a bot. true by default.
|
||||
#solr-statistics.query.filter.isBot = true
|
||||
|
Reference in New Issue
Block a user