DS-4440 GDPR - Anonymize statistics feature

This commit is contained in:
Samuel
2020-02-25 14:16:18 +01:00
committed by Samuel
parent a5fba06861
commit 810b9c3bbc
4 changed files with 425 additions and 20 deletions

View File

@@ -0,0 +1,274 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.statistics;
import static java.lang.Integer.parseInt;
import static java.lang.Thread.currentThread;
import static java.lang.Thread.sleep;
import static java.util.Arrays.asList;
import static java.util.Calendar.DAY_OF_YEAR;
import static java.util.Collections.singletonList;
import static org.apache.commons.cli.Option.builder;
import static org.apache.commons.lang.StringUtils.isNotBlank;
import static org.apache.commons.lang.time.DateFormatUtils.format;
import static org.apache.log4j.Logger.getLogger;
import static org.dspace.core.LogManager.getHeader;
import static org.dspace.statistics.SolrLoggerServiceImpl.DATE_FORMAT_8601;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.dspace.core.Context;
import org.dspace.services.ConfigurationService;
import org.dspace.services.factory.DSpaceServicesFactory;
import org.dspace.statistics.factory.StatisticsServiceFactory;
import org.dspace.statistics.service.SolrLoggerService;
/**
 * Command-line script that anonymises the client information held in the Solr statistics records.
 * <p>
 * The script repeatedly queries for records that have an {@code ip} value, whose {@code time} is not
 * later than the configured limit ({@code anonymise_statistics.time_limit}, in days, default 90) and
 * whose {@code dns} value differs from the configured mask ({@code anonymise_statistics.dns_mask},
 * default "anonymised"). Each such record gets its {@code ip} replaced by the result of
 * {@link SolrLoggerService#anonymiseIp(String)} and its {@code dns} replaced by the mask.
 * <p>
 * Supported options: {@code -h} (help), {@code -s} (sleep in ms before each Solr query),
 * {@code -b} (batch size, default 100) and {@code -t} (number of worker threads, default 2).
 */
public class AnonymizeStatistics {

    private static Logger log = getLogger(AnonymizeStatistics.class);
    private static Context context = new Context();
    private static String action = "anonymise_statistics";

    private static final String HELP_OPTION = "h";
    private static final String SLEEP_OPTION = "s";
    private static final String BATCH_SIZE_OPTION = "b";
    private static final String THREADS_OPTION = "t";

    /** Milliseconds to sleep before every Solr query; 0 (the default) disables sleeping. */
    private static int sleep;

    private static SolrLoggerService solrLoggerService =
        StatisticsServiceFactory.getInstance().getSolrLoggerService();
    private static ConfigurationService configurationService =
        DSpaceServicesFactory.getInstance().getConfigurationService();

    /** Number of Solr records requested per query. */
    private static int batchSize = 100;
    /** Size of the thread pool that performs the updates. */
    private static int threads = 2;

    /** Value written to the {@code dns} field; also used to recognise already processed records. */
    private static final Object ANONYMISED =
        configurationService.getProperty("anonymise_statistics.dns_mask", "anonymised");

    /** Upper bound (formatted with DATE_FORMAT_8601) of the {@code time} range that is anonymised. */
    private static final String TIME_LIMIT;

    static {
        Calendar calendar = Calendar.getInstance();
        calendar.add(DAY_OF_YEAR, -configurationService.getIntProperty("anonymise_statistics.time_limit", 90));
        TIME_LIMIT = format(calendar, DATE_FORMAT_8601);
    }

    private AnonymizeStatistics() {
    }

    /**
     * Entry point of the script: parses the options and runs the anonymisation.
     *
     * @param args the command-line arguments
     * @throws ParseException when the arguments cannot be parsed
     */
    public static void main(String... args) throws ParseException {
        parseCommandLineOptions(createCommandLineOptions(), args);
        anonymiseStatistics();
    }

    /**
     * Builds the definition of the options understood by the script.
     *
     * @return the supported command-line options
     */
    private static Options createCommandLineOptions() {
        Options options = new Options();
        options.addOption(
            builder(HELP_OPTION)
                .longOpt("help")
                .desc("Print the usage of the script")
                .hasArg(false)
                .build()
        );
        options.addOption(
            builder(SLEEP_OPTION)
                .longOpt("sleep")
                .desc("Sleep a certain time between each solr request")
                .hasArg(true)
                .build()
        );
        options.addOption(
            builder(BATCH_SIZE_OPTION)
                .longOpt("batch")
                .desc("The amount of Solr records to be processed per batch (defaults to 100)")
                .hasArg(true)
                .build()
        );
        options.addOption(
            builder(THREADS_OPTION)
                .longOpt("threads")
                .desc("The amount of threads used by the script (defaults to 2)")
                .hasArg(true)
                .build()
        );
        return options;
    }

    /**
     * Applies the given arguments to the static settings of the script.
     * Prints the usage and exits the JVM when the help option is present.
     *
     * @param options the supported options
     * @param args    the raw command-line arguments
     * @throws ParseException when the arguments do not match the options
     */
    private static void parseCommandLineOptions(Options options, String... args) throws ParseException {
        CommandLine commandLine = new DefaultParser().parse(options, args);
        if (commandLine.hasOption(HELP_OPTION)) {
            printHelp(options);
            System.exit(0);
        }
        if (commandLine.hasOption(SLEEP_OPTION)) {
            sleep = parseInt(commandLine.getOptionValue(SLEEP_OPTION));
        }
        if (commandLine.hasOption(BATCH_SIZE_OPTION)) {
            batchSize = parseInt(commandLine.getOptionValue(BATCH_SIZE_OPTION));
        }
        if (commandLine.hasOption(THREADS_OPTION)) {
            threads = parseInt(commandLine.getOptionValue(THREADS_OPTION));
        }
    }

    /** Prints the usage of the script to standard output. */
    private static void printHelp(Options options) {
        new HelpFormatter().printHelp("dsrun " + AnonymizeStatistics.class.getCanonicalName(), options);
    }

    /** Writes an informational message to standard output and to the log. */
    private static void printInfo(String info) {
        System.out.println(info);
        log.info(getHeader(context, action, info));
    }

    /** Writes a warning to standard output and to the log. */
    private static void printWarning(String warning) {
        System.out.println(warning);
        log.warn(getHeader(context, action, warning));
    }

    /** Prints the stack trace of the error and writes it to the log. */
    private static void printError(Exception error) {
        error.printStackTrace();
        log.error(getHeader(context, action, error.getMessage()), error);
    }

    /**
     * Anonymises the matching records batch by batch.
     * <p>
     * Each batch is processed by the thread pool and committed afterwards. The loop ends when Solr
     * reports no more matching records, or when a batch did not produce a single successful update.
     * Without the second condition, records that can never be updated would be fetched forever.
     */
    private static void anonymiseStatistics() {
        ExecutorService executorService = null;
        try {
            // "attempted" only numbers the log lines, "updated" counts the confirmed successes
            long attempted = 0;
            long updated = 0;
            long total = getDocuments().getResults().getNumFound();
            printInfo(total + " documents to update");

            executorService = Executors.newFixedThreadPool(threads);

            QueryResponse documents;
            long updatedInBatch;
            do {
                documents = getDocuments();

                Collection<Callable<Boolean>> callables = new ArrayList<>();
                Set<String> shards = new HashSet<>();
                for (SolrDocument document : documents.getResults()) {
                    attempted++;
                    callables.add(new DoProcessing(document, attempted));
                    String shard = (String) document.getFieldValue("[shard]");
                    if (isNotBlank(shard)) {
                        shards.add(shard);
                    }
                }

                updatedInBatch = 0;
                for (Future<Boolean> outcome : executorService.invokeAll(callables)) {
                    if (Boolean.TRUE.equals(outcome.get())) {
                        updatedInBatch++;
                    }
                }
                updated += updatedInBatch;

                // the individual updates are sent without commit, so commit once per batch
                solrLoggerService.commit();
                for (String shard : shards) {
                    solrLoggerService.commitShard(shard);
                }

                System.out.println("processed " + attempted + " records");
            } while (updatedInBatch > 0 && documents.getResults().getNumFound() > 0);

            printInfo(updated + " documents updated");
            if (updated == total) {
                printInfo("all relevant documents were updated");
            } else {
                printWarning("not all relevant documents were updated, check the DSpace logs for more details");
            }
        } catch (InterruptedException e) {
            printError(e);
            currentThread().interrupt();
        } catch (Exception e) {
            printError(e);
        } finally {
            // the pool threads are not daemon threads: without a shutdown the JVM would never exit
            if (executorService != null) {
                executorService.shutdown();
            }
        }
    }

    /**
     * Retrieves the next batch of records that still have to be anonymised, after sleeping for the
     * configured time (if any).
     *
     * @return the Solr response, holding at most {@code batchSize} records including their {@code [shard]}
     * @throws SolrServerException when the query fails
     * @throws IOException         when the query fails
     */
    private static QueryResponse getDocuments() throws SolrServerException, IOException {
        if (sleep > 0) {
            try {
                printInfo("sleep " + sleep + "ms");
                sleep(sleep);
            } catch (InterruptedException e) {
                printError(e);
                currentThread().interrupt();
            }
        }
        // arguments after the filter query: facetField, rows, max, dateType, dateStart, dateEnd,
        // facetQueries, sort, ascending, facetMinCount (unused, no facet is requested),
        // defaultFilterQueries (off: bot and spider records must be anonymised as well),
        // includeShardField (on: the shard is needed to commit the update)
        return solrLoggerService.query(
            "ip:*",
            "time:[* TO " + TIME_LIMIT + "] AND -dns:" + ANONYMISED,
            null, batchSize, -1, null, null, null, null, null, false, -1, false, true
        );
    }

    /**
     * Task that anonymises a single Solr record.
     */
    public static class DoProcessing implements Callable<Boolean> {

        private final SolrDocument document;
        private final long updated;

        /**
         * @param document the record to anonymise
         * @param updated  the sequence number of the record, only used in the log output
         */
        public DoProcessing(SolrDocument document, long updated) {
            this.document = document;
            this.updated = updated;
        }

        /**
         * Replaces the {@code ip} and {@code dns} values of the record, without committing.
         *
         * @return true when the record was updated, false when the update failed (the error is logged)
         */
        @Override
        public Boolean call() {
            try {
                solrLoggerService.update(
                    "uid:" + document.getFieldValue("uid"),
                    "replace",
                    asList(
                        "ip",
                        "dns"
                    ),
                    asList(
                        singletonList(solrLoggerService.anonymiseIp(document.getFieldValue("ip").toString())),
                        singletonList(ANONYMISED)
                    ),
                    false
                );
                printInfo(updated + ": updated document with uid " + document.getFieldValue("uid") + " " + new Date());
                return true;
            } catch (Exception e) {
                printError(e);
                return false;
            }
        }
    }
}

View File

@@ -7,6 +7,8 @@
*/
package org.dspace.statistics;
import static org.apache.commons.lang.StringUtils.substringAfterLast;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
@@ -15,9 +17,12 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.Inet4Address;
import java.net.Inet6Address;
import java.net.InetAddress;
import java.net.URI;
import java.net.URLEncoder;
import java.net.UnknownHostException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;
@@ -127,8 +132,12 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
protected boolean useProxies;
private static final List<String> statisticYearCores = new ArrayList<>();
private static final Map<String, HttpSolrClient> statisticYearCoreServers = new HashMap<>();
private static boolean statisticYearCoresInit = false;
private static final String IP_V4_REGEX = "^((?:\\d{1,3}\\.){3})\\d{1,3}$";
private static final String IP_V6_REGEX = "^(.*):.*:.*$";
@Autowired(required = true)
protected BitstreamService bitstreamService;
@Autowired(required = true)
@@ -330,7 +339,15 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
if (request != null) {
String ip = clientInfoService.getClientIp(request);
doc1.addField("ip", ip);
if (configurationService.getBooleanProperty("anonymise_statistics.anonymise_on_log", false)) {
try {
doc1.addField("ip", anonymiseIp(ip));
} catch (UnknownHostException e) {
log.warn(e.getMessage(), e);
}
} else {
doc1.addField("ip", ip);
}
//Also store the referrer
if (request.getHeader("referer") != null) {
@@ -338,7 +355,10 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
}
try {
String dns = DnsLookup.reverseDns(ip);
String dns = configurationService.getProperty("anonymise_statistics.dns_mask", "anonymised");
if (!configurationService.getBooleanProperty("anonymise_statistics.anonymise_on_log", false)) {
dns = DnsLookup.reverseDns(ip);
}
doc1.addField("dns", dns.toLowerCase());
} catch (Exception e) {
log.info("Failed DNS Lookup for IP:" + ip);
@@ -406,10 +426,21 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
// Save our basic info that we already have
ip = clientInfoService.getClientIp(ip, xforwardedfor);
doc1.addField("ip", ip);
if (configurationService.getBooleanProperty("anonymise_statistics.anonymise_on_log", false)) {
try {
doc1.addField("ip", anonymiseIp(ip));
} catch (UnknownHostException e) {
log.warn(e.getMessage(), e);
}
} else {
doc1.addField("ip", ip);
}
try {
String dns = DnsLookup.reverseDns(ip);
String dns = configurationService.getProperty("anonymise_statistics.dns_mask", "anonymised");
if (!configurationService.getBooleanProperty("anonymise_statistics.anonymise_on_log", false)) {
dns = DnsLookup.reverseDns(ip);
}
doc1.addField("dns", dns.toLowerCase());
} catch (Exception e) {
log.info("Failed DNS Lookup for IP:" + ip);
@@ -652,6 +683,7 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
Map<String, String> params = new HashMap<>();
params.put("q", query);
params.put("rows", "10");
params.put("fl","[shard],*");
if (0 < statisticYearCores.size()) {
params.put(ShardParams.SHARDS, StringUtils.join(statisticYearCores.iterator(), ','));
}
@@ -793,29 +825,37 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
/**
 * Updates the records matching the query and commits the change.
 * Delegates to the five-argument variant with {@code commit} set to {@code true}.
 *
 * @param query           the Solr query selecting the records to update
 * @param action          the name of the update action, passed on unchanged
 * @param fieldNames      the names of the fields to update
 * @param fieldValuesList the values for each of the fields, in the order of {@code fieldNames}
 * @throws SolrServerException passed on from the delegated update
 * @throws IOException         passed on from the delegated update
 */
public void update(String query, String action,
List<String> fieldNames, List<List<Object>> fieldValuesList)
throws SolrServerException, IOException {
update(query, action, fieldNames, fieldValuesList, true);
}
@Override
public void update(String query, String action,
List<String> fieldNames, List<List<Object>> fieldValuesList, boolean commit)
throws SolrServerException, IOException {
// Since there is NO update
// We need to get our documents
// QueryResponse queryResponse = solr.query()//query(query, null, -1,
// null, null, null);
final List<SolrInputDocument> docsToUpdate = new ArrayList<>();
List<SolrInputDocument> docsToUpdate = new ArrayList<>();
ResultProcessor processor = new ResultProcessor() {
@Override
public void process(List<SolrInputDocument> docs)
throws IOException, SolrServerException {
docsToUpdate.addAll(docs);
public void process(SolrInputDocument document) {
docsToUpdate.add(document);
}
};
processor.execute(query);
// We have all the docs delete the ones we don't need
solr.deleteByQuery(query);
// Add the new (updated onces
for (int i = 0; i < docsToUpdate.size(); i++) {
SolrInputDocument solrDocument = docsToUpdate.get(i);
HttpSolrClient shard = getSolrServer(solrDocument.getFieldValue("[shard]").toString());
shard.deleteByQuery("uid:" + solrDocument.getFieldValue("uid"));
// Now loop over our fieldname actions
for (int j = 0; j < fieldNames.size(); j++) {
String fieldName = fieldNames.get(j);
@@ -832,7 +872,7 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
} else if (action.equals("remOne")) {
// Remove the field
java.util.Collection<Object> values = solrDocument
.getFieldValues(fieldName);
.getFieldValues(fieldName);
solrDocument.removeField(fieldName);
for (Object value : values) {
// Keep all the values besides the one we need to remove
@@ -842,9 +882,17 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
}
}
}
solr.add(solrDocument);
solrDocument.removeField("_version_");
solrDocument.removeField("[shard]");
shard.add(solrDocument);
if (commit) {
shard.commit();
solr.commit();
}
}
solr.commit();
// System.out.println("SolrLogger.update(\""+query+"\"):"+(new
// Date().getTime() - start)+"ms,"+numbFound+"records");
}
@@ -998,9 +1046,28 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
@Override
public QueryResponse query(String query, String filterQuery,
String facetField, int rows, int max, String dateType, String dateStart,
String dateEnd, List<String> facetQueries, String sort, boolean ascending,
int facetMinCount)
throws SolrServerException, IOException {
String dateEnd, List<String> facetQueries, String sort, boolean ascending, int facetMinCount)
throws SolrServerException, IOException {
return query(query, filterQuery, facetField, rows, max, dateType, dateStart, dateEnd, facetQueries, sort,
ascending, facetMinCount, true);
}
/**
 * Query variant that lets the caller switch the default filter queries on or off.
 * Delegates to the variant with the extra {@code includeShardField} argument, passing {@code false} for it.
 */
@Override
public QueryResponse query(String query, String filterQuery, String facetField, int rows, int max, String dateType,
String dateStart, String dateEnd, List<String> facetQueries, String sort,
boolean ascending, int facetMinCount, boolean defaultFilterQueries)
throws SolrServerException, IOException {
return query(query, filterQuery, facetField, rows, max, dateType, dateStart, dateEnd, facetQueries, sort,
ascending, facetMinCount, defaultFilterQueries, false);
}
@Override
public QueryResponse query(String query, String filterQuery, String facetField, int rows, int max, String dateType,
String dateStart, String dateEnd, List<String> facetQueries, String sort,
boolean ascending, int facetMinCount, boolean defaultFilterQueries, boolean includeShardField)
throws SolrServerException, IOException {
if (solr == null) {
return null;
}
@@ -1010,6 +1077,10 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
.setFacetMinCount(facetMinCount);
addAdditionalSolrYearCores(solrQuery);
if (includeShardField) {
solrQuery.setParam("fl", "[shard],*");
}
// Set the date facet if present
if (dateType != null) {
solrQuery.setParam("facet.range", "time")
@@ -1048,13 +1119,15 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
// not be influenced
// Choose to filter by the Legacy spider IP list (may get too long to properly filter all IP's
if (configurationService.getBooleanProperty("solr-statistics.query.filter.spiderIp", false)) {
if (defaultFilterQueries && configurationService.getBooleanProperty(
"solr-statistics.query.filter.spiderIp", false)) {
solrQuery.addFilterQuery(getIgnoreSpiderIPs());
}
// Choose to filter by isBot field, may be overriden in future
// to allow views on stats based on bots.
if (configurationService.getBooleanProperty("solr-statistics.query.filter.isBot", true)) {
if (defaultFilterQueries && configurationService.getBooleanProperty(
"solr-statistics.query.filter.isBot", true)) {
solrQuery.addFilterQuery("-isBot:true");
}
@@ -1063,7 +1136,7 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
}
String[] bundles = configurationService.getArrayProperty("solr-statistics.query.filter.bundles");
if (bundles != null && bundles.length > 0) {
if (defaultFilterQueries && bundles != null && bundles.length > 0) {
/**
* The code below creates a query that will allow only records which do not have a bundlename
@@ -1288,6 +1361,7 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
SolrPingResponse ping = returnServer.ping();
log.debug("Ping of Solr Core {} returned with Status {}",
coreName, ping.getStatus());
statisticYearCoreServers.put(coreName, returnServer);
return returnServer;
} catch (IOException | RemoteSolrException | SolrServerException e) {
log.debug("Ping of Solr Core {} failed with {}. New Core Will be Created",
@@ -1508,6 +1582,20 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
}
}
/**
 * Commits on the {@code solr} client of this service.
 * NOTE(review): presumably this is the client of the main statistics core; year shards are
 * committed separately through {@code commitShard} - confirm.
 */
@Override
public void commit() throws Exception {
solr.commit();
}
/**
 * Commits on the Solr client registered for the given shard.
 *
 * @param shard the shard, as returned in the {@code [shard]} field of a record; the part after
 *              the last "/" is used to look up the client
 * @throws SolrServerException when no client is registered for the shard, or when the commit fails
 * @throws IOException         when the commit fails
 */
@Override
public void commitShard(String shard) throws IOException, SolrServerException {
    HttpSolrClient shardServer = getSolrServer(shard);
    if (shardServer == null) {
        // fail with a meaningful message instead of a bare NullPointerException
        throw new SolrServerException("No Solr client is registered for shard: " + shard);
    }
    shardServer.commit();
}
/**
 * Looks up the Solr client registered for a shard.
 *
 * @param shard the shard reference; only the part after the last "/" is used as the lookup key
 * @return the registered client, or null when none is registered under that key
 */
private HttpSolrClient getSolrServer(String shard) {
    final String coreName = substringAfterLast(shard, "/");
    return statisticYearCoreServers.get(coreName);
}
protected void addDocumentsToFile(Context context, SolrDocumentList docs, File exportOutput)
throws SQLException, ParseException, IOException {
for (SolrDocument doc : docs) {
@@ -1619,4 +1707,17 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
}
statisticYearCoresInit = true;
}
/**
 * Anonymises an IP address by replacing its last part with a configured mask.
 * <p>
 * For an IPv4 address the last octet is replaced by {@code anonymise_statistics.ip_v4_mask}
 * (default "255"). For an IPv6 address everything after the third-last colon-separated group is
 * replaced by {@code anonymise_statistics.ip_v6_mask} (default "FFFF:FFFF").
 * <p>
 * The method never returns its input unchanged: when the textual form of the address does not
 * match the expected pattern (for instance a host name or a short-form IPv4 address), an
 * exception is thrown so that no un-anonymised value can be stored by accident.
 *
 * @param ip the textual IP address to anonymise
 * @return the anonymised address, as a String
 * @throws UnknownHostException when the value cannot be resolved to an address or does not have
 *                              the expected textual format
 */
public Object anonymiseIp(String ip) throws UnknownHostException {
    InetAddress address = InetAddress.getByName(ip);

    final String regex;
    final String mask;
    final String keptPrefix;
    if (address instanceof Inet4Address) {
        regex = IP_V4_REGEX;
        keptPrefix = "$1";
        mask = configurationService.getProperty("anonymise_statistics.ip_v4_mask", "255");
    } else if (address instanceof Inet6Address) {
        regex = IP_V6_REGEX;
        keptPrefix = "$1:";
        mask = configurationService.getProperty("anonymise_statistics.ip_v6_mask", "FFFF:FFFF");
    } else {
        throw new UnknownHostException("unknown ip format");
    }

    // replaceFirst silently returns the input when nothing matches, which would leak the real value;
    // the value itself is deliberately kept out of the message since the message ends up in the logs
    if (!ip.matches(regex)) {
        throw new UnknownHostException("unknown ip format");
    }

    // quote the configured mask so that '$' or '\' in it is not read as a group reference
    return ip.replaceFirst(regex, keptPrefix + java.util.regex.Matcher.quoteReplacement(mask));
}
}

View File

@@ -8,6 +8,7 @@
package org.dspace.statistics.service;
import java.io.IOException;
import java.net.UnknownHostException;
import java.sql.SQLException;
import java.util.List;
import java.util.Map;
@@ -117,6 +118,10 @@ public interface SolrLoggerService {
List<String> fieldNames, List<List<Object>> fieldValuesList)
throws SolrServerException, IOException;
/**
 * Variant of the four-argument update that lets the caller decide whether the change is committed.
 * The four-argument variant delegates to this method with {@code commit} set to {@code true}.
 *
 * @param query           the Solr query selecting the records to update
 * @param action          the name of the update action
 * @param fieldNames      the names of the fields to update
 * @param fieldValuesList the values for each of the fields, in the order of {@code fieldNames}
 * @param commit          whether the implementation should commit the updated records itself;
 *                        NOTE(review): with {@code false} the caller presumably has to call
 *                        {@code commit()} / {@code commitShard(String)} afterwards - confirm
 * @throws SolrServerException when the Solr request fails
 * @throws IOException         when the Solr request fails
 */
public void update(String query, String action,
List<String> fieldNames, List<List<Object>> fieldValuesList, boolean commit)
throws SolrServerException, IOException;
public void query(String query, int max, int facetMinCount)
throws SolrServerException, IOException;
@@ -179,6 +184,18 @@ public interface SolrLoggerService {
int facetMinCount)
throws SolrServerException, IOException;
/**
 * Query variant that lets the caller switch the default filter queries on or off.
 *
 * @param defaultFilterQueries when {@code false}, the configurable spider-IP, {@code isBot} and
 *                             bundle filter queries are not added to the Solr query
 * @return the Solr response
 * @throws SolrServerException when the Solr request fails
 * @throws IOException         when the Solr request fails
 */
public QueryResponse query(String query, String filterQuery,
String facetField, int rows, int max, String dateType, String dateStart,
String dateEnd, List<String> facetQueries, String sort, boolean ascending,
int facetMinCount, boolean defaultFilterQueries)
throws SolrServerException, IOException;
/**
 * Query variant that additionally lets the caller request the shard of every returned record.
 *
 * @param defaultFilterQueries when {@code false}, the configurable spider-IP, {@code isBot} and
 *                             bundle filter queries are not added to the Solr query
 * @param includeShardField    when {@code true}, the field list of the query is set to
 *                             {@code [shard],*} so that each record carries a {@code [shard]} value
 * @return the Solr response
 * @throws SolrServerException when the Solr request fails
 * @throws IOException         when the Solr request fails
 */
public QueryResponse query(String query, String filterQuery,
String facetField, int rows, int max, String dateType, String dateStart,
String dateEnd, List<String> facetQueries, String sort, boolean ascending,
int facetMinCount, boolean defaultFilterQueries, boolean includeShardField)
throws SolrServerException, IOException;
/**
* Returns in a filterQuery string all the ip addresses that should be ignored
*
@@ -204,4 +221,10 @@ public interface SolrLoggerService {
*/
public void exportHits() throws Exception;
/**
 * Commits the pending changes of the statistics Solr client used by this service.
 *
 * @throws Exception when the commit fails
 */
public void commit() throws Exception;
/**
 * Commits the pending changes of a single statistics shard.
 *
 * @param shard the shard, as returned in the {@code [shard]} field of a queried record
 * @throws Exception when the commit fails
 */
public void commitShard(String shard) throws Exception;
/**
 * Anonymises an IP address by replacing its last part with a configured mask
 * ({@code anonymise_statistics.ip_v4_mask} for IPv4, {@code anonymise_statistics.ip_v6_mask} for IPv6).
 *
 * @param ip the textual IP address to anonymise
 * @return the anonymised address
 * @throws UnknownHostException when the value cannot be resolved to an IPv4 or IPv6 address
 */
public Object anonymiseIp(String ip) throws UnknownHostException;
}

View File

@@ -359,4 +359,11 @@
<class>org.dspace.app.util.InitializeEntities</class>
</step>
</command>
<command>
    <name>anonymize-statistics</name>
    <description>Anonymize the ip values of the solr statistics</description>
    <step>
        <!-- must match the package in which AnonymizeStatistics is declared -->
        <class>org.dspace.statistics.AnonymizeStatistics</class>
    </step>
</command>
</commands>