DS-4440 GDPR - Anonymize statistics feature - add typedocs

This commit is contained in:
Samuel
2020-09-30 18:11:33 +02:00
parent fd0051091f
commit ebcd1fc6cf
3 changed files with 139 additions and 8 deletions

View File

@@ -46,6 +46,16 @@ import org.dspace.services.factory.DSpaceServicesFactory;
import org.dspace.statistics.factory.StatisticsServiceFactory; import org.dspace.statistics.factory.StatisticsServiceFactory;
import org.dspace.statistics.service.SolrLoggerService; import org.dspace.statistics.service.SolrLoggerService;
/**
* Script to anonymize solr statistics according to GDPR specifications.
* This script will anonymize records older than a certain threshold, configurable with the
* 'anonymize_statistics.time_threshold' config, with a default value of 90 days.
* The records will be anonymized by replacing the last part of the ip address with a mask. This mask is configurable:
* For IPv4 addresses, the config is 'anonymize_statistics.ip_v4_mask', with a default value of '255'
* For IPv6 addresses, the config is 'anonymize_statistics.ip_v6_mask', with a default value of 'FFFF:FFFF'
* The DNS value of the records will also be replaced by a mask, configurable with 'anonymize_statistics.dns_mask',
* and with a default value of 'anonymized'.
*/
public class AnonymizeStatistics { public class AnonymizeStatistics {
private static Logger log = getLogger(AnonymizeStatistics.class); private static Logger log = getLogger(AnonymizeStatistics.class);
@@ -169,19 +179,25 @@ public class AnonymizeStatistics {
} }
/**
* Anonymize the relevant solr documents, returned by the getDocuments method.
*/
private static void anonymizeStatistics() { private static void anonymizeStatistics() {
try { try {
long updated = 0; long updated = 0;
long total = getDocuments().getResults().getNumFound(); long total = getDocuments().getResults().getNumFound();
printInfo(total + " documents to update"); printInfo(total + " documents to update");
// The documents will be processed in separate threads.
ExecutorService executorService = Executors.newFixedThreadPool(threads); ExecutorService executorService = Executors.newFixedThreadPool(threads);
QueryResponse documents; QueryResponse documents;
do { do {
documents = getDocuments(); documents = getDocuments();
Collection<Callable<Boolean>> callables = new ArrayList<>(); // list of the processing callables to execute
Collection<DoProcessing> callables = new ArrayList<>();
// list of the shards to commit
Set<String> shards = new HashSet<>(); Set<String> shards = new HashSet<>();
for (SolrDocument document : documents.getResults()) { for (SolrDocument document : documents.getResults()) {
@@ -195,10 +211,13 @@ public class AnonymizeStatistics {
} }
} }
// execute the processing callables
executorService.invokeAll(callables); executorService.invokeAll(callables);
// Commit the main core
solrLoggerService.commit(); solrLoggerService.commit();
// Commit all relevant solr shards
for (String shard : shards) { for (String shard : shards) {
solrLoggerService.commitShard(shard); solrLoggerService.commitShard(shard);
} }
@@ -218,6 +237,12 @@ public class AnonymizeStatistics {
} }
} }
/**
* Get the documents to anonymize.
* @return
* Non-anonymized documents, which are older than the time period configured by the
* 'anonymize_statistics.time_threshold' config (or 90 days, if not configured)
*/
private static QueryResponse getDocuments() throws SolrServerException, IOException { private static QueryResponse getDocuments() throws SolrServerException, IOException {
if (sleep > 0) { if (sleep > 0) {
@@ -231,13 +256,19 @@ public class AnonymizeStatistics {
} }
return solrLoggerService.query( return solrLoggerService.query(
"ip:*", "ip:*",
"time:[* TO " + TIME_LIMIT + "] AND -dns:" + DNS_MASK, "time:[* TO " + TIME_LIMIT + "] AND -dns:" + DNS_MASK,
null, batchSize, -1, null, null, null, null, null, false, false, true null, batchSize, -1, null, null, null, null,
null, false, -1, false, true
); );
} }
/**
* {@link Callable} implementation to process a solr document to be anonymized.
* It will return true if the anonymization succeeded.
*/
public static class DoProcessing implements Callable<Boolean> { public static class DoProcessing implements Callable<Boolean> {
private final SolrDocument document; private final SolrDocument document;
private final long updated; private final long updated;

View File

@@ -853,7 +853,11 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
for (int i = 0; i < docsToUpdate.size(); i++) { for (int i = 0; i < docsToUpdate.size(); i++) {
SolrInputDocument solrDocument = docsToUpdate.get(i); SolrInputDocument solrDocument = docsToUpdate.get(i);
// Get the relevant shard client
// For a non-sharded core, the shard variable will reference the main core
HttpSolrClient shard = getSolrServer(solrDocument.getFieldValue("[shard]").toString()); HttpSolrClient shard = getSolrServer(solrDocument.getFieldValue("[shard]").toString());
// Delete the document from the shard client
shard.deleteByQuery("uid:" + solrDocument.getFieldValue("uid")); shard.deleteByQuery("uid:" + solrDocument.getFieldValue("uid"));
// Now loop over our fieldname actions // Now loop over our fieldname actions
@@ -883,9 +887,12 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
} }
} }
// see https://stackoverflow.com/questions/26941260/normalizing-solr-records-for-sharding-version-issues
solrDocument.removeField("_version_"); solrDocument.removeField("_version_");
// this field will not work with a non-sharded core
solrDocument.removeField("[shard]"); solrDocument.removeField("[shard]");
// Add the updated document to the shard client
shard.add(solrDocument); shard.add(solrDocument);
if (commit) { if (commit) {
@@ -1044,9 +1051,9 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
} }
@Override @Override
public QueryResponse query(String query, String filterQuery, public QueryResponse query(String query, String filterQuery, String facetField, int rows, int max, String dateType,
String facetField, int rows, int max, String dateType, String dateStart, String dateStart, String dateEnd, List<String> facetQueries, String sort,
String dateEnd, List<String> facetQueries, String sort, boolean ascending, int facetMinCount) boolean ascending, int facetMinCount)
throws SolrServerException, IOException { throws SolrServerException, IOException {
return query(query, filterQuery, facetField, rows, max, dateType, dateStart, dateEnd, facetQueries, sort, return query(query, filterQuery, facetField, rows, max, dateType, dateStart, dateEnd, facetQueries, sort,
@@ -1065,7 +1072,8 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
@Override @Override
public QueryResponse query(String query, String filterQuery, String facetField, int rows, int max, String dateType, public QueryResponse query(String query, String filterQuery, String facetField, int rows, int max, String dateType,
String dateStart, String dateEnd, List<String> facetQueries, String sort, String dateStart, String dateEnd, List<String> facetQueries, String sort,
boolean ascending, int facetMinCount, boolean defaultFilterQueries, boolean includeShardField) boolean ascending, int facetMinCount, boolean defaultFilterQueries,
boolean includeShardField)
throws SolrServerException, IOException { throws SolrServerException, IOException {
if (solr == null) { if (solr == null) {

View File

@@ -118,6 +118,19 @@ public interface SolrLoggerService {
List<String> fieldNames, List<List<Object>> fieldValuesList) List<String> fieldNames, List<List<Object>> fieldValuesList)
throws SolrServerException, IOException; throws SolrServerException, IOException;
/**
* Update the solr core.
* @param query
* query indicating which documents to update
* @param action
* the update action keyword
* @param fieldNames
* the fields to update
* @param fieldValuesList
* the values for the fields to update
* @param commit
* whether to commit the changes
*/
public void update(String query, String action, public void update(String query, String action,
List<String> fieldNames, List<List<Object>> fieldValuesList, boolean commit) List<String> fieldNames, List<List<Object>> fieldValuesList, boolean commit)
throws SolrServerException, IOException; throws SolrServerException, IOException;
@@ -178,18 +191,84 @@ public interface SolrLoggerService {
public ObjectCount queryTotal(String query, String filterQuery, int facetMinCount) public ObjectCount queryTotal(String query, String filterQuery, int facetMinCount)
throws SolrServerException, IOException; throws SolrServerException, IOException;
/**
* Perform a solr query.
*
* @param query the query to be used
* @param filterQuery filter query
* @param facetField field to facet the results by
* @param rows the max number of results to return
* @param max the max number of facets to return
* @param dateType the type to be used (example: DAY, MONTH, YEAR)
* @param dateStart the start date, in the format (-3, -2, ..); the date is calculated
* relative to today
* @param dateEnd the end date, in the format (-2, +1, ..); the date is calculated
* relative to today
* @param facetQueries list of facet queries
* @param sort the sort field
* @param ascending the sort direction (true: ascending)
* @param facetMinCount Minimum count of results facet must have to return a result
* @throws SolrServerException Exception from the Solr server to the solrj Java client.
* @throws java.io.IOException passed through.
*/
public QueryResponse query(String query, String filterQuery, public QueryResponse query(String query, String filterQuery,
String facetField, int rows, int max, String dateType, String dateStart, String facetField, int rows, int max, String dateType, String dateStart,
String dateEnd, List<String> facetQueries, String sort, boolean ascending, String dateEnd, List<String> facetQueries, String sort, boolean ascending,
int facetMinCount) int facetMinCount)
throws SolrServerException, IOException; throws SolrServerException, IOException;
/**
* Perform a solr query.
*
* @param query the query to be used
* @param filterQuery filter query
* @param facetField field to facet the results by
* @param rows the max number of results to return
* @param max the max number of facets to return
* @param dateType the type to be used (example: DAY, MONTH, YEAR)
* @param dateStart the start date, in the format (-3, -2, ..); the date is calculated
* relative to today
* @param dateEnd the end date, in the format (-2, +1, ..); the date is calculated
* relative to today
* @param facetQueries list of facet queries
* @param sort the sort field
* @param ascending the sort direction (true: ascending)
* @param facetMinCount Minimum count of results facet must have to return a result
* @param defaultFilterQueries
* use the default filter queries
* @throws SolrServerException Exception from the Solr server to the solrj Java client.
* @throws java.io.IOException passed through.
*/
public QueryResponse query(String query, String filterQuery, public QueryResponse query(String query, String filterQuery,
String facetField, int rows, int max, String dateType, String dateStart, String facetField, int rows, int max, String dateType, String dateStart,
String dateEnd, List<String> facetQueries, String sort, boolean ascending, String dateEnd, List<String> facetQueries, String sort, boolean ascending,
int facetMinCount, boolean defaultFilterQueries) int facetMinCount, boolean defaultFilterQueries)
throws SolrServerException, IOException; throws SolrServerException, IOException;
/**
* Perform a solr query.
*
* @param query the query to be used
* @param filterQuery filter query
* @param facetField field to facet the results by
* @param rows the max number of results to return
* @param max the max number of facets to return
* @param dateType the type to be used (example: DAY, MONTH, YEAR)
* @param dateStart the start date, in the format (-3, -2, ..); the date is calculated
* relative to today
* @param dateEnd the end date, in the format (-2, +1, ..); the date is calculated
* relative to today
* @param facetQueries list of facet queries
* @param sort the sort field
* @param ascending the sort direction (true: ascending)
* @param facetMinCount Minimum count of results facet must have to return a result
* @param defaultFilterQueries
* use the default filter queries
* @param includeShardField
* include the shard field in the result documents
* @throws SolrServerException Exception from the Solr server to the solrj Java client.
* @throws java.io.IOException passed through.
*/
public QueryResponse query(String query, String filterQuery, public QueryResponse query(String query, String filterQuery,
String facetField, int rows, int max, String dateType, String dateStart, String facetField, int rows, int max, String dateType, String dateStart,
String dateEnd, List<String> facetQueries, String sort, boolean ascending, String dateEnd, List<String> facetQueries, String sort, boolean ascending,
@@ -221,10 +300,23 @@ public interface SolrLoggerService {
*/ */
public void exportHits() throws Exception; public void exportHits() throws Exception;
/**
* Commit the solr core.
*/
public void commit() throws Exception; public void commit() throws Exception;
/**
* Commit a solr shard.
* @param shard
* The shard to commit.
*/
public void commitShard(String shard) throws Exception; public void commitShard(String shard) throws Exception;
/**
* Anonymize a given ip.
* @param ip
* The ip to anonymize.
*/
public Object anonymizeIp(String ip) throws UnknownHostException; public Object anonymizeIp(String ip) throws UnknownHostException;
} }