DS-4440 GDPR - Anonymize statistics feature - add typedocs

This commit is contained in:
Samuel
2020-09-30 18:11:33 +02:00
parent fd0051091f
commit ebcd1fc6cf
3 changed files with 139 additions and 8 deletions

View File

@@ -46,6 +46,16 @@ import org.dspace.services.factory.DSpaceServicesFactory;
import org.dspace.statistics.factory.StatisticsServiceFactory;
import org.dspace.statistics.service.SolrLoggerService;
/**
* Script to anonymize solr statistics according to GDPR specifications.
* This script will anonymize records older than a certain threshold, configurable with the
* 'anonymize_statistics.time_threshold' config, with a default value of 90 days.
* The records will be anonymized by replacing the last part of the IP address with a mask. This mask is configurable:
* For IPv4 addresses, the config is 'anonymize_statistics.ip_v4_mask', with a default value of '255'
* For IPv6 addresses, the config is 'anonymize_statistics.ip_v6_mask', with a default value of 'FFFF:FFFF'
* The DNS value of the records will also be replaced by a mask, configurable with 'anonymize_statistics.dns_mask',
* and with a default value of 'anonymized'.
*/
public class AnonymizeStatistics {
private static Logger log = getLogger(AnonymizeStatistics.class);
@@ -169,19 +179,25 @@ public class AnonymizeStatistics {
}
/**
* Anonymize the relevant solr documents, returned by the getDocuments method.
*/
private static void anonymizeStatistics() {
try {
long updated = 0;
long total = getDocuments().getResults().getNumFound();
printInfo(total + " documents to update");
// The documents will be processed in separate threads.
ExecutorService executorService = Executors.newFixedThreadPool(threads);
QueryResponse documents;
do {
documents = getDocuments();
Collection<Callable<Boolean>> callables = new ArrayList<>();
// list of the processing callables to execute
Collection<DoProcessing> callables = new ArrayList<>();
// list of the shards to commit
Set<String> shards = new HashSet<>();
for (SolrDocument document : documents.getResults()) {
@@ -195,10 +211,13 @@ public class AnonymizeStatistics {
}
}
// execute the processing callables
executorService.invokeAll(callables);
// Commit the main core
solrLoggerService.commit();
// Commit all relevant solr shards
for (String shard : shards) {
solrLoggerService.commitShard(shard);
}
@@ -218,6 +237,12 @@ public class AnonymizeStatistics {
}
}
/**
* Get the documents to anonymize.
* @return
* Non-anonymized documents, which are older than the time period configured by the
* 'anonymize_statistics.time_threshold' config (or 90 days, if not configured)
*/
private static QueryResponse getDocuments() throws SolrServerException, IOException {
if (sleep > 0) {
@@ -233,11 +258,17 @@ public class AnonymizeStatistics {
return solrLoggerService.query(
"ip:*",
"time:[* TO " + TIME_LIMIT + "] AND -dns:" + DNS_MASK,
null, batchSize, -1, null, null, null, null, null, false, false, true
null, batchSize, -1, null, null, null, null,
null, false, -1, false, true
);
}
/**
* {@link Callable} implementation to process a solr document to be anonymized.
* It will return true if the anonymization succeeded.
*/
public static class DoProcessing implements Callable<Boolean> {
private final SolrDocument document;
private final long updated;

View File

@@ -853,7 +853,11 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
for (int i = 0; i < docsToUpdate.size(); i++) {
SolrInputDocument solrDocument = docsToUpdate.get(i);
// Get the relevant shard client
// For a non-sharded core, the shard variable will reference the main core
HttpSolrClient shard = getSolrServer(solrDocument.getFieldValue("[shard]").toString());
// Delete the document from the shard client
shard.deleteByQuery("uid:" + solrDocument.getFieldValue("uid"));
// Now loop over our fieldname actions
@@ -883,9 +887,12 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
}
}
// see https://stackoverflow.com/questions/26941260/normalizing-solr-records-for-sharding-version-issues
solrDocument.removeField("_version_");
// this field will not work with a non-sharded core
solrDocument.removeField("[shard]");
// Add the updated document to the shard client
shard.add(solrDocument);
if (commit) {
@@ -1044,9 +1051,9 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
}
@Override
public QueryResponse query(String query, String filterQuery,
String facetField, int rows, int max, String dateType, String dateStart,
String dateEnd, List<String> facetQueries, String sort, boolean ascending, int facetMinCount)
public QueryResponse query(String query, String filterQuery, String facetField, int rows, int max, String dateType,
String dateStart, String dateEnd, List<String> facetQueries, String sort,
boolean ascending, int facetMinCount)
throws SolrServerException, IOException {
return query(query, filterQuery, facetField, rows, max, dateType, dateStart, dateEnd, facetQueries, sort,
@@ -1065,7 +1072,8 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
@Override
public QueryResponse query(String query, String filterQuery, String facetField, int rows, int max, String dateType,
String dateStart, String dateEnd, List<String> facetQueries, String sort,
boolean ascending, int facetMinCount, boolean defaultFilterQueries, boolean includeShardField)
boolean ascending, int facetMinCount, boolean defaultFilterQueries,
boolean includeShardField)
throws SolrServerException, IOException {
if (solr == null) {

View File

@@ -118,6 +118,19 @@ public interface SolrLoggerService {
List<String> fieldNames, List<List<Object>> fieldValuesList)
throws SolrServerException, IOException;
/**
* Update the documents in the solr core that match the given query.
* @param query
* query indicating which documents to update
* @param action
* the update action keyword
* @param fieldNames
* the fields to update
* @param fieldValuesList
* the values for the fields to update
* @param commit
* whether to commit the changes
* @throws SolrServerException Exception from the Solr server to the solrj Java client.
* @throws java.io.IOException passed through.
*/
public void update(String query, String action,
List<String> fieldNames, List<List<Object>> fieldValuesList, boolean commit)
throws SolrServerException, IOException;
@@ -178,18 +191,84 @@ public interface SolrLoggerService {
public ObjectCount queryTotal(String query, String filterQuery, int facetMinCount)
throws SolrServerException, IOException;
/**
* Perform a solr query.
*
* @param query the query to be used
* @param filterQuery filter query
* @param facetField field to facet the results by
* @param rows the max number of results to return
* @param max the max number of facets to return
* @param dateType the type to be used (example: DAY, MONTH, YEAR)
* @param dateStart the start date, format (-3, -2, ..); the date is calculated
* relative to today
* @param dateEnd the end date, format (-2, +1, ..); the date is calculated
* relative to today
* @param facetQueries list of facet queries
* @param sort the sort field
* @param ascending the sort direction (true: ascending)
* @param facetMinCount Minimum count of results facet must have to return a result
* @return the {@link QueryResponse} for the query
* @throws SolrServerException Exception from the Solr server to the solrj Java client.
* @throws java.io.IOException passed through.
*/
public QueryResponse query(String query, String filterQuery,
String facetField, int rows, int max, String dateType, String dateStart,
String dateEnd, List<String> facetQueries, String sort, boolean ascending,
int facetMinCount)
throws SolrServerException, IOException;
/**
* Perform a solr query, optionally applying the default filter queries.
*
* @param query the query to be used
* @param filterQuery filter query
* @param facetField field to facet the results by
* @param rows the max number of results to return
* @param max the max number of facets to return
* @param dateType the type to be used (example: DAY, MONTH, YEAR)
* @param dateStart the start date, format (-3, -2, ..); the date is calculated
* relative to today
* @param dateEnd the end date, format (-2, +1, ..); the date is calculated
* relative to today
* @param facetQueries list of facet queries
* @param sort the sort field
* @param ascending the sort direction (true: ascending)
* @param facetMinCount Minimum count of results facet must have to return a result
* @param defaultFilterQueries
* use the default filter queries
* @return the {@link QueryResponse} for the query
* @throws SolrServerException Exception from the Solr server to the solrj Java client.
* @throws java.io.IOException passed through.
*/
public QueryResponse query(String query, String filterQuery,
String facetField, int rows, int max, String dateType, String dateStart,
String dateEnd, List<String> facetQueries, String sort, boolean ascending,
int facetMinCount, boolean defaultFilterQueries)
throws SolrServerException, IOException;
/**
* Perform a solr query.
*
* @param query the query to be used
* @param filterQuery filter query
* @param facetField field to facet the results by
* @param rows the max number of results to return
* @param max the max number of facets to return
* @param dateType the type to be used (example: DAY, MONTH, YEAR)
* @param dateStart the start date, format (-3, -2, ..); the date is calculated
* relative to today
* @param dateEnd the end date, format (-2, +1, ..); the date is calculated
* relative to today
* @param facetQueries list of facet queries
* @param sort the sort field
* @param ascending the sort direction (true: ascending)
* @param facetMinCount Minimum count of results facet must have to return a result
* @param defaultFilterQueries
* use the default filter queries
* @param includeShardField
* include the shard field in the result documents
* @throws SolrServerException Exception from the Solr server to the solrj Java client.
* @throws java.io.IOException passed through.
*/
public QueryResponse query(String query, String filterQuery,
String facetField, int rows, int max, String dateType, String dateStart,
String dateEnd, List<String> facetQueries, String sort, boolean ascending,
@@ -221,10 +300,23 @@ public interface SolrLoggerService {
*/
public void exportHits() throws Exception;
/**
* Commit the main solr core.
* @throws Exception
* NOTE(review): only the generic Exception is declared here; confirm the
* concrete failure causes against the implementation.
*/
public void commit() throws Exception;
/**
* Commit a solr shard.
* @param shard
* The shard to commit.
* @throws Exception
* NOTE(review): only the generic Exception is declared here; confirm the
* concrete failure causes against the implementation.
*/
public void commitShard(String shard) throws Exception;
/**
* Anonymize a given ip.
* @param ip
* The ip to anonymize.
* @return
* The anonymized ip. NOTE(review): the return type is declared as Object;
* confirm the concrete type against the implementation.
* @throws UnknownHostException
* presumably when the given ip cannot be resolved as an address; verify
* against the implementation.
*/
public Object anonymizeIp(String ip) throws UnknownHostException;
}