mirror of
https://github.com/DSpace/DSpace.git
synced 2025-10-17 06:53:09 +00:00
DS-4440 GDPR - Anonymize statistics feature - add typedocs
This commit is contained in:
@@ -46,6 +46,16 @@ import org.dspace.services.factory.DSpaceServicesFactory;
|
||||
import org.dspace.statistics.factory.StatisticsServiceFactory;
|
||||
import org.dspace.statistics.service.SolrLoggerService;
|
||||
|
||||
/**
|
||||
* Script to anonymize solr statistics according to GDPR specifications.
|
||||
* This script will anonymize records older than a certain threshold, configurable with the
|
||||
* 'anonymize_statistics.time_threshold' config, with a default value of 90 days.
|
||||
* The records will be anonymized by replacing the last part of the IP address with a mask. This mask is configurable:
|
||||
* For IPv4 addresses, the config is 'anonymize_statistics.ip_v4_mask', with a default value of '255'
|
||||
* For IPv6 addresses, the config is 'anonymize_statistics.ip_v6_mask', with a default value of 'FFFF:FFFF'
|
||||
* The DNS value of the records will also be replaced by a mask, configurable with 'anonymize_statistics.dns_mask',
|
||||
* and with a default value of 'anonymized'.
|
||||
*/
|
||||
public class AnonymizeStatistics {
|
||||
|
||||
private static Logger log = getLogger(AnonymizeStatistics.class);
|
||||
@@ -169,19 +179,25 @@ public class AnonymizeStatistics {
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Anonymize the relevant solr documents, returned by the getDocuments method.
|
||||
*/
|
||||
private static void anonymizeStatistics() {
|
||||
try {
|
||||
long updated = 0;
|
||||
long total = getDocuments().getResults().getNumFound();
|
||||
printInfo(total + " documents to update");
|
||||
|
||||
// The documents will be processed in separate threads.
|
||||
ExecutorService executorService = Executors.newFixedThreadPool(threads);
|
||||
|
||||
QueryResponse documents;
|
||||
do {
|
||||
documents = getDocuments();
|
||||
|
||||
Collection<Callable<Boolean>> callables = new ArrayList<>();
|
||||
// list of the processing callables to execute
|
||||
Collection<DoProcessing> callables = new ArrayList<>();
|
||||
// list of the shards to commit
|
||||
Set<String> shards = new HashSet<>();
|
||||
|
||||
for (SolrDocument document : documents.getResults()) {
|
||||
@@ -195,10 +211,13 @@ public class AnonymizeStatistics {
|
||||
}
|
||||
}
|
||||
|
||||
// execute the processing callables
|
||||
executorService.invokeAll(callables);
|
||||
|
||||
// Commit the main core
|
||||
solrLoggerService.commit();
|
||||
|
||||
// Commit all relevant solr shards
|
||||
for (String shard : shards) {
|
||||
solrLoggerService.commitShard(shard);
|
||||
}
|
||||
@@ -218,6 +237,12 @@ public class AnonymizeStatistics {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the documents to anonymize.
|
||||
* @return
|
||||
* Non-anonymized documents, which are older than the time period configured by the
|
||||
* 'anonymize_statistics.time_threshold' config (or 90 days, if not configured)
|
||||
*/
|
||||
private static QueryResponse getDocuments() throws SolrServerException, IOException {
|
||||
|
||||
if (sleep > 0) {
|
||||
@@ -233,11 +258,17 @@ public class AnonymizeStatistics {
|
||||
return solrLoggerService.query(
|
||||
"ip:*",
|
||||
"time:[* TO " + TIME_LIMIT + "] AND -dns:" + DNS_MASK,
|
||||
null, batchSize, -1, null, null, null, null, null, false, false, true
|
||||
null, batchSize, -1, null, null, null, null,
|
||||
null, false, -1, false, true
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@link Callable} implementation to process a solr document to be anonymized.
|
||||
* It will return true if the anonymization succeeded.
|
||||
*/
|
||||
public static class DoProcessing implements Callable<Boolean> {
|
||||
|
||||
private final SolrDocument document;
|
||||
private final long updated;
|
||||
|
||||
|
@@ -853,7 +853,11 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
|
||||
for (int i = 0; i < docsToUpdate.size(); i++) {
|
||||
SolrInputDocument solrDocument = docsToUpdate.get(i);
|
||||
|
||||
// Get the relevant shard client
|
||||
// For a non-sharded core, the shard variable will reference the main core
|
||||
HttpSolrClient shard = getSolrServer(solrDocument.getFieldValue("[shard]").toString());
|
||||
|
||||
// Delete the document from the shard client
|
||||
shard.deleteByQuery("uid:" + solrDocument.getFieldValue("uid"));
|
||||
|
||||
// Now loop over our fieldname actions
|
||||
@@ -883,9 +887,12 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
|
||||
}
|
||||
}
|
||||
|
||||
// see https://stackoverflow.com/questions/26941260/normalizing-solr-records-for-sharding-version-issues
|
||||
solrDocument.removeField("_version_");
|
||||
// this field will not work with a non-sharded core
|
||||
solrDocument.removeField("[shard]");
|
||||
|
||||
// Add the updated document to the shard client
|
||||
shard.add(solrDocument);
|
||||
|
||||
if (commit) {
|
||||
@@ -1044,9 +1051,9 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
|
||||
}
|
||||
|
||||
@Override
|
||||
public QueryResponse query(String query, String filterQuery,
|
||||
String facetField, int rows, int max, String dateType, String dateStart,
|
||||
String dateEnd, List<String> facetQueries, String sort, boolean ascending, int facetMinCount)
|
||||
public QueryResponse query(String query, String filterQuery, String facetField, int rows, int max, String dateType,
|
||||
String dateStart, String dateEnd, List<String> facetQueries, String sort,
|
||||
boolean ascending, int facetMinCount)
|
||||
throws SolrServerException, IOException {
|
||||
|
||||
return query(query, filterQuery, facetField, rows, max, dateType, dateStart, dateEnd, facetQueries, sort,
|
||||
@@ -1065,7 +1072,8 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
|
||||
@Override
|
||||
public QueryResponse query(String query, String filterQuery, String facetField, int rows, int max, String dateType,
|
||||
String dateStart, String dateEnd, List<String> facetQueries, String sort,
|
||||
boolean ascending, int facetMinCount, boolean defaultFilterQueries, boolean includeShardField)
|
||||
boolean ascending, int facetMinCount, boolean defaultFilterQueries,
|
||||
boolean includeShardField)
|
||||
throws SolrServerException, IOException {
|
||||
|
||||
if (solr == null) {
|
||||
|
@@ -118,6 +118,19 @@ public interface SolrLoggerService {
|
||||
List<String> fieldNames, List<List<Object>> fieldValuesList)
|
||||
throws SolrServerException, IOException;
|
||||
|
||||
/**
|
||||
* Update the solr core.
|
||||
* @param query
|
||||
* query indicating which documents to update
|
||||
* @param action
|
||||
* the update action keyword
|
||||
* @param fieldNames
|
||||
* the fields to update
|
||||
* @param fieldValuesList
|
||||
* the values for the fields to update
|
||||
* @param commit
|
||||
* whether to commit the changes
|
||||
*/
|
||||
public void update(String query, String action,
|
||||
List<String> fieldNames, List<List<Object>> fieldValuesList, boolean commit)
|
||||
throws SolrServerException, IOException;
|
||||
@@ -178,18 +191,84 @@ public interface SolrLoggerService {
|
||||
public ObjectCount queryTotal(String query, String filterQuery, int facetMinCount)
|
||||
throws SolrServerException, IOException;
|
||||
|
||||
/**
|
||||
* Perform a solr query.
|
||||
*
|
||||
* @param query the query to be used
|
||||
* @param filterQuery filter query
|
||||
* @param facetField field to facet the results by
|
||||
* @param rows the max number of results to return
|
||||
* @param max the max number of facets to return
|
||||
* @param dateType the type to be used (example: DAY, MONTH, YEAR)
|
||||
* @param dateStart the start date Format:(-3, -2, ..) the date is calculated
|
||||
* relative to today
|
||||
* @param dateEnd the end date stop Format (-2, +1, ..) the date is calculated
|
||||
* relative to today
|
||||
* @param facetQueries list of facet queries
|
||||
* @param sort the sort field
|
||||
* @param ascending the sort direction (true: ascending)
|
||||
* @param facetMinCount the minimum number of results a facet must have in order to be returned
|
||||
* @throws SolrServerException Exception from the Solr server to the solrj Java client.
|
||||
* @throws java.io.IOException passed through.
|
||||
*/
|
||||
public QueryResponse query(String query, String filterQuery,
|
||||
String facetField, int rows, int max, String dateType, String dateStart,
|
||||
String dateEnd, List<String> facetQueries, String sort, boolean ascending,
|
||||
int facetMinCount)
|
||||
throws SolrServerException, IOException;
|
||||
|
||||
/**
|
||||
* Perform a solr query.
|
||||
*
|
||||
* @param query the query to be used
|
||||
* @param filterQuery filter query
|
||||
* @param facetField field to facet the results by
|
||||
* @param rows the max number of results to return
|
||||
* @param max the max number of facets to return
|
||||
* @param dateType the type to be used (example: DAY, MONTH, YEAR)
|
||||
* @param dateStart the start date Format:(-3, -2, ..) the date is calculated
|
||||
* relative to today
|
||||
* @param dateEnd the end date stop Format (-2, +1, ..) the date is calculated
|
||||
* relative to today
|
||||
* @param facetQueries list of facet queries
|
||||
* @param sort the sort field
|
||||
* @param ascending the sort direction (true: ascending)
|
||||
* @param facetMinCount the minimum number of results a facet must have in order to be returned
|
||||
* @param defaultFilterQueries
|
||||
* use the default filter queries
|
||||
* @throws SolrServerException Exception from the Solr server to the solrj Java client.
|
||||
* @throws java.io.IOException passed through.
|
||||
*/
|
||||
public QueryResponse query(String query, String filterQuery,
|
||||
String facetField, int rows, int max, String dateType, String dateStart,
|
||||
String dateEnd, List<String> facetQueries, String sort, boolean ascending,
|
||||
int facetMinCount, boolean defaultFilterQueries)
|
||||
throws SolrServerException, IOException;
|
||||
|
||||
/**
|
||||
* Perform a solr query.
|
||||
*
|
||||
* @param query the query to be used
|
||||
* @param filterQuery filter query
|
||||
* @param facetField field to facet the results by
|
||||
* @param rows the max number of results to return
|
||||
* @param max the max number of facets to return
|
||||
* @param dateType the type to be used (example: DAY, MONTH, YEAR)
|
||||
* @param dateStart the start date Format:(-3, -2, ..) the date is calculated
|
||||
* relative to today
|
||||
* @param dateEnd the end date stop Format (-2, +1, ..) the date is calculated
|
||||
* relative to today
|
||||
* @param facetQueries list of facet queries
|
||||
* @param sort the sort field
|
||||
* @param ascending the sort direction (true: ascending)
|
||||
* @param facetMinCount the minimum number of results a facet must have in order to be returned
|
||||
* @param defaultFilterQueries
|
||||
* use the default filter queries
|
||||
* @param includeShardField
|
||||
* include the shard field in the result documents
|
||||
* @throws SolrServerException Exception from the Solr server to the solrj Java client.
|
||||
* @throws java.io.IOException passed through.
|
||||
*/
|
||||
public QueryResponse query(String query, String filterQuery,
|
||||
String facetField, int rows, int max, String dateType, String dateStart,
|
||||
String dateEnd, List<String> facetQueries, String sort, boolean ascending,
|
||||
@@ -221,10 +300,23 @@ public interface SolrLoggerService {
|
||||
*/
|
||||
public void exportHits() throws Exception;
|
||||
|
||||
/**
|
||||
* Commit the solr core.
|
||||
*/
|
||||
public void commit() throws Exception;
|
||||
|
||||
/**
|
||||
* Commit a solr shard.
|
||||
* @param shard
|
||||
* The shard to commit.
|
||||
*/
|
||||
public void commitShard(String shard) throws Exception;
|
||||
|
||||
/**
|
||||
* Anonymize a given IP address.
|
||||
* @param ip
|
||||
* The IP address to anonymize.
|
||||
*/
|
||||
public Object anonymizeIp(String ip) throws UnknownHostException;
|
||||
|
||||
}
|
||||
|
Reference in New Issue
Block a user