mirror of
https://github.com/DSpace/DSpace.git
synced 2025-10-15 22:13:08 +00:00
DS-4440 GDPR - Anonymize statistics feature - add typedocs
This commit is contained in:
@@ -46,6 +46,16 @@ import org.dspace.services.factory.DSpaceServicesFactory;
|
|||||||
import org.dspace.statistics.factory.StatisticsServiceFactory;
|
import org.dspace.statistics.factory.StatisticsServiceFactory;
|
||||||
import org.dspace.statistics.service.SolrLoggerService;
|
import org.dspace.statistics.service.SolrLoggerService;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Script to anonymize solr statistics according to GDPR specifications.
|
||||||
|
* This script will anonymize records older than a certain threshold, configurable with the
|
||||||
|
* 'anonymize_statistics.time_threshold' config, with a default value of 90 days.
|
||||||
|
* The records will be anonymized by replacing the last part of the IP address with a mask; this mask is configurable:
|
||||||
|
* For IPv4 addresses, the config is 'anonymize_statistics.ip_v4_mask', with a default value of '255'
|
||||||
|
* For IPv6 addresses, the config is 'anonymize_statistics.ip_v6_mask', with a default value of 'FFFF:FFFF'
|
||||||
|
* The DNS value of the records will also be replaced by a mask, configurable with 'anonymize_statistics.dns_mask',
|
||||||
|
* and with a default value of 'anonymized'.
|
||||||
|
*/
|
||||||
public class AnonymizeStatistics {
|
public class AnonymizeStatistics {
|
||||||
|
|
||||||
private static Logger log = getLogger(AnonymizeStatistics.class);
|
private static Logger log = getLogger(AnonymizeStatistics.class);
|
||||||
@@ -169,19 +179,25 @@ public class AnonymizeStatistics {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Anonymize the relevant solr documents, returned by the getDocuments method.
|
||||||
|
*/
|
||||||
private static void anonymizeStatistics() {
|
private static void anonymizeStatistics() {
|
||||||
try {
|
try {
|
||||||
long updated = 0;
|
long updated = 0;
|
||||||
long total = getDocuments().getResults().getNumFound();
|
long total = getDocuments().getResults().getNumFound();
|
||||||
printInfo(total + " documents to update");
|
printInfo(total + " documents to update");
|
||||||
|
|
||||||
|
// The documents will be processed in separate threads.
|
||||||
ExecutorService executorService = Executors.newFixedThreadPool(threads);
|
ExecutorService executorService = Executors.newFixedThreadPool(threads);
|
||||||
|
|
||||||
QueryResponse documents;
|
QueryResponse documents;
|
||||||
do {
|
do {
|
||||||
documents = getDocuments();
|
documents = getDocuments();
|
||||||
|
|
||||||
Collection<Callable<Boolean>> callables = new ArrayList<>();
|
// list of the processing callables to execute
|
||||||
|
Collection<DoProcessing> callables = new ArrayList<>();
|
||||||
|
// list of the shards to commit
|
||||||
Set<String> shards = new HashSet<>();
|
Set<String> shards = new HashSet<>();
|
||||||
|
|
||||||
for (SolrDocument document : documents.getResults()) {
|
for (SolrDocument document : documents.getResults()) {
|
||||||
@@ -195,10 +211,13 @@ public class AnonymizeStatistics {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// execute the processing callables
|
||||||
executorService.invokeAll(callables);
|
executorService.invokeAll(callables);
|
||||||
|
|
||||||
|
// Commit the main core
|
||||||
solrLoggerService.commit();
|
solrLoggerService.commit();
|
||||||
|
|
||||||
|
// Commit all relevant solr shards
|
||||||
for (String shard : shards) {
|
for (String shard : shards) {
|
||||||
solrLoggerService.commitShard(shard);
|
solrLoggerService.commitShard(shard);
|
||||||
}
|
}
|
||||||
@@ -218,6 +237,12 @@ public class AnonymizeStatistics {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the documents to anonymize.
|
||||||
|
* @return
|
||||||
|
* Non-anonymized documents, which are older than the time period configured by the
|
||||||
|
* 'anonymize_statistics.time_threshold' config (or 90 days, if not configured)
|
||||||
|
*/
|
||||||
private static QueryResponse getDocuments() throws SolrServerException, IOException {
|
private static QueryResponse getDocuments() throws SolrServerException, IOException {
|
||||||
|
|
||||||
if (sleep > 0) {
|
if (sleep > 0) {
|
||||||
@@ -231,13 +256,19 @@ public class AnonymizeStatistics {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return solrLoggerService.query(
|
return solrLoggerService.query(
|
||||||
"ip:*",
|
"ip:*",
|
||||||
"time:[* TO " + TIME_LIMIT + "] AND -dns:" + DNS_MASK,
|
"time:[* TO " + TIME_LIMIT + "] AND -dns:" + DNS_MASK,
|
||||||
null, batchSize, -1, null, null, null, null, null, false, false, true
|
null, batchSize, -1, null, null, null, null,
|
||||||
|
null, false, -1, false, true
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@link Callable} implementation to process a solr document to be anonymized.
|
||||||
|
* It will return true if the anonymization succeeded.
|
||||||
|
*/
|
||||||
public static class DoProcessing implements Callable<Boolean> {
|
public static class DoProcessing implements Callable<Boolean> {
|
||||||
|
|
||||||
private final SolrDocument document;
|
private final SolrDocument document;
|
||||||
private final long updated;
|
private final long updated;
|
||||||
|
|
||||||
|
@@ -853,7 +853,11 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
|
|||||||
for (int i = 0; i < docsToUpdate.size(); i++) {
|
for (int i = 0; i < docsToUpdate.size(); i++) {
|
||||||
SolrInputDocument solrDocument = docsToUpdate.get(i);
|
SolrInputDocument solrDocument = docsToUpdate.get(i);
|
||||||
|
|
||||||
|
// Get the relevant shard client
|
||||||
|
// For a non-sharded core, the shard variable will reference the main core
|
||||||
HttpSolrClient shard = getSolrServer(solrDocument.getFieldValue("[shard]").toString());
|
HttpSolrClient shard = getSolrServer(solrDocument.getFieldValue("[shard]").toString());
|
||||||
|
|
||||||
|
// Delete the document from the shard client
|
||||||
shard.deleteByQuery("uid:" + solrDocument.getFieldValue("uid"));
|
shard.deleteByQuery("uid:" + solrDocument.getFieldValue("uid"));
|
||||||
|
|
||||||
// Now loop over our fieldname actions
|
// Now loop over our fieldname actions
|
||||||
@@ -883,9 +887,12 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// see https://stackoverflow.com/questions/26941260/normalizing-solr-records-for-sharding-version-issues
|
||||||
solrDocument.removeField("_version_");
|
solrDocument.removeField("_version_");
|
||||||
|
// this field will not work with a non-sharded core
|
||||||
solrDocument.removeField("[shard]");
|
solrDocument.removeField("[shard]");
|
||||||
|
|
||||||
|
// Add the updated document to the shard client
|
||||||
shard.add(solrDocument);
|
shard.add(solrDocument);
|
||||||
|
|
||||||
if (commit) {
|
if (commit) {
|
||||||
@@ -1044,9 +1051,9 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public QueryResponse query(String query, String filterQuery,
|
public QueryResponse query(String query, String filterQuery, String facetField, int rows, int max, String dateType,
|
||||||
String facetField, int rows, int max, String dateType, String dateStart,
|
String dateStart, String dateEnd, List<String> facetQueries, String sort,
|
||||||
String dateEnd, List<String> facetQueries, String sort, boolean ascending, int facetMinCount)
|
boolean ascending, int facetMinCount)
|
||||||
throws SolrServerException, IOException {
|
throws SolrServerException, IOException {
|
||||||
|
|
||||||
return query(query, filterQuery, facetField, rows, max, dateType, dateStart, dateEnd, facetQueries, sort,
|
return query(query, filterQuery, facetField, rows, max, dateType, dateStart, dateEnd, facetQueries, sort,
|
||||||
@@ -1065,7 +1072,8 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea
|
|||||||
@Override
|
@Override
|
||||||
public QueryResponse query(String query, String filterQuery, String facetField, int rows, int max, String dateType,
|
public QueryResponse query(String query, String filterQuery, String facetField, int rows, int max, String dateType,
|
||||||
String dateStart, String dateEnd, List<String> facetQueries, String sort,
|
String dateStart, String dateEnd, List<String> facetQueries, String sort,
|
||||||
boolean ascending, int facetMinCount, boolean defaultFilterQueries, boolean includeShardField)
|
boolean ascending, int facetMinCount, boolean defaultFilterQueries,
|
||||||
|
boolean includeShardField)
|
||||||
throws SolrServerException, IOException {
|
throws SolrServerException, IOException {
|
||||||
|
|
||||||
if (solr == null) {
|
if (solr == null) {
|
||||||
|
@@ -118,6 +118,19 @@ public interface SolrLoggerService {
|
|||||||
List<String> fieldNames, List<List<Object>> fieldValuesList)
|
List<String> fieldNames, List<List<Object>> fieldValuesList)
|
||||||
throws SolrServerException, IOException;
|
throws SolrServerException, IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Update the solr core.
|
||||||
|
* @param query
|
||||||
|
* query indicating which documents to update
|
||||||
|
* @param action
|
||||||
|
* the update action keyword
|
||||||
|
* @param fieldNames
|
||||||
|
* the fields to update
|
||||||
|
* @param fieldValuesList
|
||||||
|
* the values for the fields to update
|
||||||
|
* @param commit
|
||||||
|
* whether to commit the changes
|
||||||
|
*/
|
||||||
public void update(String query, String action,
|
public void update(String query, String action,
|
||||||
List<String> fieldNames, List<List<Object>> fieldValuesList, boolean commit)
|
List<String> fieldNames, List<List<Object>> fieldValuesList, boolean commit)
|
||||||
throws SolrServerException, IOException;
|
throws SolrServerException, IOException;
|
||||||
@@ -178,18 +191,84 @@ public interface SolrLoggerService {
|
|||||||
public ObjectCount queryTotal(String query, String filterQuery, int facetMinCount)
|
public ObjectCount queryTotal(String query, String filterQuery, int facetMinCount)
|
||||||
throws SolrServerException, IOException;
|
throws SolrServerException, IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Perform a solr query.
|
||||||
|
*
|
||||||
|
* @param query the query to be used
|
||||||
|
* @param filterQuery filter query
|
||||||
|
* @param facetField field to facet the results by
|
||||||
|
* @param rows the max number of results to return
|
||||||
|
* @param max the max number of facets to return
|
||||||
|
* @param dateType the type to be used (example: DAY, MONTH, YEAR)
|
||||||
|
* @param dateStart the start date Format:(-3, -2, ..) the date is calculated
|
||||||
|
* relative to today
|
||||||
|
* @param dateEnd the end date stop Format (-2, +1, ..) the date is calculated
|
||||||
|
* relative to today
|
||||||
|
* @param facetQueries list of facet queries
|
||||||
|
* @param sort the sort field
|
||||||
|
* @param ascending the sort direction (true: ascending)
|
||||||
|
* @param facetMinCount Minimum count of results facet must have to return a result
|
||||||
|
* @throws SolrServerException Exception from the Solr server to the solrj Java client.
|
||||||
|
* @throws java.io.IOException passed through.
|
||||||
|
*/
|
||||||
public QueryResponse query(String query, String filterQuery,
|
public QueryResponse query(String query, String filterQuery,
|
||||||
String facetField, int rows, int max, String dateType, String dateStart,
|
String facetField, int rows, int max, String dateType, String dateStart,
|
||||||
String dateEnd, List<String> facetQueries, String sort, boolean ascending,
|
String dateEnd, List<String> facetQueries, String sort, boolean ascending,
|
||||||
int facetMinCount)
|
int facetMinCount)
|
||||||
throws SolrServerException, IOException;
|
throws SolrServerException, IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Perform a solr query.
|
||||||
|
*
|
||||||
|
* @param query the query to be used
|
||||||
|
* @param filterQuery filter query
|
||||||
|
* @param facetField field to facet the results by
|
||||||
|
* @param rows the max number of results to return
|
||||||
|
* @param max the max number of facets to return
|
||||||
|
* @param dateType the type to be used (example: DAY, MONTH, YEAR)
|
||||||
|
* @param dateStart the start date Format:(-3, -2, ..) the date is calculated
|
||||||
|
* relative to today
|
||||||
|
* @param dateEnd the end date stop Format (-2, +1, ..) the date is calculated
|
||||||
|
* relative to today
|
||||||
|
* @param facetQueries list of facet queries
|
||||||
|
* @param sort the sort field
|
||||||
|
* @param ascending the sort direction (true: ascending)
|
||||||
|
* @param facetMinCount Minimum count of results facet must have to return a result
|
||||||
|
* @param defaultFilterQueries
|
||||||
|
* use the default filter queries
|
||||||
|
* @throws SolrServerException Exception from the Solr server to the solrj Java client.
|
||||||
|
* @throws java.io.IOException passed through.
|
||||||
|
*/
|
||||||
public QueryResponse query(String query, String filterQuery,
|
public QueryResponse query(String query, String filterQuery,
|
||||||
String facetField, int rows, int max, String dateType, String dateStart,
|
String facetField, int rows, int max, String dateType, String dateStart,
|
||||||
String dateEnd, List<String> facetQueries, String sort, boolean ascending,
|
String dateEnd, List<String> facetQueries, String sort, boolean ascending,
|
||||||
int facetMinCount, boolean defaultFilterQueries)
|
int facetMinCount, boolean defaultFilterQueries)
|
||||||
throws SolrServerException, IOException;
|
throws SolrServerException, IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Perform a solr query.
|
||||||
|
*
|
||||||
|
* @param query the query to be used
|
||||||
|
* @param filterQuery filter query
|
||||||
|
* @param facetField field to facet the results by
|
||||||
|
* @param rows the max number of results to return
|
||||||
|
* @param max the max number of facets to return
|
||||||
|
* @param dateType the type to be used (example: DAY, MONTH, YEAR)
|
||||||
|
* @param dateStart the start date Format:(-3, -2, ..) the date is calculated
|
||||||
|
* relative to today
|
||||||
|
* @param dateEnd the end date stop Format (-2, +1, ..) the date is calculated
|
||||||
|
* relative to today
|
||||||
|
* @param facetQueries list of facet queries
|
||||||
|
* @param sort the sort field
|
||||||
|
* @param ascending the sort direction (true: ascending)
|
||||||
|
* @param facetMinCount Minimum count of results facet must have to return a result
|
||||||
|
* @param defaultFilterQueries
|
||||||
|
* use the default filter queries
|
||||||
|
* @param includeShardField
|
||||||
|
* include the shard field in the result documents
|
||||||
|
* @throws SolrServerException Exception from the Solr server to the solrj Java client.
|
||||||
|
* @throws java.io.IOException passed through.
|
||||||
|
*/
|
||||||
public QueryResponse query(String query, String filterQuery,
|
public QueryResponse query(String query, String filterQuery,
|
||||||
String facetField, int rows, int max, String dateType, String dateStart,
|
String facetField, int rows, int max, String dateType, String dateStart,
|
||||||
String dateEnd, List<String> facetQueries, String sort, boolean ascending,
|
String dateEnd, List<String> facetQueries, String sort, boolean ascending,
|
||||||
@@ -221,10 +300,23 @@ public interface SolrLoggerService {
|
|||||||
*/
|
*/
|
||||||
public void exportHits() throws Exception;
|
public void exportHits() throws Exception;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Commit the solr core.
|
||||||
|
*/
|
||||||
public void commit() throws Exception;
|
public void commit() throws Exception;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Commit a solr shard.
|
||||||
|
* @param shard
|
||||||
|
* The shard to commit.
|
||||||
|
*/
|
||||||
public void commitShard(String shard) throws Exception;
|
public void commitShard(String shard) throws Exception;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Anonymize a given ip
|
||||||
|
* @param ip
|
||||||
|
* The ip to anonymize.
|
||||||
|
*/
|
||||||
public Object anonymizeIp(String ip) throws UnknownHostException;
|
public Object anonymizeIp(String ip) throws UnknownHostException;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user