diff --git a/dspace-api/src/main/java/org/dspace/statistics/AnonymizeStatistics.java b/dspace-api/src/main/java/org/dspace/statistics/AnonymizeStatistics.java index bb3cd35b40..edb8d9b52d 100644 --- a/dspace-api/src/main/java/org/dspace/statistics/AnonymizeStatistics.java +++ b/dspace-api/src/main/java/org/dspace/statistics/AnonymizeStatistics.java @@ -46,6 +46,16 @@ import org.dspace.services.factory.DSpaceServicesFactory; import org.dspace.statistics.factory.StatisticsServiceFactory; import org.dspace.statistics.service.SolrLoggerService; +/** + * Script to anonymize solr statistics according to GDPR specifications. + * This script will anonymize records older than a certain threshold, configurable with the + * 'anonymize_statistics.time_threshold' config, with a default value of 90 days. + * The records will be anonymized by replacing the last part of the ip address with a mask; this mask is configurable: + * For IPv4 addresses, the config is 'anonymize_statistics.ip_v4_mask', with a default value of '255' + * For IPv6 addresses, the config is 'anonymize_statistics.ip_v6_mask', with a default value of 'FFFF:FFFF' + * The DNS value of the records will also be replaced by a mask, configurable with 'anonymize_statistics.dns_mask', + * and with a default value of 'anonymized'. + */ public class AnonymizeStatistics { private static Logger log = getLogger(AnonymizeStatistics.class); @@ -169,19 +179,25 @@ public class AnonymizeStatistics { } + /** + * Anonymize the relevant solr documents, returned by the getDocuments method. + */ private static void anonymizeStatistics() { try { long updated = 0; long total = getDocuments().getResults().getNumFound(); printInfo(total + " documents to update"); + // The documents will be processed in separate threads. 
ExecutorService executorService = Executors.newFixedThreadPool(threads); QueryResponse documents; do { documents = getDocuments(); - Collection> callables = new ArrayList<>(); + // list of the processing callables to execute + Collection callables = new ArrayList<>(); + // list of the shards to commit Set shards = new HashSet<>(); for (SolrDocument document : documents.getResults()) { @@ -195,10 +211,13 @@ public class AnonymizeStatistics { } } + // execute the processing callables executorService.invokeAll(callables); + // Commit the main core solrLoggerService.commit(); + // Commit all relevant solr shards for (String shard : shards) { solrLoggerService.commitShard(shard); } @@ -218,6 +237,12 @@ public class AnonymizeStatistics { } } + /** + * Get the documents to anonymize. + * @return + * Non-anonymized documents, which are older than the time period configured by the + * 'anonymize_statistics.time_threshold' config (or 90 days, if not configured) + */ private static QueryResponse getDocuments() throws SolrServerException, IOException { if (sleep > 0) { @@ -231,13 +256,19 @@ public class AnonymizeStatistics { } return solrLoggerService.query( - "ip:*", - "time:[* TO " + TIME_LIMIT + "] AND -dns:" + DNS_MASK, - null, batchSize, -1, null, null, null, null, null, false, false, true + "ip:*", + "time:[* TO " + TIME_LIMIT + "] AND -dns:" + DNS_MASK, + null, batchSize, -1, null, null, null, null, + null, false, -1, false, true ); } + /** + * {@link Callable} implementation to process a solr document to be anonymized. + * It will return true if the anonymization succeeded. 
+ */ public static class DoProcessing implements Callable { + private final SolrDocument document; private final long updated; diff --git a/dspace-api/src/main/java/org/dspace/statistics/SolrLoggerServiceImpl.java b/dspace-api/src/main/java/org/dspace/statistics/SolrLoggerServiceImpl.java index 27f2ce3720..57006e81c2 100644 --- a/dspace-api/src/main/java/org/dspace/statistics/SolrLoggerServiceImpl.java +++ b/dspace-api/src/main/java/org/dspace/statistics/SolrLoggerServiceImpl.java @@ -853,7 +853,11 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea for (int i = 0; i < docsToUpdate.size(); i++) { SolrInputDocument solrDocument = docsToUpdate.get(i); + // Get the relevant shard client + // For a non-sharded core, the shard variable will reference the main core HttpSolrClient shard = getSolrServer(solrDocument.getFieldValue("[shard]").toString()); + + // Delete the document from the shard client shard.deleteByQuery("uid:" + solrDocument.getFieldValue("uid")); // Now loop over our fieldname actions @@ -883,9 +887,12 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea } } + // see https://stackoverflow.com/questions/26941260/normalizing-solr-records-for-sharding-version-issues solrDocument.removeField("_version_"); + // this field will not work with a non-sharded core solrDocument.removeField("[shard]"); + // Add the updated document to the shard client shard.add(solrDocument); if (commit) { @@ -1044,9 +1051,9 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea } @Override - public QueryResponse query(String query, String filterQuery, - String facetField, int rows, int max, String dateType, String dateStart, - String dateEnd, List facetQueries, String sort, boolean ascending, int facetMinCount) + public QueryResponse query(String query, String filterQuery, String facetField, int rows, int max, String dateType, + String dateStart, String dateEnd, List facetQueries, String 
sort, + boolean ascending, int facetMinCount) throws SolrServerException, IOException { return query(query, filterQuery, facetField, rows, max, dateType, dateStart, dateEnd, facetQueries, sort, @@ -1065,7 +1072,8 @@ public class SolrLoggerServiceImpl implements SolrLoggerService, InitializingBea @Override public QueryResponse query(String query, String filterQuery, String facetField, int rows, int max, String dateType, String dateStart, String dateEnd, List facetQueries, String sort, - boolean ascending, int facetMinCount, boolean defaultFilterQueries, boolean includeShardField) + boolean ascending, int facetMinCount, boolean defaultFilterQueries, + boolean includeShardField) throws SolrServerException, IOException { if (solr == null) { diff --git a/dspace-api/src/main/java/org/dspace/statistics/service/SolrLoggerService.java b/dspace-api/src/main/java/org/dspace/statistics/service/SolrLoggerService.java index 9d28fad1ab..c32664bdeb 100644 --- a/dspace-api/src/main/java/org/dspace/statistics/service/SolrLoggerService.java +++ b/dspace-api/src/main/java/org/dspace/statistics/service/SolrLoggerService.java @@ -118,6 +118,19 @@ public interface SolrLoggerService { List fieldNames, List> fieldValuesList) throws SolrServerException, IOException; + /** + * Update the solr core. + * @param query + * query indicating which documents to update + * @param action + * the update action keyword + * @param fieldNames + * the fields to update + * @param fieldValuesList + * the values for the fields to update + * @param commit + * whether to commit the changes + */ public void update(String query, String action, List fieldNames, List> fieldValuesList, boolean commit) throws SolrServerException, IOException; @@ -178,18 +191,84 @@ public interface SolrLoggerService { public ObjectCount queryTotal(String query, String filterQuery, int facetMinCount) throws SolrServerException, IOException; + /** + * Perform a solr query. 
+ * + * @param query the query to be used + * @param filterQuery filter query + * @param facetField field to facet the results by + * @param rows the max number of results to return + * @param max the max number of facets to return + * @param dateType the type to be used (example: DAY, MONTH, YEAR) + * @param dateStart the start date Format:(-3, -2, ..) the date is calculated + * relatively on today + * @param dateEnd the end date stop Format (-2, +1, ..) the date is calculated + * relatively on today + * @param facetQueries list of facet queries + * @param sort the sort field + * @param ascending the sort direction (true: ascending) + * @param facetMinCount Minimum count of results facet must have to return a result + * @throws SolrServerException Exception from the Solr server to the solrj Java client. + * @throws java.io.IOException passed through. + */ public QueryResponse query(String query, String filterQuery, String facetField, int rows, int max, String dateType, String dateStart, String dateEnd, List facetQueries, String sort, boolean ascending, int facetMinCount) throws SolrServerException, IOException; + /** + * Perform a solr query. + * + * @param query the query to be used + * @param filterQuery filter query + * @param facetField field to facet the results by + * @param rows the max number of results to return + * @param max the max number of facets to return + * @param dateType the type to be used (example: DAY, MONTH, YEAR) + * @param dateStart the start date Format:(-3, -2, ..) the date is calculated + * relatively on today + * @param dateEnd the end date stop Format (-2, +1, ..) 
the date is calculated + * relatively on today + * @param facetQueries list of facet queries + * @param sort the sort field + * @param ascending the sort direction (true: ascending) + * @param facetMinCount Minimum count of results facet must have to return a result + * @param defaultFilterQueries + * use the default filter queries + * @throws SolrServerException Exception from the Solr server to the solrj Java client. + * @throws java.io.IOException passed through. + */ public QueryResponse query(String query, String filterQuery, String facetField, int rows, int max, String dateType, String dateStart, String dateEnd, List facetQueries, String sort, boolean ascending, int facetMinCount, boolean defaultFilterQueries) throws SolrServerException, IOException; + /** + * Perform a solr query. + * + * @param query the query to be used + * @param filterQuery filter query + * @param facetField field to facet the results by + * @param rows the max number of results to return + * @param max the max number of facets to return + * @param dateType the type to be used (example: DAY, MONTH, YEAR) + * @param dateStart the start date Format:(-3, -2, ..) the date is calculated + * relatively on today + * @param dateEnd the end date stop Format (-2, +1, ..) the date is calculated + * relatively on today + * @param facetQueries list of facet queries + * @param sort the sort field + * @param ascending the sort direction (true: ascending) + * @param facetMinCount Minimum count of results facet must have to return a result + * @param defaultFilterQueries + * use the default filter queries + * @param includeShardField + * include the shard field in the result documents + * @throws SolrServerException Exception from the Solr server to the solrj Java client. + * @throws java.io.IOException passed through. 
+ */ public QueryResponse query(String query, String filterQuery, String facetField, int rows, int max, String dateType, String dateStart, String dateEnd, List facetQueries, String sort, boolean ascending, @@ -221,10 +300,23 @@ public interface SolrLoggerService { */ public void exportHits() throws Exception; + /** + * Commit the solr core. + */ public void commit() throws Exception; + /** + * Commit a solr shard. + * @param shard + * The shard to commit. + */ public void commitShard(String shard) throws Exception; + /** + * Anonymize a given IP address. + * @param ip + * The IP address to anonymize. + */ public Object anonymizeIp(String ip) throws UnknownHostException; }