/** * The contents of this file are subject to the license and copyright * detailed in the LICENSE and NOTICE files at the root of the source * tree and available online at * * http://www.dspace.org/license/ */ package org.dspace.statistics; import static java.lang.Integer.parseInt; import static java.lang.Thread.currentThread; import static java.lang.Thread.sleep; import static java.util.Arrays.asList; import static java.util.Calendar.DAY_OF_YEAR; import static java.util.Collections.singletonList; import static org.apache.commons.cli.Option.builder; import static org.apache.commons.lang.time.DateFormatUtils.format; import static org.apache.logging.log4j.LogManager.getLogger; import static org.dspace.core.LogHelper.getHeader; import static org.dspace.statistics.SolrLoggerServiceImpl.DATE_FORMAT_8601; import java.io.IOException; import java.util.ArrayList; import java.util.Calendar; import java.util.Collection; import java.util.Date; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.DefaultParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.logging.log4j.Logger; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrDocument; import org.dspace.core.Context; import org.dspace.services.ConfigurationService; import org.dspace.services.factory.DSpaceServicesFactory; import org.dspace.statistics.factory.StatisticsServiceFactory; import org.dspace.statistics.service.SolrLoggerService; /** * Script to anonymize solr statistics according to GDPR specifications. * This script will anonymize records older than a certain threshold, configurable with the * 'anonymize_statistics.time_threshold' config, with a default value of 90 days. * The records will be anonymized by replacing the last part of the ip address with a mask, this mask is configurable: * For IPv4 addresses, the config is 'anonymize_statistics.ip_v4_mask', with a default value of '255' * For IPv6 addresses, the config is 'anonymize_statistics.ip_v6_mask', with a default value of 'FFFF:FFFF' * The DNS value of the records will also be replaced by a mask, configurable with 'anonymize_statistics.dns_mask', * and with a default value of 'anonymized'. */ public class AnonymizeStatistics { private static final Logger log = getLogger(); private static final Context context = new Context(); private static final String action = "anonymize_statistics"; private static final String HELP_OPTION = "h"; private static final String SLEEP_OPTION = "s"; private static final String BATCH_SIZE_OPTION = "b"; private static final String THREADS_OPTION = "t"; private static int sleep; private static final SolrLoggerService solrLoggerService = StatisticsServiceFactory.getInstance().getSolrLoggerService(); private static final ConfigurationService configurationService = DSpaceServicesFactory.getInstance().getConfigurationService(); private static int batchSize = 100; private static int threads = 2; private static final Object DNS_MASK = configurationService.getProperty("anonymize_statistics.dns_mask", "anonymized"); private static final String TIME_LIMIT; static { Calendar calendar = Calendar.getInstance(); calendar.add(DAY_OF_YEAR, -configurationService.getIntProperty("anonymize_statistics.time_threshold", 90)); TIME_LIMIT = format(calendar, DATE_FORMAT_8601); } private AnonymizeStatistics() { } public static void main(String... args) throws ParseException { parseCommandLineOptions(createCommandLineOptions(), args); anonymizeStatistics(); } private static Options createCommandLineOptions() { Options options = new Options(); options.addOption( builder(HELP_OPTION) .longOpt("help") .desc("Print the usage of the script") .hasArg(false) .build() ); options.addOption( builder(SLEEP_OPTION) .longOpt("sleep") .desc("Sleep a certain time given in milliseconds between each solr request") .hasArg(true) .build() ); options.addOption( builder(BATCH_SIZE_OPTION) .longOpt("batch") .desc("The amount of Solr records to be processed per batch (defaults to 100)") .hasArg(true) .build() ); options.addOption( builder(THREADS_OPTION) .longOpt("threads") .desc("The amount of threads used by the script (defaults to 2") .hasArg(true) .build() ); return options; } private static void parseCommandLineOptions(Options options, String... args) throws ParseException { CommandLine commandLine = new DefaultParser().parse(options, args); if (commandLine.hasOption(HELP_OPTION)) { printHelp(options); System.exit(-1); } if (commandLine.hasOption(SLEEP_OPTION)) { sleep = parseInt(commandLine.getOptionValue(SLEEP_OPTION)); } if (commandLine.hasOption(BATCH_SIZE_OPTION)) { batchSize = parseInt(commandLine.getOptionValue(BATCH_SIZE_OPTION)); } if (commandLine.hasOption(THREADS_OPTION)) { threads = parseInt(commandLine.getOptionValue(THREADS_OPTION)); } } private static void printHelp(Options options) { new HelpFormatter().printHelp("dsrun " + AnonymizeStatistics.class.getCanonicalName(), options); } private static void printInfo(String info) { System.out.println(info); log.info(getHeader(context, action, info)); } private static void printWarning(String warning) { System.out.println(warning); log.warn(getHeader(context, action, warning)); } private static void printError(Exception error) { error.printStackTrace(); log.error(getHeader(context, action, error.getMessage()), error); } /** * Anonymize the relevant solr documents, returned by the getDocuments method. */ private static void anonymizeStatistics() { try { long updated = 0; long total = getDocuments().getResults().getNumFound(); printInfo(total + " documents to update"); // The documents will be processed in seperate threads. ExecutorService executorService = Executors.newFixedThreadPool(threads); QueryResponse documents; do { documents = getDocuments(); // list of the processing callables to execute Collection callables = new ArrayList<>(); for (SolrDocument document : documents.getResults()) { updated++; callables.add(new DoProcessing(document, updated)); } // execute the processing callables executorService.invokeAll(callables); // Commit the solr core solrLoggerService.commit(); printInfo("processed " + updated + " records"); } while (documents.getResults().getNumFound() > 0); printInfo(updated + " documents updated"); if (updated == total) { printInfo("all relevant documents were updated"); } else { printWarning("not all relevant documents were updated, check the DSpace logs for more details"); } } catch (Exception e) { printError(e); } } /** * Get the documents to anonymize. * @return * Non-anonymized documents, which are older than the time period configured by the * 'anonymize_statistics.time_threshold' config (or 90 days, if not configured) */ private static QueryResponse getDocuments() throws SolrServerException, IOException { if (sleep > 0) { try { printInfo("sleep " + sleep + "ms"); sleep(sleep); } catch (InterruptedException e) { printError(e); currentThread().interrupt(); } } return solrLoggerService.query( "ip:*", "time:[* TO " + TIME_LIMIT + "] AND -dns:" + DNS_MASK, null, batchSize, -1, null, null, null, null, null, false, -1, false ); } /** * {@link Callable} implementation to process a solr document to be anonymized. * It will return true if the anonymization succeeded. */ public static class DoProcessing implements Callable { private final SolrDocument document; private final long updated; public DoProcessing(SolrDocument document, long updated) { this.document = document; this.updated = updated; } @Override public Boolean call() { try { solrLoggerService.update( "uid:" + document.getFieldValue("uid"), "replace", asList( "ip", "dns" ), asList( singletonList(solrLoggerService.anonymizeIp(document.getFieldValue("ip").toString())), singletonList(DNS_MASK) ), false ); printInfo(updated + ": updated document with uid " + document.getFieldValue("uid") + " " + new Date()); return true; } catch (Exception e) { printError(e); return false; } } } }