90830: Issue 8125: Import items stale in discovery

This commit is contained in:
Kristof De Langhe
2022-04-20 13:21:35 +02:00
parent 0ae13fe0fb
commit 80158aadfe
5 changed files with 88 additions and 33 deletions

View File

@@ -1,3 +1,10 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.app.itemdbstatus; package org.dspace.app.itemdbstatus;
import static org.dspace.discovery.indexobject.ItemIndexFactoryImpl.STATUS_FIELD; import static org.dspace.discovery.indexobject.ItemIndexFactoryImpl.STATUS_FIELD;
@@ -5,8 +12,8 @@ import static org.dspace.discovery.indexobject.ItemIndexFactoryImpl.STATUS_FIELD
import java.io.IOException; import java.io.IOException;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.Calendar;
import java.util.Optional; import java.util.Optional;
import java.util.UUID;
import org.apache.commons.cli.ParseException; import org.apache.commons.cli.ParseException;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
@@ -14,36 +21,38 @@ import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocument;
import org.dspace.content.Item;
import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.ItemService;
import org.dspace.core.Context; import org.dspace.core.Context;
import org.dspace.discovery.DiscoverQuery;
import org.dspace.discovery.DiscoverResult;
import org.dspace.discovery.IndexableObject; import org.dspace.discovery.IndexableObject;
import org.dspace.discovery.IndexingService; import org.dspace.discovery.IndexingService;
import org.dspace.discovery.SearchService;
import org.dspace.discovery.SearchServiceException; import org.dspace.discovery.SearchServiceException;
import org.dspace.discovery.SearchUtils; import org.dspace.discovery.SearchUtils;
import org.dspace.discovery.SolrSearchCore; import org.dspace.discovery.SolrSearchCore;
import org.dspace.discovery.indexobject.IndexableItem; import org.dspace.discovery.indexobject.IndexableItem;
import org.dspace.discovery.indexobject.factory.IndexObjectFactoryFactory; import org.dspace.discovery.indexobject.factory.IndexObjectFactoryFactory;
import org.dspace.scripts.DSpaceRunnable; import org.dspace.scripts.DSpaceRunnable;
import org.dspace.services.ConfigurationService;
import org.dspace.services.factory.DSpaceServicesFactory; import org.dspace.services.factory.DSpaceServicesFactory;
import org.dspace.util.SolrUtils;
import org.dspace.utils.DSpace; import org.dspace.utils.DSpace;
/** /**
* Created by kristof on 19/04/2022 * {@link DSpaceRunnable} implementation to update solr items with "predb" status to either:
* - Delete them from solr if they're not present in the database
* - Remove their status if they're present in the database
*/ */
public class ItemDatabaseStatusCli extends DSpaceRunnable<ItemDatabaseStatusCliScriptConfiguration> { public class ItemDatabaseStatusCli extends DSpaceRunnable<ItemDatabaseStatusCliScriptConfiguration> {
/* Log4j logger */ /* Log4j logger */
private static final Logger log = Logger.getLogger(ItemDatabaseStatusCli.class); private static final Logger log = Logger.getLogger(ItemDatabaseStatusCli.class);
private SearchService searchService; public static final String TIME_UNTIL_REINDEX_PROPERTY = "item-database-status.time-until-reindex";
private ItemService itemService;
private IndexingService indexingService; private IndexingService indexingService;
private SolrSearchCore solrSearchCore; private SolrSearchCore solrSearchCore;
private IndexObjectFactoryFactory indexObjectServiceFactory; private IndexObjectFactoryFactory indexObjectServiceFactory;
private ConfigurationService configurationService;
private int timeUntilReindex = 0;
private String maxTime;
@Override @Override
public ItemDatabaseStatusCliScriptConfiguration getScriptConfiguration() { public ItemDatabaseStatusCliScriptConfiguration getScriptConfiguration() {
@@ -53,18 +62,20 @@ public class ItemDatabaseStatusCli extends DSpaceRunnable<ItemDatabaseStatusCliS
@Override @Override
public void setup() throws ParseException { public void setup() throws ParseException {
searchService = SearchUtils.getSearchService();
itemService = ContentServiceFactory.getInstance().getItemService();
indexingService = DSpaceServicesFactory.getInstance().getServiceManager() indexingService = DSpaceServicesFactory.getInstance().getServiceManager()
.getServiceByName(IndexingService.class.getName(), IndexingService.class); .getServiceByName(IndexingService.class.getName(), IndexingService.class);
solrSearchCore = DSpaceServicesFactory.getInstance().getServiceManager() solrSearchCore = DSpaceServicesFactory.getInstance().getServiceManager()
.getServiceByName(SolrSearchCore.class.getName(), SolrSearchCore.class); .getServiceByName(SolrSearchCore.class.getName(), SolrSearchCore.class);
indexObjectServiceFactory = IndexObjectFactoryFactory.getInstance(); indexObjectServiceFactory = IndexObjectFactoryFactory.getInstance();
configurationService = DSpaceServicesFactory.getInstance().getConfigurationService();
} }
@Override @Override
public void internalRun() throws Exception { public void internalRun() throws Exception {
logAndOut("Starting Item Database Status update..."); logInfoAndOut("Starting Item Database Status update...");
timeUntilReindex = getTimeUntilReindex();
maxTime = getMaxTime();
Context context = new Context(); Context context = new Context();
@@ -81,15 +92,20 @@ public class ItemDatabaseStatusCli extends DSpaceRunnable<ItemDatabaseStatusCliS
SolrQuery solrQuery = new SolrQuery(); SolrQuery solrQuery = new SolrQuery();
solrQuery.setQuery(STATUS_FIELD + ":" + STATUS_FIELD_PREDB); solrQuery.setQuery(STATUS_FIELD + ":" + STATUS_FIELD_PREDB);
solrQuery.addFilterQuery(SearchUtils.RESOURCE_TYPE_FIELD + ":" + IndexableItem.TYPE); solrQuery.addFilterQuery(SearchUtils.RESOURCE_TYPE_FIELD + ":" + IndexableItem.TYPE);
String dateRangeFilter = SearchUtils.LAST_INDEXED_FIELD + ":[* TO " + maxTime + "]";
logDebugAndOut("Date range filter used; " + dateRangeFilter);
solrQuery.addFilterQuery(dateRangeFilter);
solrQuery.addField(SearchUtils.RESOURCE_ID_FIELD); solrQuery.addField(SearchUtils.RESOURCE_ID_FIELD);
solrQuery.addField(SearchUtils.RESOURCE_UNIQUE_ID); solrQuery.addField(SearchUtils.RESOURCE_UNIQUE_ID);
QueryResponse response = solrSearchCore.getSolr().query(solrQuery, solrSearchCore.REQUEST_METHOD); QueryResponse response = solrSearchCore.getSolr().query(solrQuery, solrSearchCore.REQUEST_METHOD);
if (response != null) { if (response != null) {
logInfoAndOut(response.getResults().size() + " items found to process");
for (SolrDocument doc : response.getResults()) { for (SolrDocument doc : response.getResults()) {
String uuid = (String) doc.getFirstValue(SearchUtils.RESOURCE_ID_FIELD); String uuid = (String) doc.getFirstValue(SearchUtils.RESOURCE_ID_FIELD);
String uniqueId = (String) doc.getFirstValue(SearchUtils.RESOURCE_UNIQUE_ID); String uniqueId = (String) doc.getFirstValue(SearchUtils.RESOURCE_UNIQUE_ID);
logAndOut("Processing item with UUID: " + uuid); logDebugAndOut("Processing item with UUID: " + uuid);
Optional<IndexableObject> indexableObject = Optional.empty(); Optional<IndexableObject> indexableObject = Optional.empty();
try { try {
@@ -102,10 +118,10 @@ public class ItemDatabaseStatusCli extends DSpaceRunnable<ItemDatabaseStatusCliS
try { try {
if (indexableObject.isPresent()) { if (indexableObject.isPresent()) {
logAndOut("Item exists in DB, updating solr document"); logDebugAndOut("Item exists in DB, updating solr document");
updateItem(context, indexableObject.get()); updateItem(context, indexableObject.get());
} else { } else {
logAndOut("Item doesn't exist in DB, removing solr document"); logDebugAndOut("Item doesn't exist in DB, removing solr document");
removeItem(context, uniqueId); removeItem(context, uniqueId);
} }
} catch (SQLException | IOException e) { } catch (SQLException | IOException e) {
@@ -125,8 +141,25 @@ public class ItemDatabaseStatusCli extends DSpaceRunnable<ItemDatabaseStatusCliS
indexingService.unIndexContent(context, uniqueId); indexingService.unIndexContent(context, uniqueId);
} }
private void logAndOut(String message) { private String getMaxTime() {
Calendar cal = Calendar.getInstance();
if (timeUntilReindex > 0) {
cal.add(Calendar.MILLISECOND, -timeUntilReindex);
}
return SolrUtils.getDateFormatter().format(cal.getTime());
}
/**
 * Looks up the {@link #TIME_UNTIL_REINDEX_PROPERTY} setting from the configuration service.
 * The value is later subtracted from the current time as a number of milliseconds.
 *
 * @return the configured integer value, with 0 passed as the default to the configuration lookup
 */
private int getTimeUntilReindex() {
    final int fallback = 0;
    return configurationService.getIntProperty(TIME_UNTIL_REINDEX_PROPERTY, fallback);
}
private void logInfoAndOut(String message) {
log.info(message); log.info(message);
System.out.println(message); System.out.println(message);
} }
/**
 * Writes the given message to the log at debug level and also prints it to standard output.
 * The print to standard output is unconditional: it happens regardless of the logger's level.
 *
 * @param message the text to log and print
 */
private void logDebugAndOut(String message) {
log.debug(message);
System.out.println(message);
}
} }

View File

@@ -1,3 +1,10 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.app.itemdbstatus; package org.dspace.app.itemdbstatus;
import org.apache.commons.cli.Options; import org.apache.commons.cli.Options;
@@ -5,7 +12,7 @@ import org.dspace.core.Context;
import org.dspace.scripts.configuration.ScriptConfiguration; import org.dspace.scripts.configuration.ScriptConfiguration;
/** /**
* Created by kristof on 19/04/2022 * The {@link ScriptConfiguration} for the {@link ItemDatabaseStatusCli} script.
*/ */
public class ItemDatabaseStatusCliScriptConfiguration extends ScriptConfiguration<ItemDatabaseStatusCli> { public class ItemDatabaseStatusCliScriptConfiguration extends ScriptConfiguration<ItemDatabaseStatusCli> {
private Class<ItemDatabaseStatusCli> dspaceRunnableClass; private Class<ItemDatabaseStatusCli> dspaceRunnableClass;

View File

@@ -15,6 +15,8 @@ import java.util.Set;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.dspace.content.Bundle; import org.dspace.content.Bundle;
import org.dspace.content.DSpaceObject; import org.dspace.content.DSpaceObject;
import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.ItemService;
import org.dspace.core.Constants; import org.dspace.core.Constants;
import org.dspace.core.Context; import org.dspace.core.Context;
import org.dspace.discovery.indexobject.factory.IndexFactory; import org.dspace.discovery.indexobject.factory.IndexFactory;
@@ -38,7 +40,7 @@ public class IndexEventConsumer implements Consumer {
// collect Items, Collections, Communities that need indexing // collect Items, Collections, Communities that need indexing
private Set<IndexableObject> objectsToUpdate = new HashSet<>(); private Set<IndexableObject> objectsToUpdate = new HashSet<>();
// collect freshly created Items that need indexing (requires pre-db status) // collect freshly created Items that need indexing and require pre-db status
private Set<IndexableObject> createdItemsToUpdate = new HashSet<>(); private Set<IndexableObject> createdItemsToUpdate = new HashSet<>();
// unique search IDs to delete // unique search IDs to delete
@@ -50,6 +52,8 @@ public class IndexEventConsumer implements Consumer {
IndexObjectFactoryFactory indexObjectServiceFactory = IndexObjectFactoryFactory.getInstance(); IndexObjectFactoryFactory indexObjectServiceFactory = IndexObjectFactoryFactory.getInstance();
ItemService itemService = ContentServiceFactory.getInstance().getItemService();
@Override @Override
public void initialize() throws Exception { public void initialize() throws Exception {
@@ -147,11 +151,8 @@ public class IndexEventConsumer implements Consumer {
String detail = indexableObjectService.getType() + "-" + event.getSubjectID().toString(); String detail = indexableObjectService.getType() + "-" + event.getSubjectID().toString();
uniqueIdsToDelete.add(detail); uniqueIdsToDelete.add(detail);
} }
if (st == Constants.ITEM && et == Event.CREATE && object == null) {
createdItemsToUpdate.addAll(indexObjectServiceFactory.getIndexableObjects(ctx, subject)); objectsToUpdate.addAll(indexObjectServiceFactory.getIndexableObjects(ctx, subject));
} else {
objectsToUpdate.addAll(indexObjectServiceFactory.getIndexableObjects(ctx, subject));
}
} }
break; break;
@@ -170,7 +171,13 @@ public class IndexEventConsumer implements Consumer {
// also update the object in order to index mapped/unmapped Items // also update the object in order to index mapped/unmapped Items
if (subject != null && if (subject != null &&
subject.getType() == Constants.COLLECTION && object.getType() == Constants.ITEM) { subject.getType() == Constants.COLLECTION && object.getType() == Constants.ITEM) {
createdItemsToUpdate.addAll(indexObjectServiceFactory.getIndexableObjects(ctx, object)); // If the item doesn't exist in the database yet, add it to createdItemsToUpdate
// Otherwise use the standard objectsToUpdate
if (itemService.find(ctx, object.getID()) == null) {
createdItemsToUpdate.addAll(indexObjectServiceFactory.getIndexableObjects(ctx, object));
} else {
objectsToUpdate.addAll(indexObjectServiceFactory.getIndexableObjects(ctx, object));
}
} }
} }
break; break;

View File

@@ -759,6 +759,7 @@ public class SolrServiceImpl implements SearchService, IndexingService {
solrQuery.addField(SearchUtils.RESOURCE_TYPE_FIELD); solrQuery.addField(SearchUtils.RESOURCE_TYPE_FIELD);
solrQuery.addField(SearchUtils.RESOURCE_ID_FIELD); solrQuery.addField(SearchUtils.RESOURCE_ID_FIELD);
solrQuery.addField(SearchUtils.RESOURCE_UNIQUE_ID); solrQuery.addField(SearchUtils.RESOURCE_UNIQUE_ID);
solrQuery.addField(STATUS_FIELD);
if (discoveryQuery.isSpellCheck()) { if (discoveryQuery.isSpellCheck()) {
solrQuery.setParam(SpellingParams.SPELLCHECK_Q, query); solrQuery.setParam(SpellingParams.SPELLCHECK_Q, query);
@@ -766,9 +767,6 @@ public class SolrServiceImpl implements SearchService, IndexingService {
solrQuery.setParam("spellcheck", Boolean.TRUE); solrQuery.setParam("spellcheck", Boolean.TRUE);
} }
// Exclude items with status:predb to avoid solr docs being removed during large imports (Issue #8125)
solrQuery.addFilterQuery("!" + STATUS_FIELD + ":" + STATUS_FIELD_PREDB);
for (int i = 0; i < discoveryQuery.getFilterQueries().size(); i++) { for (int i = 0; i < discoveryQuery.getFilterQueries().size(); i++) {
String filterQuery = discoveryQuery.getFilterQueries().get(i); String filterQuery = discoveryQuery.getFilterQueries().get(i);
solrQuery.addFilterQuery(filterQuery); solrQuery.addFilterQuery(filterQuery);
@@ -912,11 +910,14 @@ public class SolrServiceImpl implements SearchService, IndexingService {
// Enables solr to remove documents related to items not on database anymore (Stale) // Enables solr to remove documents related to items not on database anymore (Stale)
// if maxAttempts is greater than or equal to 0, clean up the index on each step // if maxAttempts is greater than or equal to 0, clean up the index on each step
if (maxAttempts >= 0) { if (maxAttempts >= 0) {
zombieDocs.add((String) doc.getFirstValue(SearchUtils.RESOURCE_UNIQUE_ID)); Object statusObj = doc.getFirstValue(STATUS_FIELD);
// avoid to process the response except if we are in the last allowed execution. if (!(statusObj instanceof String && statusObj.equals(STATUS_FIELD_PREDB))) {
// When maxAttempts is 0 this will be just the first and last run as the zombieDocs.add((String) doc.getFirstValue(SearchUtils.RESOURCE_UNIQUE_ID));
// executionCount is increased at the start of the loop it will be equals to 1 // avoid to process the response except if we are in the last allowed execution.
skipLoadingResponse = maxAttempts + 1 != executionCount; // When maxAttempts is 0 this will be just the first and last run as the
// executionCount is increased at the start of the loop it will be equals to 1
skipLoadingResponse = maxAttempts + 1 != executionCount;
}
} }
continue; continue;
} }

View File

@@ -1533,6 +1533,13 @@ mail.helpdesk.name = Help Desk
request.item.helpdesk.override = false request.item.helpdesk.override = false
#---------------------------------------------------------------#
#----------ITEM DATABASE STATUS SCRIPT CONFIGURATION------------#
#---------------------------------------------------------------#
# The max amount of time allowed for an item to be present in solr with predb status without needing a reindex (in ms)
item-database-status.time-until-reindex = 600000
#------------------------------------------------------------------# #------------------------------------------------------------------#
#-------------------MODULE CONFIGURATIONS--------------------------# #-------------------MODULE CONFIGURATIONS--------------------------#
#------------------------------------------------------------------# #------------------------------------------------------------------#