Files
DSpace/dspace-api/src/test/java/org/dspace/content/DuplicateDetectionTest.java
2024-03-04 11:40:38 +13:00

275 lines
12 KiB
Java

/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.content;
import static junit.framework.TestCase.assertEquals;
import static junit.framework.TestCase.assertNull;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.junit.Assert.fail;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Optional;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.dspace.AbstractIntegrationTestWithDatabase;
import org.dspace.builder.CollectionBuilder;
import org.dspace.builder.CommunityBuilder;
import org.dspace.builder.ItemBuilder;
import org.dspace.builder.WorkflowItemBuilder;
import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.DuplicateDetectionService;
import org.dspace.content.virtual.PotentialDuplicate;
import org.dspace.discovery.SearchServiceException;
import org.dspace.services.ConfigurationService;
import org.dspace.services.factory.DSpaceServicesFactory;
import org.dspace.xmlworkflow.storedcomponents.XmlWorkflowItem;
import org.junit.Before;
import org.junit.Test;
/**
*
* Integration tests for the duplicate detection service
*
* @author Kim Shepherd
*/
public class DuplicateDetectionTest extends AbstractIntegrationTestWithDatabase {
private DuplicateDetectionService duplicateDetectionService = ContentServiceFactory.getInstance()
.getDuplicateDetectionService();
private ConfigurationService configurationService = DSpaceServicesFactory.getInstance().getConfigurationService();
private Collection col;
private Collection workflowCol;
private Item item1;
private Item item2;
private Item item3;
private final String item1IssueDate = "2011-10-17";
private final String item1Subject = "ExtraEntry 1";
private final String item1Title = "Public item I";
private final String item1Author = "Smith, Donald";
private static final Logger log = LogManager.getLogger();
@Before
public void setUp() throws Exception {
super.setUp();
// Temporarily enable duplicate detection and set signature distance to 1
configurationService.setProperty("duplicate.enable", true);
configurationService.setProperty("duplicate.signature.distance", 1);
configurationService.setProperty("duplicate.signature.normalise.lowercase", true);
configurationService.setProperty("duplicate.signature.normalise.whitespace", true);
configurationService.setProperty("duplicate.signature.field", "item_signature");
configurationService.setProperty("duplicate.preview.metadata.field",
new String[]{"dc.date.issued", "dc.subject"});
context.turnOffAuthorisationSystem();
context.setDispatcher("default");
parentCommunity = CommunityBuilder.createCommunity(context).withName("Parent Community").build();
col = CollectionBuilder.createCollection(context, parentCommunity).withName("Collection").build();
workflowCol = CollectionBuilder.createCollection(context, parentCommunity)
.withName("Workflow Collection")
.withWorkflowGroup("reviewer", admin)
.build();
// Ingest three example items with slightly different titles
// item2 is 1 edit distance from item1 and item3
// item1 and item3 are 2 edit distance from each other
item1 = ItemBuilder.createItem(context, col)
.withTitle(item1Title) // Public item I
.withIssueDate(item1IssueDate)
.withAuthor(item1Author)
.withSubject(item1Subject)
.build();
item2 = ItemBuilder.createItem(context, col)
.withTitle("Public item II")
.withIssueDate("2012-10-17")
.withAuthor("Smith, Donald X.")
.withSubject("ExtraEntry 2")
.build();
item3 = ItemBuilder.createItem(context, col)
.withTitle("Public item III")
.withIssueDate("2013-10-17")
.withAuthor("Smith, Donald Y.")
.withSubject("ExtraEntry 3")
.build();
}
/**
* Test instantiation of simple potential duplicate object
*/
@Test
public void testPotentialDuplicateInstantatation() {
PotentialDuplicate potentialDuplicate = new PotentialDuplicate();
// The constructor should instantiate a new list for metadata
assertEquals("Metadata value list size should be 0",
0, potentialDuplicate.getMetadataValueList().size());
// Other properties should not be set
assertNull("Title should be null", potentialDuplicate.getTitle());
//StringUtils.getLevenshteinDistance()
}
/**
* Test instantiation of simple potential duplicate object given an item as a constructor argument
*/
@Test
public void testPotentialDuplicateInstantiationWithItem() {
PotentialDuplicate potentialDuplicate = new PotentialDuplicate(item1);
// We should have title, uuid, owning collection name set and metadata value list instantiated to empty
assertEquals("UUID should match item1 uuid", item1.getID(), potentialDuplicate.getUuid());
assertEquals("Title should match item1 title", item1Title, potentialDuplicate.getTitle());
assertEquals("Owning collection should match item1 owning collection",
item1.getOwningCollection().getName(), potentialDuplicate.getOwningCollectionName());
assertEquals("Metadata value list size should be 0",
0, potentialDuplicate.getMetadataValueList().size());
}
/**
* Test that a search for getPotentialDuplicates returns the expected results, populated with the expected
* preview values and metadata. This is the core method used by the duplicate item link repository and
* detect duplicates submission step.
*
* @throws Exception
*/
@Test
public void testSearchDuplicates() throws Exception {
// Get potential duplicates of item 1:
// Expected: Public item II should appear as it has the configured levenshtein distance of 1
List<PotentialDuplicate> potentialDuplicates = duplicateDetectionService.getPotentialDuplicates(context, item1);
// Make sure result list is size 1
int size = 1;
assertEquals("Potential duplicates of item1 should have size " + size,
size, potentialDuplicates.size());
// The only member should be Public item II (one distance from public item I)
assertEquals("Item II should be be the detected duplicate",
item2.getID(), potentialDuplicates.get(0).getUuid());
// Get potential duplicates of item2:
// Expected: BOTH other items should appear as they are both 1 distance away from "Public item II"
potentialDuplicates = duplicateDetectionService.getPotentialDuplicates(context, item2);
// Sort by title
potentialDuplicates.sort(Comparator.comparing(PotentialDuplicate::getTitle));
// Make sure result list is size 1
size = 2;
assertEquals("Potential duplicates of item2 should have size " + size,
size, potentialDuplicates.size());
// The result list should contain both item1 and item3 in the expected order
assertEquals("item1 should be the first detected duplicate",
item1.getID(), potentialDuplicates.get(0).getUuid());
assertEquals("item3 should be be the second detected duplicate",
item3.getID(), potentialDuplicates.get(1).getUuid());
// Check metadata is populated as per configuration, using item1 (first in results)
// Check for date
Optional<String> foundDate = potentialDuplicates.get(0).getMetadataValueList().stream()
.filter(metadataValue -> metadataValue.getMetadataField().toString('.').equals("dc.date.issued"))
.map(MetadataValue::getValue).findFirst();
assertThat("There should be an issue date found", foundDate.isPresent());
assertEquals("item1 issue date should match the duplicate obj metadata issue date",
item1IssueDate, foundDate.get());
// Check for subject
Optional<String> foundSubject = potentialDuplicates.get(0).getMetadataValueList().stream()
.filter(metadataValue -> metadataValue.getMetadataField().toString('.').equals("dc.subject"))
.map(MetadataValue::getValue).findFirst();
assertThat("There should be a subject found", foundSubject.isPresent());
assertEquals("item1 subject should match the duplicate obj metadata subject",
item1Subject, foundSubject.get());
// Check for author, which was NOT configured to be copied
Optional<String> foundAuthor = potentialDuplicates.get(0).getMetadataValueList().stream()
.filter(metadataValue -> metadataValue.getMetadataField().toString('.')
.equals("dc.contributor.author"))
.map(MetadataValue::getValue).findFirst();
assertThat("There should NOT be an author found", foundAuthor.isEmpty());
}
/**
* Test that a search for getPotentialDuplicates properly escapes Solr reserved characters
* e.g. + - && | | ! ( ) { } [ ] ^ " ~ * ? : \
*
* @throws Exception
*/
@Test
public void testSearchDuplicatesWithReservedSolrCharacters() throws Exception {
Item item4 = ItemBuilder.createItem(context, col)
.withTitle("Testing: An Important Development Step")
.withIssueDate(item1IssueDate)
.withAuthor(item1Author)
.withSubject(item1Subject)
.build();
Item item5 = ItemBuilder.createItem(context, col)
.withTitle("Testing an important development step")
.withIssueDate("2012-10-17")
.withAuthor("Smith, Donald X.")
.withSubject("ExtraEntry 2")
.build();
// Get potential duplicates of item 4 and make sure no exceptions are thrown
List<PotentialDuplicate> potentialDuplicates = new ArrayList<>();
try {
potentialDuplicates = duplicateDetectionService.getPotentialDuplicates(context, item4);
} catch (SearchServiceException e) {
fail("Duplicate search with special characters should NOT result in search exception (" +
e.getMessage() + ")");
}
// Make sure result list is size 1
int size = 1;
assertEquals("Potential duplicates of item1 should have size " + size,
size, potentialDuplicates.size());
// The only member should be item 5
assertEquals("Item 5 should be be the detected duplicate",
item5.getID(), potentialDuplicates.get(0).getUuid());
}
@Test
public void testSearchDuplicatesInWorkflow() throws Exception {
// Get potential duplicates of item 1:
// Expected: Public item II should appear as it has the configured levenshtein distance of 1
context.turnOffAuthorisationSystem();
//context.setDispatcher("default");
XmlWorkflowItem workflowItem1 = WorkflowItemBuilder.createWorkflowItem(context, workflowCol)
.withTitle("Unique title")
.withSubmitter(eperson)
.build();
XmlWorkflowItem workflowItem2 = WorkflowItemBuilder.createWorkflowItem(context, workflowCol)
.withTitle("Unique title")
.withSubmitter(eperson)
.build();
//indexingService.commit();
context.restoreAuthSystemState();
context.setCurrentUser(admin);
List<PotentialDuplicate> potentialDuplicates =
duplicateDetectionService.getPotentialDuplicates(context, workflowItem1.getItem());
// Make sure result list is size 1
int size = 1;
assertEquals("Potential duplicates of item1 should have size " + size,
size, potentialDuplicates.size());
// The only member should be workflow item 2
assertEquals("Workflow item 2 should be be the detected duplicate",
workflowItem2.getItem().getID(), potentialDuplicates.get(0).getUuid());
}
}