DURACOM-199 fix sitemap generator for restricted content and improve performance

This commit is contained in:
Andrea Bollini
2023-11-07 15:51:23 +01:00
parent dac4df9c1a
commit 6d9ca388da
3 changed files with 133 additions and 90 deletions

View File

@@ -11,7 +11,6 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.Date; import java.util.Date;
import java.util.Iterator;
import java.util.List; import java.util.List;
import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLine;
@@ -24,9 +23,6 @@ import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.dspace.content.Collection;
import org.dspace.content.Community;
import org.dspace.content.Item;
import org.dspace.content.factory.ContentServiceFactory; import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.CollectionService; import org.dspace.content.service.CollectionService;
import org.dspace.content.service.CommunityService; import org.dspace.content.service.CommunityService;
@@ -35,6 +31,7 @@ import org.dspace.core.Context;
import org.dspace.core.LogHelper; import org.dspace.core.LogHelper;
import org.dspace.discovery.DiscoverQuery; import org.dspace.discovery.DiscoverQuery;
import org.dspace.discovery.DiscoverResult; import org.dspace.discovery.DiscoverResult;
import org.dspace.discovery.IndexableObject;
import org.dspace.discovery.SearchService; import org.dspace.discovery.SearchService;
import org.dspace.discovery.SearchServiceException; import org.dspace.discovery.SearchServiceException;
import org.dspace.discovery.SearchUtils; import org.dspace.discovery.SearchUtils;
@@ -60,6 +57,7 @@ public class GenerateSitemaps {
private static final ConfigurationService configurationService = private static final ConfigurationService configurationService =
DSpaceServicesFactory.getInstance().getConfigurationService(); DSpaceServicesFactory.getInstance().getConfigurationService();
private static final SearchService searchService = SearchUtils.getSearchService(); private static final SearchService searchService = SearchUtils.getSearchService();
private static final int PAGE_SIZE = 100;
/** /**
* Default constructor * Default constructor
@@ -183,96 +181,113 @@ public class GenerateSitemaps {
} }
Context c = new Context(Context.Mode.READ_ONLY); Context c = new Context(Context.Mode.READ_ONLY);
int offset = 0;
long commsCount = 0;
long collsCount = 0;
long itemsCount = 0;
List<Community> comms = communityService.findAll(c); try {
DiscoverQuery discoveryQuery = new DiscoverQuery();
discoveryQuery.setMaxResults(PAGE_SIZE);
discoveryQuery.setQuery("search.resourcetype:Community");
do {
discoveryQuery.setStart(offset);
DiscoverResult discoverResult = searchService.search(c, discoveryQuery);
List<IndexableObject> docs = discoverResult.getIndexableObjects();
commsCount = discoverResult.getTotalSearchResults();
for (Community comm : comms) { for (IndexableObject doc : docs) {
String url = uiURLStem + "communities/" + comm.getID(); String url = uiURLStem + "communities/" + doc.getID();
c.uncacheEntity(doc.getIndexedObject());
if (makeHTMLMap) {
html.addURL(url, null);
}
if (makeSitemapOrg) {
sitemapsOrg.addURL(url, null);
}
}
offset += PAGE_SIZE;
} while (offset < commsCount);
offset = 0;
discoveryQuery = new DiscoverQuery();
discoveryQuery.setMaxResults(PAGE_SIZE);
discoveryQuery.setQuery("search.resourcetype:Collection");
do {
discoveryQuery.setStart(offset);
DiscoverResult discoverResult = searchService.search(c, discoveryQuery);
List<IndexableObject> docs = discoverResult.getIndexableObjects();
collsCount = discoverResult.getTotalSearchResults();
for (IndexableObject doc : docs) {
String url = uiURLStem + "collections/" + doc.getID();
c.uncacheEntity(doc.getIndexedObject());
if (makeHTMLMap) {
html.addURL(url, null);
}
if (makeSitemapOrg) {
sitemapsOrg.addURL(url, null);
}
}
offset += PAGE_SIZE;
} while (offset < collsCount);
offset = 0;
discoveryQuery = new DiscoverQuery();
discoveryQuery.setMaxResults(PAGE_SIZE);
discoveryQuery.setQuery("search.resourcetype:Item");
discoveryQuery.addSearchField("search.entitytype");
do {
discoveryQuery.setStart(offset);
DiscoverResult discoverResult = searchService.search(c, discoveryQuery);
List<IndexableObject> docs = discoverResult.getIndexableObjects();
itemsCount = discoverResult.getTotalSearchResults();
for (IndexableObject doc : docs) {
String url;
List<String> entityTypeFieldValues = discoverResult.getSearchDocument(doc).get(0)
.getSearchFieldValues("search.entitytype");
if (CollectionUtils.isNotEmpty(entityTypeFieldValues)) {
url = uiURLStem + "entities/" + StringUtils.lowerCase(entityTypeFieldValues.get(0)) + "/"
+ doc.getID();
} else {
url = uiURLStem + "items/" + doc.getID();
}
Date lastMod = doc.getLastModified();
c.uncacheEntity(doc.getIndexedObject());
if (makeHTMLMap) {
html.addURL(url, null);
}
if (makeSitemapOrg) {
sitemapsOrg.addURL(url, null);
}
}
offset += PAGE_SIZE;
} while (offset < itemsCount);
if (makeHTMLMap) { if (makeHTMLMap) {
html.addURL(url, null); int files = html.finish();
log.info(LogHelper.getHeader(c, "write_sitemap",
"type=html,num_files=" + files + ",communities="
+ commsCount + ",collections=" + collsCount
+ ",items=" + itemsCount));
} }
if (makeSitemapOrg) { if (makeSitemapOrg) {
sitemapsOrg.addURL(url, null); int files = sitemapsOrg.finish();
log.info(LogHelper.getHeader(c, "write_sitemap",
"type=html,num_files=" + files + ",communities="
+ commsCount + ",collections=" + collsCount
+ ",items=" + itemsCount));
} }
} catch (SearchServiceException e) {
c.uncacheEntity(comm); throw new RuntimeException(e);
} finally {
c.abort();
} }
List<Collection> colls = collectionService.findAll(c);
for (Collection coll : colls) {
String url = uiURLStem + "collections/" + coll.getID();
if (makeHTMLMap) {
html.addURL(url, null);
}
if (makeSitemapOrg) {
sitemapsOrg.addURL(url, null);
}
c.uncacheEntity(coll);
}
Iterator<Item> allItems = itemService.findAll(c);
int itemCount = 0;
while (allItems.hasNext()) {
Item i = allItems.next();
DiscoverQuery entityQuery = new DiscoverQuery();
entityQuery.setQuery("search.uniqueid:\"Item-" + i.getID() + "\" and entityType:*");
entityQuery.addSearchField("entityType");
try {
DiscoverResult discoverResult = searchService.search(c, entityQuery);
String url;
if (CollectionUtils.isNotEmpty(discoverResult.getIndexableObjects())
&& CollectionUtils.isNotEmpty(discoverResult.getSearchDocument(
discoverResult.getIndexableObjects().get(0)).get(0).getSearchFieldValues("entityType"))
&& StringUtils.isNotBlank(discoverResult.getSearchDocument(
discoverResult.getIndexableObjects().get(0)).get(0).getSearchFieldValues("entityType").get(0))
) {
url = uiURLStem + "entities/" + StringUtils.lowerCase(discoverResult.getSearchDocument(
discoverResult.getIndexableObjects().get(0))
.get(0).getSearchFieldValues("entityType").get(0)) + "/" + i.getID();
} else {
url = uiURLStem + "items/" + i.getID();
}
Date lastMod = i.getLastModified();
if (makeHTMLMap) {
html.addURL(url, lastMod);
}
if (makeSitemapOrg) {
sitemapsOrg.addURL(url, lastMod);
}
} catch (SearchServiceException e) {
log.error("Failed getting entitytype through solr for item " + i.getID() + ": " + e.getMessage());
}
c.uncacheEntity(i);
itemCount++;
}
if (makeHTMLMap) {
int files = html.finish();
log.info(LogHelper.getHeader(c, "write_sitemap",
"type=html,num_files=" + files + ",communities="
+ comms.size() + ",collections=" + colls.size()
+ ",items=" + itemCount));
}
if (makeSitemapOrg) {
int files = sitemapsOrg.finish();
log.info(LogHelper.getHeader(c, "write_sitemap",
"type=html,num_files=" + files + ",communities="
+ comms.size() + ",collections=" + colls.size()
+ ",items=" + itemCount));
}
c.abort();
} }
} }

View File

@@ -1031,9 +1031,8 @@ public class SolrServiceImpl implements SearchService, IndexingService {
// Add information about our search fields // Add information about our search fields
for (String field : searchFields) { for (String field : searchFields) {
List<String> valuesAsString = new ArrayList<>(); List<String> valuesAsString = new ArrayList<>();
for (Object o : doc.getFieldValues(field)) { Optional.ofNullable(doc.getFieldValues(field))
valuesAsString.add(String.valueOf(o)); .ifPresent(l -> l.forEach(o -> valuesAsString.add(String.valueOf(o))));
}
resultDoc.addSearchField(field, valuesAsString.toArray(new String[valuesAsString.size()])); resultDoc.addSearchField(field, valuesAsString.toArray(new String[valuesAsString.size()]));
} }
result.addSearchDocument(indexableObject, resultDoc); result.addSearchDocument(indexableObject, resultDoc);

View File

@@ -236,8 +236,37 @@ public class SitemapRestControllerIT extends AbstractControllerIntegrationTest {
.andReturn(); .andReturn();
String response = result.getResponse().getContentAsString(); String response = result.getResponse().getContentAsString();
// contains a link to communities: [dspace.ui.url]/communities/<uuid>
assertTrue(response
.contains(configurationService.getProperty("dspace.ui.url") + "/communities/" + community.getID()));
// contains a link to collections: [dspace.ui.url]/collections/<uuid>
assertTrue(response
.contains(configurationService.getProperty("dspace.ui.url") + "/collections/" + collection.getID()));
// contains a link to items: [dspace.ui.url]/items/<uuid> // contains a link to items: [dspace.ui.url]/items/<uuid>
assertTrue(response.contains(configurationService.getProperty("dspace.ui.url") + "/items/" + item1.getID())); assertTrue(response.contains(configurationService.getProperty("dspace.ui.url") + "/items/" + item1.getID()));
assertTrue(response.contains(configurationService.getProperty("dspace.ui.url") + "/items/" + item2.getID())); assertTrue(response.contains(configurationService.getProperty("dspace.ui.url") + "/items/" + item2.getID()));
// contains proper link to entities items
assertTrue(response.contains(configurationService.getProperty("dspace.ui.url") + "/entities/publication/"
+ entityPublication.getID()));
assertFalse(response
.contains(configurationService.getProperty("dspace.ui.url") + "/items/" + entityPublication.getID()));
// does not contain links to restricted content
assertFalse(response.contains(
configurationService.getProperty("dspace.ui.url") + "/communities/" + communityRestricted.getID()));
assertFalse(response.contains(
configurationService.getProperty("dspace.ui.url") + "/collections/" + collectionRestricted.getID()));
assertFalse(response
.contains(configurationService.getProperty("dspace.ui.url") + "/items/" + itemRestricted.getID()));
assertFalse(response.contains(configurationService.getProperty("dspace.ui.url") + "/entities/publication/"
+ entityPublicationRestricted.getID()));
assertFalse(response.contains(
configurationService.getProperty("dspace.ui.url") + "/items/" + entityPublicationRestricted.getID()));
// does not contain links to undiscoverable content
assertFalse(response
.contains(configurationService.getProperty("dspace.ui.url") + "/items/" + itemUndiscoverable.getID()));
assertFalse(response.contains(configurationService.getProperty("dspace.ui.url") + "/entities/publication/"
+ entityPublicationUndiscoverable.getID()));
assertFalse(response.contains(configurationService.getProperty("dspace.ui.url") + "/items/"
+ entityPublicationUndiscoverable.getID()));
} }
} }