diff --git a/config/config.example.yml b/config/config.example.yml index 27155f74c1..52b06b1b08 100644 --- a/config/config.example.yml +++ b/config/config.example.yml @@ -43,17 +43,37 @@ cache: maxBufferSize: 100 timePerMethod: PATCH: 3 # time in seconds - # In-memory cache of server-side rendered pages. This cache stores the most recently accessed public pages. - # Pages are automatically added/dropped from this cache based on how recently they have been used. + # In-memory cache(s) of server-side rendered pages. These caches will store the most recently accessed public pages. + # Pages are automatically added/dropped from these caches based on how recently they have been used. + # Restarting the app clears all page caches. + # NOTE: To control the cache size, use the "max" setting. Keep in mind, individual cached pages are usually small (<100KB). + # Enabling *both* caches will mean that a page may be cached twice, once in each cache (but may expire at different times via timeToLive). serverSide: - # Maximum number of pages to cache. Set to zero (0) to disable server side caching. Default is 100, which means - # the 100 most recently accessed public pages will be cached. As all pages are cached in server memory, - # increasing this value will increase memory needs. Individual cached pages are usually small (<100KB), - # so max=100 should only require a maximum of 9-10MB of memory. Restarting the app clears this page cache. - max: 100 - # Amount of time after which cached pages are considered stale (in ms). After becoming stale, the cached - # copy is automatically refreshed on the next request. - timeToLive: 900000 # 15 minutes + # When enabled (i.e. max > 0), known bots will be sent pages from a server side cache specific for bots. + # (Keep in mind, bot detection cannot be guaranteed. It is possible some bots will bypass this cache.) + botCache: + # Maximum number of pages to cache for known bots. Set to zero (0) to disable server side caching for bots. 
+ # Default is 1000, which means the 1000 most recently accessed public pages will be cached. + # As all pages are cached in server memory, increasing this value will increase memory needs. + # Individual cached pages are usually small (<100KB), so max=1000 should only require ~100MB of memory. + max: 1000 + # Amount of time after which cached pages are considered stale (in ms). After becoming stale, the cached + # copy is automatically refreshed on the next request. + # NOTE: For the bot cache, this setting may impact how quickly search engine bots will index new content on your site. + # For example, setting this to one week may mean that search engine bots may not find all new content for one week. + timeToLive: 86400000 # 1 day + # When enabled (i.e. max > 0), all anonymous users will be sent pages from a server side cache. + # This allows anonymous users to interact more quickly with the site, but also means they may see slightly + # outdated content (based on timeToLive) + anonymousCache: + # Maximum number of pages to cache. Default is zero (0) which means anonymous user cache is disabled. + # As all pages are cached in server memory, increasing this value will increase memory needs. + # Individual cached pages are usually small (<100KB), so a value of max=1000 would only require ~100MB of memory. + max: 0 + # Amount of time after which cached pages are considered stale (in ms). After becoming stale, the cached + # copy is automatically refreshed on the next request. + # NOTE: For the anonymous cache, it is recommended to keep this value low to avoid anonymous users seeing outdated content. 
+ timeToLive: 10000 # 10 seconds # Authentication settings auth: diff --git a/package.json b/package.json index 945bb1f158..184f1a5647 100644 --- a/package.json +++ b/package.json @@ -99,6 +99,7 @@ "fast-json-patch": "^3.0.0-1", "filesize": "^6.1.0", "http-proxy-middleware": "^1.0.5", + "isbot": "^3.6.5", "js-cookie": "2.2.1", "js-yaml": "^4.1.0", "json5": "^2.2.2", diff --git a/server.ts b/server.ts index 04a0e36670..8c9835cf16 100644 --- a/server.ts +++ b/server.ts @@ -29,6 +29,7 @@ import * as expressStaticGzip from 'express-static-gzip'; import axios from 'axios'; import LRU from 'lru-cache'; +import isbot from 'isbot'; import { createCertificate } from 'pem'; import { createServer } from 'https'; import { json } from 'body-parser'; @@ -70,8 +71,11 @@ const cookieParser = require('cookie-parser'); const appConfig: AppConfig = buildAppConfig(join(DIST_FOLDER, 'assets/config.json')); -// cache of SSR pages, only enabled in production mode -let cache: LRU; +// cache of SSR pages for known bots, only enabled in production mode +let botCache: LRU; + +// cache of SSR pages for anonymous users. Disabled by default, and only available in production mode +let anonymousCache: LRU; // extend environment with app config for server extendEnvironmentWithAppConfig(environment, appConfig); @@ -257,10 +261,10 @@ function serverSideRender(req, res, sendToUser: boolean = true) { providers: [{ provide: APP_BASE_HREF, useValue: req.baseUrl }] }, (err, data) => { if (hasNoValue(err) && hasValue(data)) { - res.locals.ssr = true; // mark response as SSR (enables text compression) - // save server side rendered data to cache + // save server side rendered page to cache (if any are enabled) saveToCache(req, data); if (sendToUser) { + res.locals.ssr = true; // mark response as SSR (enables text compression) // send rendered page to user res.send(data); } @@ -313,24 +317,45 @@ function addCacheControl(req, res, next) { * Initialize server-side caching of pages rendered via SSR. 
*/ function initCache() { - if (cacheEnabled()) { - // Initialize a new "least-recently-used" item cache (where least recently used items are removed first) + if (botCacheEnabled()) { + // Initialize a new "least-recently-used" item cache (where least recently used pages are removed first) // See https://www.npmjs.com/package/lru-cache - cache = new LRU( { - max: environment.cache.serverSide.max || 100, // 100 items in cache maximum - ttl: environment.cache.serverSide.timeToLive || 15 * 60 * 1000, // 15 minute cache + // When enabled, each page defaults to expiring after 1 day + botCache = new LRU( { + max: environment.cache.serverSide.botCache.max, + ttl: environment.cache.serverSide.botCache.timeToLive || 24 * 60 * 60 * 1000, // 1 day + allowStale: true // If object is found to be stale, return stale value before deleting + }); + } + + if (anonymousCacheEnabled()) { + // NOTE: While caches may share SSR pages, this cache must be kept separately because the timeToLive + // may expire pages more frequently. + // When enabled, each page defaults to expiring after 10 seconds (to minimize anonymous users seeing out-of-date content) + anonymousCache = new LRU( { + max: environment.cache.serverSide.anonymousCache.max, + ttl: environment.cache.serverSide.anonymousCache.timeToLive || 10 * 1000, // 10 seconds allowStale: true // If object is found to be stale, return stale value before deleting }); } } /** - * Return whether server side caching is enabled in configuration. + * Return whether bot-specific server side caching is enabled in configuration. 
*/ -function cacheEnabled(): boolean { - // Caching is only enabled is SSR is enabled AND - // "serverSide.max" setting is greater than zero - return environment.universal.preboot && environment.cache.serverSide.max && (environment.cache.serverSide.max > 0); +function botCacheEnabled(): boolean { + // Caching is only enabled if SSR is enabled AND + // "max" pages to cache is greater than zero + return environment.universal.preboot && environment.cache.serverSide.botCache.max && (environment.cache.serverSide.botCache.max > 0); +} + +/** + * Return whether anonymous user server side caching is enabled in configuration. + */ +function anonymousCacheEnabled(): boolean { + // Caching is only enabled if SSR is enabled AND + // "max" pages to cache is greater than zero + return environment.universal.preboot && environment.cache.serverSide.anonymousCache.max && (environment.cache.serverSide.anonymousCache.max > 0); } /** @@ -338,43 +363,64 @@ function cacheEnabled(): boolean { * Caching is ONLY done for SSR requests. Pages are cached base on their path (e.g. /home or /search?query=test) */ function cacheCheck(req, res, next) { - let cacheHit = false; - let debug = false; // Enable to see cache hits & re-rendering logs + // Cached copy of page (if found) + let cachedCopy; - // Only check cache if cache enabled & NOT authenticated. - // NOTE: Authenticated users cannot use the SSR cache. Cached pages only show data available to anonymous users. - // Only public pages can currently be cached, as the cached data is not user-specific. - if (cacheEnabled() && !isUserAuthenticated(req)) { - const key = getCacheKey(req); + // If the bot cache is enabled and this request looks like a bot, check the bot cache for a cached page. 
+ if (botCacheEnabled() && isbot(req.get('user-agent'))) { + cachedCopy = checkCacheForRequest('bot', botCache, req, res); + } else if (anonymousCacheEnabled() && !isUserAuthenticated(req)) { + cachedCopy = checkCacheForRequest('anonymous', anonymousCache, req, res); + } - // Check if this page is in our cache - let cachedCopy = cache.get(key); - if (cachedCopy) { - cacheHit = true; - res.locals.ssr = true; // mark response as SSR (enables text compression) - if (debug) { console.log(`CACHE HIT FOR ${key}`); } - // return page from cache to user - res.send(cachedCopy); + // If cached copy exists, return it to the user. + if (cachedCopy) { + res.locals.ssr = true; // mark response as SSR-generated (enables text compression) + res.send(cachedCopy); - // Check if cached copy is expired (in this sitution key will now be gone from cache) - if (!cache.has(key)) { - if (debug) { console.log(`CACHE EXPIRED FOR ${key} Re-rendering...`); } - // Update cached copy by rerendering server-side - // NOTE: Cached copy was already returned to user above. So, this re-render is just to prepare for next user. - serverSideRender(req, res, false); - } + // Tell Express to skip all other handlers for this path + // This ensures we don't try to re-render the page since we've already returned the cached copy + next('router'); + } else { + // If nothing found in cache, just continue with next handler + // (This should send the request on to the handler that rerenders the page via SSR) + next(); + } +} - // Tell Express to skip all other handlers for this path - // This ensures we don't try to re-render the page since we've already returned the cached copy - next('router'); +/** + * Checks if the current request (i.e. page) is found in the given cache. If it is found, + * the cached copy is returned. When found, this method also triggers a re-render via + * SSR if the cached copy is now expired (i.e. timeToLive has passed for this cached copy). 
+ * @param cacheName name of cache (just useful for debug logging) + * @param cache LRU cache to check + * @param req current request to look for in the cache + * @param res current response + * @returns cached copy (if found) or undefined (if not found) + */ +function checkCacheForRequest(cacheName: string, cache: LRU, req, res): any { + let debug = false; // Enable to see cache hits & re-rendering in logs + + // Get the cache key for this request + const key = getCacheKey(req); + + // Check if this page is in our cache + let cachedCopy = cache.get(key); + if (cachedCopy) { + if (debug) { console.log(`CACHE HIT FOR ${key} in ${cacheName} cache`); } + + // Check if cached copy is expired (If expired, the key will now be gone from cache) + if (!cache.has(key)) { + if (debug) { console.log(`CACHE EXPIRED FOR ${key} in ${cacheName} cache. Re-rendering...`); } + // Update cached copy by rerendering server-side + // NOTE: In this scenario the currently cached copy will be returned to the current user. + // This re-render is performed behind the scenes to update cached copy for next user. + serverSideRender(req, res, false); } } - // If nothing found in cache, just continue with next handler - // (This should send the request on to the handler that rerenders the page via SSR) - if (!cacheHit) { - next(); - } + // return page from cache + return cachedCopy; } /** @@ -390,20 +436,30 @@ function getCacheKey(req): string { } /** - * Save data to server side cache, if enabled. If caching is not enabled or user is authenticated, this is a noop + * Save page to server side cache(s), if enabled. If caching is not enabled or a user is authenticated, this is a noop + * If multiple caches are enabled, the page will be saved to any caches where it does not yet exist (or is expired). + * (This minimizes the number of times we need to run SSR on the same page.) 
* @param req current page request - * @param data page data to save to cache + * @param page page data to save to cache */ -function saveToCache(req, data: any) { - // Only cache if caching is enabled and no one is currently authenticated. This means ONLY public pages can be cached. +function saveToCache(req, page: any) { + // Only cache if no one is currently authenticated. This means ONLY public pages can be cached. // NOTE: It's not safe to save page data to the cache when a user is authenticated. In that situation, // the page may include sensitive or user-specific materials. As the cache is shared across all users, it can only contain public info. - if (cacheEnabled() && !isUserAuthenticated(req)) { + if (!isUserAuthenticated(req)) { const key = getCacheKey(req); - // Make sure this key is not already in our cache. If "has()" returns true, - // then it's in the cache already and *not* expired. - if (!cache.has(key)) { - cache.set(key, data); + // Avoid caching "/reload/[random]" paths (these are hard refreshes after logout) + if (key.startsWith('/reload')) { return; } + + // If bot cache is enabled, save it to that cache if it doesn't exist or is expired + // (NOTE: has() will return false if page is expired in cache) + if (botCacheEnabled() && !botCache.has(key)) { + botCache.set(key, page); + } + + // If anonymous cache is enabled, save it to that cache if it doesn't exist or is expired + if (anonymousCacheEnabled() && !anonymousCache.has(key)) { + anonymousCache.set(key, page); } } } @@ -412,7 +468,7 @@ function saveToCache(req, data: any) { * Whether a user is authenticated or not */ function isUserAuthenticated(req): boolean { - // Check whether our authentication Cookie exists or not + // Check whether our DSpace authentication Cookie exists or not return req.cookies[TOKENITEM]; } diff --git a/src/config/cache-config.interface.ts b/src/config/cache-config.interface.ts index d0dfc677d9..1826bd0d30 100644 --- a/src/config/cache-config.interface.ts +++ 
b/src/config/cache-config.interface.ts @@ -8,11 +8,22 @@ export interface CacheConfig extends Config { // Cache-Control HTTP Header control: string; autoSync: AutoSyncConfig; - // In-memory cache of server-side rendered content + // In-memory caches of server-side rendered (SSR) content. These caches can be used to limit the frequency + // of re-generating SSR pages to improve performance. serverSide: { - // Maximum number of pages (rendered via SSR) to cache. - max: number; - // Amount of time after which cached pages are considered stale (in ms) - timeToLive: number; + // Cache specific to known bots. Allows you to serve cached contents to bots only. + botCache: { + // Maximum number of pages (rendered via SSR) to cache. Setting max=0 disables the cache. + max: number; + // Amount of time after which cached pages are considered stale (in ms) + timeToLive: number; + }, + // Cache specific to anonymous users. Allows you to serve cached content to non-authenticated users. + anonymousCache: { + // Maximum number of pages (rendered via SSR) to cache. Setting max=0 disables the cache. + max: number; + // Amount of time after which cached pages are considered stale (in ms) + timeToLive: number; + } } } diff --git a/src/config/default-app-config.ts b/src/config/default-app-config.ts index a1ac29e8de..9e5b535872 100644 --- a/src/config/default-app-config.ts +++ b/src/config/default-app-config.ts @@ -76,10 +76,22 @@ export class DefaultAppConfig implements AppConfig { }, // In-memory cache of server-side rendered content serverSide: { - // Maximum number of pages (rendered via SSR) to cache. Set to zero to disable server side caching. - max: 100, - // Amount of time after which cached pages are considered stale (in ms) - timeToLive: 15 * 60 * 1000 // 15 minutes + // Cache specific to known bots. Allows you to serve cached contents to bots only. + // Defaults to caching 1,000 pages. 
Each page expires after 1 day + botCache: { + // Maximum number of pages (rendered via SSR) to cache. Setting max=0 disables the cache. + max: 1000, + // Amount of time after which cached pages are considered stale (in ms) + timeToLive: 24 * 60 * 60 * 1000, // 1 day + }, + // Cache specific to anonymous users. Allows you to serve cached content to non-authenticated users. + // Defaults to caching 0 pages. But, when enabled, each page expires after 10 seconds (to minimize anonymous users seeing out-of-date content) + anonymousCache: { + // Maximum number of pages (rendered via SSR) to cache. Setting max=0 disables the cache. + max: 0, // disabled by default + // Amount of time after which cached pages are considered stale (in ms) + timeToLive: 10 * 1000, // 10 seconds + } } }; diff --git a/yarn.lock b/yarn.lock index 96693299dc..3843ac4dab 100644 --- a/yarn.lock +++ b/yarn.lock @@ -6749,6 +6749,11 @@ isbinaryfile@^4.0.8: resolved "https://registry.yarnpkg.com/isbinaryfile/-/isbinaryfile-4.0.10.tgz#0c5b5e30c2557a2f06febd37b7322946aaee42b3" integrity sha512-iHrqe5shvBUcFbmZq9zOQHBoeOhZJu6RQGrDpBgenUm/Am+F3JM2MgQj+rK3Z601fzrL5gLZWtAPH2OBaSVcyw== +isbot@^3.6.5: + version "3.6.5" + resolved "https://registry.yarnpkg.com/isbot/-/isbot-3.6.5.tgz#a749980d9dfba9ebcc03ee7b548d1f24dd8c9f1e" + integrity sha512-BchONELXt6yMad++BwGpa0oQxo/uD0keL7N15cYVf0A1oMIoNQ79OqeYdPMFWDrNhCqCbRuw9Y9F3QBjvAxZ5g== + isexe@^2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/isexe/-/isexe-2.0.0.tgz#e8fbf374dc556ff8947a10dcb0572d633f2cfa10"