diff --git a/src/robots.txt b/src/robots.txt
new file mode 100644
index 0000000000..04cbb888e0
--- /dev/null
+++ b/src/robots.txt
@@ -0,0 +1,151 @@
+# The URL to the DSpace sitemaps
+# XML sitemap is listed first as it is preferred by most search engines
+Sitemap: /sitemap_index.xml
+Sitemap: /sitemap_index.html
+
+##########################
+# Default Access Group
+# (NOTE: blank lines are not allowable in a group record)
+##########################
+User-agent: *
+# Disable access to Discovery search and filters; admin pages; processes; submission; workspace; workflow & profile page
+Disallow: /search
+Disallow: /admin/*
+Disallow: /processes
+Disallow: /submit
+Disallow: /workspaceitems
+Disallow: /profile
+Disallow: /workflowitems
+
+# Optionally uncomment the following line ONLY if sitemaps are working
+# and you have verified that your site is being indexed correctly.
+# Disallow: /browse/*
+#
+# If you have configured DSpace (Solr-based) Statistics to be publicly
+# accessible, then you may not want this content to be indexed
+# Disallow: /statistics
+#
+# You also may wish to disallow access to the following paths, in order
+# to stop web spiders from accessing user-based content
+# Disallow: /contact
+# Disallow: /feedback
+# Disallow: /forgot
+# Disallow: /login
+# Disallow: /register
+
+
+##############################
+# Section for misbehaving bots
+# The following directives to block specific robots were borrowed from Wikipedia's robots.txt
+##############################
+
+# advertising-related bots:
+User-agent: Mediapartners-Google*
+Disallow: /
+
+# Crawlers that are kind enough to obey, but which we'd rather not have
+# unless they're feeding search engines.
+User-agent: UbiCrawler
+Disallow: /
+
+User-agent: DOC
+Disallow: /
+
+User-agent: Zao
+Disallow: /
+
+# Some bots are known to be trouble, particularly those designed to copy
+# entire sites. Please obey robots.txt.
+User-agent: sitecheck.internetseer.com
+Disallow: /
+
+User-agent: Zealbot
+Disallow: /
+
+User-agent: MSIECrawler
+Disallow: /
+
+User-agent: SiteSnagger
+Disallow: /
+
+User-agent: WebStripper
+Disallow: /
+
+User-agent: WebCopier
+Disallow: /
+
+User-agent: Fetch
+Disallow: /
+
+User-agent: Offline Explorer
+Disallow: /
+
+User-agent: Teleport
+Disallow: /
+
+User-agent: TeleportPro
+Disallow: /
+
+User-agent: WebZIP
+Disallow: /
+
+User-agent: linko
+Disallow: /
+
+User-agent: HTTrack
+Disallow: /
+
+User-agent: Microsoft.URL.Control
+Disallow: /
+
+User-agent: Xenu
+Disallow: /
+
+User-agent: larbin
+Disallow: /
+
+User-agent: libwww
+Disallow: /
+
+User-agent: ZyBORG
+Disallow: /
+
+User-agent: Download Ninja
+Disallow: /
+
+# Misbehaving: requests much too fast:
+User-agent: fast
+Disallow: /
+
+#
+# If your DSpace is going down because of someone using recursive wget,
+# you can activate the following rule.
+#
+# If your own faculty is bringing down your dspace with recursive wget,
+# you can advise them to use the --wait option to set the delay between hits.
+#
+#User-agent: wget
+#Disallow: /
+
+#
+# The 'grub' distributed client has been *very* poorly behaved.
+#
+User-agent: grub-client
+Disallow: /
+
+#
+# Doesn't follow robots.txt anyway, but...
+#
+User-agent: k2spider
+Disallow: /
+
+#
+# Hits many times per second, not acceptable
+# http://www.nameprotect.com/botinfo.html
+User-agent: NPBot
+Disallow: /
+
+# A capture bot, downloads gazillions of pages with no public benefit
+# http://www.webreaper.net/
+User-agent: WebReaper
+Disallow: /
diff --git a/webpack/webpack.common.ts b/webpack/webpack.common.ts
index 3f49cf648a..36e1f2e96b 100644
--- a/webpack/webpack.common.ts
+++ b/webpack/webpack.common.ts
@@ -21,6 +21,10 @@ export const copyWebpackOptions = [
   }, {
     from: path.join(__dirname, '..', 'src', 'assets', 'i18n'),
     to: path.join('assets', 'i18n')
+  }, {
+    from: path.join(__dirname, '..', 'src', 'robots.txt'),
+    to: path.join('robots.txt')
+  }
 ];
 
 export const commonExports = {
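
Note: the new `copyWebpackOptions` entry only declares the copy pattern; the diff does not show where that array is consumed. Below is a minimal sketch of how such a pattern list is typically handed to copy-webpack-plugin so that src/robots.txt lands at the root of the build output next to index.html. The plugin wiring, import style, and config shape are assumptions for illustration (copy-webpack-plugin v6+ with its `patterns` option), not code taken from this pull request.

// Sketch only: assumed wiring between a copy pattern list and copy-webpack-plugin.
// The robots.txt paths mirror the diff above; everything else is illustrative.
import * as path from 'path';
import CopyWebpackPlugin from 'copy-webpack-plugin';
import { Configuration } from 'webpack';

const copyWebpackOptions = [
  {
    // Copy src/robots.txt to the root of the webpack output directory
    from: path.join(__dirname, '..', 'src', 'robots.txt'),
    to: path.join('robots.txt')
  }
];

const config: Configuration = {
  plugins: [
    // copy-webpack-plugin v6+ expects the pattern list under `patterns`
    new CopyWebpackPlugin({ patterns: copyWebpackOptions })
  ]
};

export default config;

Assuming the output root is served statically, a production build should then emit robots.txt alongside the other bundled assets, so a crawler requesting /robots.txt receives the rules added in this change rather than falling through to the Angular router.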