Mirror of https://github.com/DSpace/dspace-angular.git (synced 2025-10-07 01:54:15 +00:00)
add robots.txt
src/robots.txt (new normal file, 151 lines added)
@@ -0,0 +1,151 @@
# The URL to the DSpace sitemaps
# XML sitemap is listed first as it is preferred by most search engines
Sitemap: /sitemap_index.xml
Sitemap: /sitemap_index.html

##########################
# Default Access Group
# (NOTE: blank lines are not allowable in a group record)
##########################
User-agent: *
# Disable access to Discovery search and filters; admin pages; processes; submission; workspace; workflow & profile page
Disallow: /search
Disallow: /admin/*
Disallow: /processes
Disallow: /submit
Disallow: /workspaceitems
Disallow: /profile
Disallow: /workflowitems

# Optionally uncomment the following line ONLY if sitemaps are working
# and you have verified that your site is being indexed correctly.
# Disallow: /browse/*
#
# If you have configured DSpace (Solr-based) Statistics to be publicly
# accessible, then you may not want this content to be indexed
# Disallow: /statistics
#
# You also may wish to disallow access to the following paths, in order
# to stop web spiders from accessing user-based content
# Disallow: /contact
# Disallow: /feedback
# Disallow: /forgot
# Disallow: /login
# Disallow: /register


##############################
# Section for misbehaving bots
# The following directives to block specific robots were borrowed from Wikipedia's robots.txt
##############################

# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /

# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /

User-agent: Zealbot
Disallow: /

User-agent: MSIECrawler
Disallow: /

User-agent: SiteSnagger
Disallow: /

User-agent: WebStripper
Disallow: /

User-agent: WebCopier
Disallow: /

User-agent: Fetch
Disallow: /

User-agent: Offline Explorer
Disallow: /

User-agent: Teleport
Disallow: /

User-agent: TeleportPro
Disallow: /

User-agent: WebZIP
Disallow: /

User-agent: linko
Disallow: /

User-agent: HTTrack
Disallow: /

User-agent: Microsoft.URL.Control
Disallow: /

User-agent: Xenu
Disallow: /

User-agent: larbin
Disallow: /

User-agent: libwww
Disallow: /

User-agent: ZyBORG
Disallow: /

User-agent: Download Ninja
Disallow: /

# Misbehaving: requests much too fast:
User-agent: fast
Disallow: /

#
# If your DSpace is going down because of someone using recursive wget,
# you can activate the following rule.
#
# If your own faculty is bringing down your dspace with recursive wget,
# you can advise them to use the --wait option to set the delay between hits.
#
#User-agent: wget
#Disallow: /

#
# The 'grub' distributed client has been *very* poorly behaved.
#
User-agent: grub-client
Disallow: /

#
# Doesn't follow robots.txt anyway, but...
#
User-agent: k2spider
Disallow: /

#
# Hits many times per second, not acceptable
# http://www.nameprotect.com/botinfo.html
User-agent: NPBot
Disallow: /

# A capture bot, downloads gazillions of pages with no public benefit
# http://www.webreaper.net/
User-agent: WebReaper
Disallow: /
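A rough TypeScript sketch of how the rules above behave (an illustration only, not part of this commit; isDisallowed is a hypothetical helper, and the matcher is deliberately simplified: it merges all matching User-agent groups and ignores Allow and Crawl-delay directives):

// Rough robots.txt matcher (illustration only, not part of the commit).
function isDisallowed(robotsTxt: string, userAgent: string, path: string): boolean {
  const rules: string[] = [];
  let groupApplies = false;
  for (const raw of robotsTxt.split('\n')) {
    const line = raw.split('#')[0].trim(); // drop comments and surrounding whitespace
    if (!line) { continue; }
    const colon = line.indexOf(':');
    if (colon < 0) { continue; }
    const field = line.slice(0, colon).trim().toLowerCase();
    const value = line.slice(colon + 1).trim();
    if (field === 'user-agent') {
      // 'Mediapartners-Google*' style names are treated as a prefix match.
      const name = value.replace(/\*$/, '').toLowerCase();
      groupApplies = value === '*' || userAgent.toLowerCase().startsWith(name);
    } else if (field === 'disallow' && groupApplies && value) {
      rules.push(value);
    }
  }
  // Disallow values are prefix matches; '*' inside a rule matches any sequence.
  return rules.some((rule) => {
    const escaped = rule.split('*')
      .map((part) => part.replace(/[.+?^${}()|[\]\\]/g, '\\$&'))
      .join('.*');
    return new RegExp('^' + escaped).test(path);
  });
}

// Expected against the file above:
// isDisallowed(robots, 'Googlebot', '/search')    -> true  (blocked for all agents)
// isDisallowed(robots, 'Googlebot', '/items/123') -> false
// isDisallowed(robots, 'HTTrack', '/items/123')   -> true  (HTTrack is blocked site-wide)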
@@ -21,6 +21,10 @@ export const copyWebpackOptions = [
  }, {
    from: path.join(__dirname, '..', 'src', 'assets', 'i18n'),
    to: path.join('assets', 'i18n')
  }, {
    from: path.join(__dirname, '..', 'src', 'robots.txt'),
    to: path.join('robots.txt')
  }
];

export const commonExports = {
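For context, these copy patterns are presumably handed to webpack's copy-webpack-plugin elsewhere in the build config, so that src/robots.txt lands in the build output and can be served at /robots.txt. A minimal sketch of that wiring, under the assumption that the plugin is used and that the module path './webpack.common' exists (neither is shown in this diff):

// Hypothetical wiring (assumption; not part of this diff).
import CopyWebpackPlugin from 'copy-webpack-plugin';
import { copyWebpackOptions } from './webpack.common'; // hypothetical module path

export const plugins = [
  // copy-webpack-plugin v6+ takes { patterns: [...] }; older versions took
  // the pattern array directly.
  new CopyWebpackPlugin({ patterns: copyWebpackOptions }),
];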