Merge pull request #764 from bram-atmire/DS-2335

DS-2335 Add more default blocks for certain spiders in robots.txt
Committed by Tim Donohue on 2014-12-17 15:25:49 -06:00
2 changed files with 239 additions and 8 deletions

View File

@@ -1,4 +1,7 @@
User-agent: *
# Disable access to Discovery search and filters
Disallow: /discover
Disallow: /simple-search
# The FULL URL to your DSpace sitemaps
# The ${dspace.url} will be auto-filled with the value in dspace.cfg
@@ -6,14 +9,11 @@ User-agent: *
Sitemap: ${dspace.url}/sitemap
Sitemap: ${dspace.url}/htmlmap
# Disable access to Discovery search
Disallow: /simple-search
# Optionally uncomment the following line ONLY if sitemaps are working
# and you have verified that your site is being indexed correctly.
# Disallow: /browse
# If you have configured DSpace (Solr-based) Statistics to be publicly
# accessible, then you may not want this content to be indexed
# Disallow: /statistics
@@ -24,3 +24,119 @@ Disallow: /simple-search
# Disallow: /forgot
# Disallow: /login
# Disallow: /register
##############
# Section with misbehaving bots
# The following directives to block specific robots were borrowed from Wikipedia's robots.txt
##############
# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /
# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /
User-agent: DOC
Disallow: /
User-agent: Zao
Disallow: /
# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /
User-agent: Zealbot
Disallow: /
User-agent: MSIECrawler
Disallow: /
User-agent: SiteSnagger
Disallow: /
User-agent: WebStripper
Disallow: /
User-agent: WebCopier
Disallow: /
User-agent: Fetch
Disallow: /
User-agent: Offline Explorer
Disallow: /
User-agent: Teleport
Disallow: /
User-agent: TeleportPro
Disallow: /
User-agent: WebZIP
Disallow: /
User-agent: linko
Disallow: /
User-agent: HTTrack
Disallow: /
User-agent: Microsoft.URL.Control
Disallow: /
User-agent: Xenu
Disallow: /
User-agent: larbin
Disallow: /
User-agent: libwww
Disallow: /
User-agent: ZyBORG
Disallow: /
User-agent: Download Ninja
Disallow: /
# Misbehaving: requests much too fast:
User-agent: fast
Disallow: /
#
# If your DSpace is going down because of someone using recursive wget,
# you can activate the following rule.
#
# If your own faculty is bringing down your DSpace with recursive wget,
# you can advise them to use the --wait option to set the delay between hits.
#
#User-agent: wget
#Disallow: /
#
# The 'grub' distributed client has been *very* poorly behaved.
#
User-agent: grub-client
Disallow: /
#
# Doesn't follow robots.txt anyway, but...
#
User-agent: k2spider
Disallow: /
#
# Hits many times per second, not acceptable
# http://www.nameprotect.com/botinfo.html
User-agent: NPBot
Disallow: /
# A capture bot, downloads gazillions of pages with no public benefit
# http://www.webreaper.net/
User-agent: WebReaper
Disallow: /

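The hunks above keep the existing Sitemap lines, whose ${dspace.url} placeholder is filled in from dspace.cfg when DSpace is configured. As a rough illustration only (not the actual DSpace substitution code), the following Python sketch mimics that rendering for a hypothetical base URL:

# Illustration only: mimics how the ${dspace.url} placeholder in the Sitemap
# lines would read once a real value from dspace.cfg is filled in. DSpace does
# this substitution itself at configuration time; the base URL here is hypothetical.
template = "Sitemap: ${dspace.url}/sitemap\nSitemap: ${dspace.url}/htmlmap"
dspace_url = "https://demo.dspace.example/xmlui"  # hypothetical dspace.url value

print(template.replace("${dspace.url}", dspace_url))
# Sitemap: https://demo.dspace.example/xmlui/sitemap
# Sitemap: https://demo.dspace.example/xmlui/htmlmap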
View File

@@ -1,4 +1,7 @@
User-agent: *
# Disable access to Discovery search and filters
Disallow: /discover
Disallow: /search-filter
# The FULL URL to your DSpace sitemaps
# The ${dspace.url} will be auto-filled with the value in dspace.cfg
@@ -6,10 +9,6 @@ User-agent: *
Sitemap: ${dspace.url}/sitemap
Sitemap: ${dspace.url}/htmlmap
# Disable access to Discovery search and filters
Disallow: /discover
Disallow: /search-filter
# Optionally uncomment the following line ONLY if sitemaps are working
# and you have verified that your site is being indexed correctly.
# Disallow: /browse
@@ -25,3 +24,119 @@ Disallow: /search-filter
# Disallow: /forgot
# Disallow: /login
# Disallow: /register
##############
# Section with misbehaving bots
# The following directives to block specific robots were borrowed from Wikipedia's robots.txt
##############
# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /
# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /
User-agent: DOC
Disallow: /
User-agent: Zao
Disallow: /
# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /
User-agent: Zealbot
Disallow: /
User-agent: MSIECrawler
Disallow: /
User-agent: SiteSnagger
Disallow: /
User-agent: WebStripper
Disallow: /
User-agent: WebCopier
Disallow: /
User-agent: Fetch
Disallow: /
User-agent: Offline Explorer
Disallow: /
User-agent: Teleport
Disallow: /
User-agent: TeleportPro
Disallow: /
User-agent: WebZIP
Disallow: /
User-agent: linko
Disallow: /
User-agent: HTTrack
Disallow: /
User-agent: Microsoft.URL.Control
Disallow: /
User-agent: Xenu
Disallow: /
User-agent: larbin
Disallow: /
User-agent: libwww
Disallow: /
User-agent: ZyBORG
Disallow: /
User-agent: Download Ninja
Disallow: /
# Misbehaving: requests much too fast:
User-agent: fast
Disallow: /
#
# If your DSpace is going down because of someone using recursive wget,
# you can activate the following rule.
#
# If your own faculty is bringing down your DSpace with recursive wget,
# you can advise them to use the --wait option to set the delay between hits.
#
#User-agent: wget
#Disallow: /
#
# The 'grub' distributed client has been *very* poorly behaved.
#
User-agent: grub-client
Disallow: /
#
# Doesn't follow robots.txt anyway, but...
#
User-agent: k2spider
Disallow: /
#
# Hits many times per second, not acceptable
# http://www.nameprotect.com/botinfo.html
User-agent: NPBot
Disallow: /
# A capture bot, downloads gazillions of pages with no public benefit
# http://www.webreaper.net/
User-agent: WebReaper
Disallow: /
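Since the same bot-blocking section is added to both files, one quick way to sanity-check the result is Python's standard-library robots.txt parser. This is a minimal sketch, not part of the PR: it assumes a locally saved copy of the finished robots.txt (with ${dspace.url} already substituted) and uses a hypothetical DSpace host name.

# Verification sketch (assumption: robots.txt is the deployed, substituted file).
from urllib.robotparser import RobotFileParser

parser = RobotFileParser()
with open("robots.txt") as f:
    parser.parse(f.read().splitlines())

base = "https://demo.dspace.example"  # hypothetical DSpace base URL

# Site-copiers named in the new section should be blocked everywhere:
print(parser.can_fetch("HTTrack", base + "/handle/123456789/2"))    # False
print(parser.can_fetch("WebReaper", base + "/"))                    # False

# Ordinary crawlers are only kept out of the Discovery search pages:
print(parser.can_fetch("Googlebot", base + "/discover"))            # False
print(parser.can_fetch("Googlebot", base + "/handle/123456789/2"))  # True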