Mirror of https://github.com/DSpace/DSpace.git (synced 2025-10-17 15:03:18 +00:00)

Merge pull request #764 from bram-atmire/DS-2335

DS-2335 Add more default blocks for certain spiders in robots.txt
@@ -1,4 +1,7 @@
User-agent: *
# Disable access to Discovery search and filters
Disallow: /discover
Disallow: /simple-search

# The FULL URL to your DSpace sitemaps
# The ${dspace.url} will be auto-filled with the value in dspace.cfg
@@ -6,14 +9,11 @@ User-agent: *
Sitemap: ${dspace.url}/sitemap
Sitemap: ${dspace.url}/htmlmap
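For illustration, if dspace.cfg contained a hypothetical dspace.url of https://demo.dspace.org, the deployed file would read:

Sitemap: https://demo.dspace.org/sitemap
Sitemap: https://demo.dspace.org/htmlmap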

# Disable access to Discovery search
Disallow: /simple-search

# Optionally uncomment the following line ONLY if sitemaps are working
# and you have verified that your site is being indexed correctly.
# Disallow: /browse
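Before uncommenting, a minimal check (hypothetical host shown) is to confirm that the sitemap URL actually responds:

curl -sI https://demo.dspace.org/sitemap

A 200 response only shows the sitemap is served; whether the site is being indexed correctly still has to be verified in each search engine's webmaster tools.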

# If you have configured DSpace (Solr-based) Statistics to be publicly
# accessible, then you may not want this content to be indexed
# Disallow: /statistics

@@ -24,3 +24,119 @@ Disallow: /simple-search
# Disallow: /forgot
# Disallow: /login
# Disallow: /register

##############
# Section with misbehaving bots
# The following directives to block specific robots were borrowed from Wikipedia's robots.txt
##############

# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /

# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /

User-agent: Zealbot
Disallow: /

User-agent: MSIECrawler
Disallow: /

User-agent: SiteSnagger
Disallow: /

User-agent: WebStripper
Disallow: /

User-agent: WebCopier
Disallow: /

User-agent: Fetch
Disallow: /

User-agent: Offline Explorer
Disallow: /

User-agent: Teleport
Disallow: /

User-agent: TeleportPro
Disallow: /

User-agent: WebZIP
Disallow: /

User-agent: linko
Disallow: /

User-agent: HTTrack
Disallow: /

User-agent: Microsoft.URL.Control
Disallow: /

User-agent: Xenu
Disallow: /

User-agent: larbin
Disallow: /

User-agent: libwww
Disallow: /

User-agent: ZyBORG
Disallow: /

User-agent: Download Ninja
Disallow: /

# Misbehaving: requests much too fast:
User-agent: fast
Disallow: /
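Blocking such bots outright is one option; as an alternative sketch, some crawlers (Bing and Yandex, for example, though not Google) honor the nonstandard Crawl-delay directive:

User-agent: SomeFastBot
Crawl-delay: 10

SomeFastBot is a placeholder name; the value is the number of seconds to pause between requests, for crawlers that support it.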

#
# If your DSpace is going down because of someone using recursive wget,
# you can activate the following rule.
#
# If your own faculty is bringing down your DSpace with recursive wget,
# you can advise them to use the --wait option to set the delay between hits.
#
#User-agent: wget
#Disallow: /
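As a sketch of the --wait advice above (hypothetical host), a polite recursive fetch looks like:

wget --recursive --wait=2 https://demo.dspace.org/

This pauses two seconds between retrievals; adding --limit-rate=200k would also cap bandwidth.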

#
# The 'grub' distributed client has been *very* poorly behaved.
#
User-agent: grub-client
Disallow: /

#
# Doesn't follow robots.txt anyway, but...
#
User-agent: k2spider
Disallow: /

#
# Hits many times per second, not acceptable
# http://www.nameprotect.com/botinfo.html
User-agent: NPBot
Disallow: /

# A capture bot, downloads gazillions of pages with no public benefit
# http://www.webreaper.net/
User-agent: WebReaper
Disallow: /

@@ -1,4 +1,7 @@
User-agent: *
# Disable access to Discovery search and filters
Disallow: /discover
Disallow: /search-filter

# The FULL URL to your DSpace sitemaps
# The ${dspace.url} will be auto-filled with the value in dspace.cfg
@@ -6,10 +9,6 @@ User-agent: *
Sitemap: ${dspace.url}/sitemap
Sitemap: ${dspace.url}/htmlmap

# Disable access to Discovery search and filters
Disallow: /discover
Disallow: /search-filter

# Optionally uncomment the following line ONLY if sitemaps are working
# and you have verified that your site is being indexed correctly.
# Disallow: /browse
@@ -25,3 +24,119 @@ Disallow: /search-filter
# Disallow: /forgot
# Disallow: /login
# Disallow: /register

##############
# Section with misbehaving bots
# The following directives to block specific robots were borrowed from Wikipedia's robots.txt
##############

# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /

# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /

User-agent: Zealbot
Disallow: /

User-agent: MSIECrawler
Disallow: /

User-agent: SiteSnagger
Disallow: /

User-agent: WebStripper
Disallow: /

User-agent: WebCopier
Disallow: /

User-agent: Fetch
Disallow: /

User-agent: Offline Explorer
Disallow: /

User-agent: Teleport
Disallow: /

User-agent: TeleportPro
Disallow: /

User-agent: WebZIP
Disallow: /

User-agent: linko
Disallow: /

User-agent: HTTrack
Disallow: /

User-agent: Microsoft.URL.Control
Disallow: /

User-agent: Xenu
Disallow: /

User-agent: larbin
Disallow: /

User-agent: libwww
Disallow: /

User-agent: ZyBORG
Disallow: /

User-agent: Download Ninja
Disallow: /

# Misbehaving: requests much too fast:
User-agent: fast
Disallow: /

#
# If your DSpace is going down because of someone using recursive wget,
# you can activate the following rule.
#
# If your own faculty is bringing down your DSpace with recursive wget,
# you can advise them to use the --wait option to set the delay between hits.
#
#User-agent: wget
#Disallow: /

#
# The 'grub' distributed client has been *very* poorly behaved.
#
User-agent: grub-client
Disallow: /

#
# Doesn't follow robots.txt anyway, but...
#
User-agent: k2spider
Disallow: /

#
# Hits many times per second, not acceptable
# http://www.nameprotect.com/botinfo.html
User-agent: NPBot
Disallow: /

# A capture bot, downloads gazillions of pages with no public benefit
# http://www.webreaper.net/
User-agent: WebReaper
Disallow: /