diff --git a/dspace-jspui/src/main/webapp/robots.txt b/dspace-jspui/src/main/webapp/robots.txt
index d2a94fecfd..4faadbe5ee 100644
--- a/dspace-jspui/src/main/webapp/robots.txt
+++ b/dspace-jspui/src/main/webapp/robots.txt
@@ -1,4 +1,7 @@
 User-agent: *
+# Disable access to Discovery search and filters
+Disallow: /discover
+Disallow: /simple-search
 
 # The FULL URL to your DSpace sitemaps
 # The ${dspace.url} will be auto-filled with the value in dspace.cfg
@@ -6,14 +9,11 @@ User-agent: *
 Sitemap: ${dspace.url}/sitemap
 Sitemap: ${dspace.url}/htmlmap
 
-# Disable access to Discovery search
-Disallow: /simple-search
-
 # Optionally uncomment the following line ONLY if sitemaps are working
 # and you have verified that your site is being indexed correctly.
 # Disallow: /browse
 
-# If you have configured DSpace (Solr-based) Statistics to be publicly 
+# If you have configured DSpace (Solr-based) Statistics to be publicly
 # accessible, then you may not want this content to be indexed
 # Disallow: /statistics
 
@@ -24,3 +24,119 @@ Disallow: /simple-search
 # Disallow: /forgot
 # Disallow: /login
 # Disallow: /register
+
+##############
+# Section with misbehaving bots
+# The following directives to block specific robots were borrowed from Wikipedia's robots.txt
+##############
+
+# advertising-related bots:
+User-agent: Mediapartners-Google*
+Disallow: /
+
+# Crawlers that are kind enough to obey, but which we'd rather not have
+# unless they're feeding search engines.
+User-agent: UbiCrawler
+Disallow: /
+
+User-agent: DOC
+Disallow: /
+
+User-agent: Zao
+Disallow: /
+
+# Some bots are known to be trouble, particularly those designed to copy
+# entire sites. Please obey robots.txt.
+User-agent: sitecheck.internetseer.com
+Disallow: /
+
+User-agent: Zealbot
+Disallow: /
+
+User-agent: MSIECrawler
+Disallow: /
+
+User-agent: SiteSnagger
+Disallow: /
+
+User-agent: WebStripper
+Disallow: /
+
+User-agent: WebCopier
+Disallow: /
+
+User-agent: Fetch
+Disallow: /
+
+User-agent: Offline Explorer
+Disallow: /
+
+User-agent: Teleport
+Disallow: /
+
+User-agent: TeleportPro
+Disallow: /
+
+User-agent: WebZIP
+Disallow: /
+
+User-agent: linko
+Disallow: /
+
+User-agent: HTTrack
+Disallow: /
+
+User-agent: Microsoft.URL.Control
+Disallow: /
+
+User-agent: Xenu
+Disallow: /
+
+User-agent: larbin
+Disallow: /
+
+User-agent: libwww
+Disallow: /
+
+User-agent: ZyBORG
+Disallow: /
+
+User-agent: Download Ninja
+Disallow: /
+
+# Misbehaving: requests much too fast:
+User-agent: fast
+Disallow: /
+
+#
+# If your DSpace is going down because of someone using recursive wget,
+# you can activate the following rule.
+#
+# If your own faculty is bringing down your dspace with recursive wget,
+# you can advise them to use the --wait option to set the delay between hits.
+#
+#User-agent: wget
+#Disallow: /
+
+#
+# The 'grub' distributed client has been *very* poorly behaved.
+#
+User-agent: grub-client
+Disallow: /
+
+#
+# Doesn't follow robots.txt anyway, but...
+#
+User-agent: k2spider
+Disallow: /
+
+#
+# Hits many times per second, not acceptable
+# http://www.nameprotect.com/botinfo.html
+User-agent: NPBot
+Disallow: /
+
+# A capture bot, downloads gazillions of pages with no public benefit
+# http://www.webreaper.net/
+User-agent: WebReaper
+Disallow: /
\ No newline at end of file
diff --git a/dspace-xmlui/src/main/webapp/static/robots.txt b/dspace-xmlui/src/main/webapp/static/robots.txt
index a80f051453..ffc18f2cf4 100644
--- a/dspace-xmlui/src/main/webapp/static/robots.txt
+++ b/dspace-xmlui/src/main/webapp/static/robots.txt
@@ -1,4 +1,7 @@
 User-agent: *
+# Disable access to Discovery search and filters
+Disallow: /discover
+Disallow: /search-filter
 
 # The FULL URL to your DSpace sitemaps
 # The ${dspace.url} will be auto-filled with the value in dspace.cfg
@@ -6,10 +9,6 @@ User-agent: *
 Sitemap: ${dspace.url}/sitemap
 Sitemap: ${dspace.url}/htmlmap
 
-# Disable access to Discovery search and filters
-Disallow: /discover
-Disallow: /search-filter
-
 # Optionally uncomment the following line ONLY if sitemaps are working
 # and you have verified that your site is being indexed correctly.
 # Disallow: /browse
@@ -25,3 +24,119 @@ Disallow: /search-filter
 # Disallow: /forgot
 # Disallow: /login
 # Disallow: /register
+
+##############
+# Section with misbehaving bots
+# The following directives to block specific robots were borrowed from Wikipedia's robots.txt
+##############
+
+# advertising-related bots:
+User-agent: Mediapartners-Google*
+Disallow: /
+
+# Crawlers that are kind enough to obey, but which we'd rather not have
+# unless they're feeding search engines.
+User-agent: UbiCrawler
+Disallow: /
+
+User-agent: DOC
+Disallow: /
+
+User-agent: Zao
+Disallow: /
+
+# Some bots are known to be trouble, particularly those designed to copy
+# entire sites. Please obey robots.txt.
+User-agent: sitecheck.internetseer.com
+Disallow: /
+
+User-agent: Zealbot
+Disallow: /
+
+User-agent: MSIECrawler
+Disallow: /
+
+User-agent: SiteSnagger
+Disallow: /
+
+User-agent: WebStripper
+Disallow: /
+
+User-agent: WebCopier
+Disallow: /
+
+User-agent: Fetch
+Disallow: /
+
+User-agent: Offline Explorer
+Disallow: /
+
+User-agent: Teleport
+Disallow: /
+
+User-agent: TeleportPro
+Disallow: /
+
+User-agent: WebZIP
+Disallow: /
+
+User-agent: linko
+Disallow: /
+
+User-agent: HTTrack
+Disallow: /
+
+User-agent: Microsoft.URL.Control
+Disallow: /
+
+User-agent: Xenu
+Disallow: /
+
+User-agent: larbin
+Disallow: /
+
+User-agent: libwww
+Disallow: /
+
+User-agent: ZyBORG
+Disallow: /
+
+User-agent: Download Ninja
+Disallow: /
+
+# Misbehaving: requests much too fast:
+User-agent: fast
+Disallow: /
+
+#
+# If your DSpace is going down because of someone using recursive wget,
+# you can activate the following rule.
+#
+# If your own faculty is bringing down your dspace with recursive wget,
+# you can advise them to use the --wait option to set the delay between hits.
+#
+#User-agent: wget
+#Disallow: /
+
+#
+# The 'grub' distributed client has been *very* poorly behaved.
+#
+User-agent: grub-client
+Disallow: /
+
+#
+# Doesn't follow robots.txt anyway, but...
+#
+User-agent: k2spider
+Disallow: /
+
+#
+# Hits many times per second, not acceptable
+# http://www.nameprotect.com/botinfo.html
+User-agent: NPBot
+Disallow: /
+
+# A capture bot, downloads gazillions of pages with no public benefit
+# http://www.webreaper.net/
+User-agent: WebReaper
+Disallow: /
\ No newline at end of file
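
The `Disallow` rules move directly under `User-agent: *` here, presumably because a blank line ends a record in the classic robots.txt format, so directives left below the Sitemap block can be dropped by strict parsers. A quick way to sanity-check the resulting rules is Python's standard-library `urllib.robotparser`; the sketch below parses a trimmed copy of the new rules, and the sample request paths (`/discover?query=test`, `/handle/123456789/1`) are illustrative only, not taken from this patch.

```python
from urllib.robotparser import RobotFileParser

# Trimmed-down copy of the rules this patch adds; the real files
# contain the full list of blocked user agents.
rules = """\
User-agent: *
Disallow: /discover
Disallow: /simple-search

User-agent: SiteSnagger
Disallow: /
"""

rp = RobotFileParser()
rp.parse(rules.splitlines())

# Discovery search is now off-limits to all compliant crawlers ...
assert not rp.can_fetch("*", "/discover?query=test")
assert not rp.can_fetch("Googlebot", "/simple-search")

# ... but ordinary item pages stay crawlable (hypothetical handle path).
assert rp.can_fetch("Googlebot", "/handle/123456789/1")

# Blocked site-copiers are denied everything.
assert not rp.can_fetch("SiteSnagger", "/handle/123456789/1")

print("robots.txt rules behave as expected")
```

Note that `urllib.robotparser` implements one common interpretation of robots.txt, so real crawlers may differ in edge cases. And as the comments in the patch itself concede ("Doesn't follow robots.txt anyway, but..."), the blocklist is a courtesy fence for well-behaved bots, not an enforcement mechanism.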