# The URL to the DSpace sitemaps
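# (<%= origin %> below is a template placeholder, replaced with this site's base URL when the file is served)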
# XML sitemap is listed first as it is preferred by most search engines
Sitemap: <%= origin %>/sitemap_index.xml
Sitemap: <%= origin %>/sitemap_index.html

##########################
# Default Access Group
# (NOTE: blank lines are not allowed in a group record)
##########################
User-agent: *
# Disallow access to Discovery search and filters; admin pages; processes; submission; workspace; workflow & profile pages
Disallow: /search
Disallow: /admin/*
Disallow: /processes
Disallow: /submit
Disallow: /workspaceitems
Disallow: /profile
Disallow: /workflowitems
# Crawlers should be able to access entity pages, but not the facet search links present on entity pages
Disallow: /entities/*?f
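# (for example: /entities/publication/<uuid>?f.author=... is blocked, while /entities/publication/<uuid> itself remains crawlable)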

# Optionally uncomment the following line ONLY if sitemaps are working
# and you have verified that your site is being indexed correctly.
# Disallow: /browse/*
#
# If you have configured DSpace (Solr-based) Statistics to be publicly
# accessible, then you may not want this content to be indexed
# Disallow: /statistics
#
# You also may wish to disallow access to the following paths, in order
# to stop web spiders from accessing user-based content
# Disallow: /contact
# Disallow: /feedback
# Disallow: /forgot
# Disallow: /login
# Disallow: /register

##############################
# Section for misbehaving bots
# The following directives to block specific robots were borrowed from Wikipedia's robots.txt
##############################

# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /

# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /

User-agent: Zealbot
Disallow: /

User-agent: MSIECrawler
Disallow: /

User-agent: SiteSnagger
Disallow: /

User-agent: WebStripper
Disallow: /

User-agent: WebCopier
Disallow: /

User-agent: Fetch
Disallow: /

User-agent: Offline Explorer
Disallow: /

User-agent: Teleport
Disallow: /

User-agent: TeleportPro
Disallow: /

User-agent: WebZIP
Disallow: /

User-agent: linko
Disallow: /

User-agent: HTTrack
Disallow: /

User-agent: Microsoft.URL.Control
Disallow: /

User-agent: Xenu
Disallow: /

User-agent: larbin
Disallow: /

User-agent: libwww
Disallow: /

User-agent: ZyBORG
Disallow: /

User-agent: Download Ninja
Disallow: /

# Misbehaving: requests much too fast:
User-agent: fast
Disallow: /

#
# If your DSpace is going down because of someone using recursive wget,
# you can activate the following rule.
#
# If your own faculty is bringing down your DSpace with recursive wget,
# you can advise them to use the --wait option to set the delay between hits.
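# (e.g. wget --recursive --wait=2 <%= origin %>/ pauses two seconds between requests)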
#
# User-agent: wget
# Disallow: /

#
# The 'grub' distributed client has been *very* poorly behaved.
#
User-agent: grub-client
Disallow: /

#
# Doesn't follow robots.txt anyway, but...
#
User-agent: k2spider
Disallow: /

#
# Hits many times per second, not acceptable
# http://www.nameprotect.com/botinfo.html
User-agent: NPBot
Disallow: /

# A capture bot, downloads gazillions of pages with no public benefit
# http://www.webreaper.net/
User-agent: WebReaper
Disallow: /