mirror of
https://github.com/DSpace/DSpace.git
synced 2025-10-07 01:54:22 +00:00
Merge pull request #431 from hardyoyo/DS-1841-add-examples-for-spider-filtering-by-domain-and-agent
DS-1841 adding example files for agent and domain-based spider filtering
This commit is contained in:
224
dspace/config/spiders/agents/example
Normal file
224
dspace/config/spiders/agents/example
Normal file
@@ -0,0 +1,224 @@
|
||||
# example spider filter by agent string regular expressions courtesy of OSU Libraries
|
||||
# https://raw.github.com/osulibraries/DSpace/osukb/dspace/config/Spiders-UserAgent.txt
|
||||
Alexandria(\s|\+)prototype(\s|\+)project
|
||||
AllenTrack
|
||||
Arachmo
|
||||
Brutus\/AET
|
||||
China\sLocal\sBrowse\s2\.6
|
||||
Code\sSample\sWeb\sClient
|
||||
ContentSmartz
|
||||
DSurf
|
||||
DataCha0s\/2\.0
|
||||
Demo\sBot
|
||||
EmailSiphon
|
||||
EmailWolf
|
||||
FDM(\s|\+)1
|
||||
Fetch(\s|\+)API(\s|\+)Request
|
||||
GetRight
|
||||
Goldfire(\s|\+)Server
|
||||
Googlebot
|
||||
HTTrack
|
||||
LOCKSS
|
||||
LWP\:\:Simple
|
||||
MSNBot
|
||||
Microsoft(\s|\+)URL(\s|\+)Control
|
||||
Milbot
|
||||
MuscatFerre
|
||||
NABOT
|
||||
NaverBot
|
||||
Offline(\s|\+)Navigator
|
||||
OurBrowser
|
||||
Python\-urllib
|
||||
Readpaper
|
||||
Strider
|
||||
T\-H\-U\-N\-D\-E\-R\-S\-T\-O\-N\-E
|
||||
Teleport(\s|\+)Pro
|
||||
Teoma
|
||||
Wanadoo
|
||||
Web(\s|\+)Downloader
|
||||
WebCloner
|
||||
WebCopier
|
||||
WebReaper
|
||||
WebStripper
|
||||
WebZIP
|
||||
Webinator
|
||||
Webmetrics
|
||||
Wget
|
||||
Xenu(\s|\+)Link(\s|\+)Sleuth
|
||||
[+:,\.\;\/\\-]bot
|
||||
[^a]fish
|
||||
^voyager\/
|
||||
acme\.spider
|
||||
alexa
|
||||
almaden
|
||||
appie
|
||||
architext
|
||||
archive\.org_bot
|
||||
arks
|
||||
asterias
|
||||
atomz
|
||||
autoemailspider
|
||||
awbot
|
||||
baiduspider
|
||||
bbot
|
||||
biadu
|
||||
biglotron
|
||||
bjaaland
|
||||
blaiz\-bee
|
||||
bloglines
|
||||
blogpulse
|
||||
boitho\.com\-dc
|
||||
bookmark\-manager
|
||||
bot
|
||||
bot[+:,\.\;\/\\-]
|
||||
bspider
|
||||
bwh3_user_agent
|
||||
celestial
|
||||
cfnetwork|checkbot
|
||||
combine
|
||||
commons\-httpclient
|
||||
contentmatch
|
||||
core
|
||||
crawl
|
||||
crawler
|
||||
cursor
|
||||
custo
|
||||
daumoa
|
||||
docomo
|
||||
dtSearchSpider
|
||||
dumbot
|
||||
easydl
|
||||
exabot
|
||||
fast-webcrawler
|
||||
favorg
|
||||
feedburner
|
||||
feedfetcher\-google
|
||||
ferret
|
||||
findlinks
|
||||
gaisbot
|
||||
geturl
|
||||
gigabot
|
||||
girafabot
|
||||
gnodspider
|
||||
google
|
||||
grub
|
||||
gulliver
|
||||
harvest
|
||||
heritrix
|
||||
hl_ftien_spider
|
||||
holmes
|
||||
htdig
|
||||
htmlparser
|
||||
httpget\-5\.2\.2
|
||||
httpget\?5\.2\.2
|
||||
httrack
|
||||
iSiloX
|
||||
ia_archiver
|
||||
ichiro
|
||||
iktomi
|
||||
ilse
|
||||
internetseer
|
||||
intute
|
||||
java
|
||||
java\/
|
||||
jeeves
|
||||
jobo
|
||||
kyluka
|
||||
larbin
|
||||
libwww
|
||||
libwww\-perl
|
||||
lilina
|
||||
linkbot
|
||||
linkcheck
|
||||
linkchecker
|
||||
linkscan
|
||||
linkwalker
|
||||
livejournal\.com
|
||||
lmspider
|
||||
lwp
|
||||
lwp\-request
|
||||
lwp\-tivial
|
||||
lwp\-trivial
|
||||
lycos[_+]
|
||||
mail\.ru
|
||||
mediapartners\-google
|
||||
megite
|
||||
milbot
|
||||
mimas
|
||||
mj12bot
|
||||
mnogosearch
|
||||
moget
|
||||
mojeekbot
|
||||
momspider
|
||||
motor
|
||||
msiecrawler
|
||||
msnbot
|
||||
myweb
|
||||
nagios
|
||||
netcraft
|
||||
netluchs
|
||||
ng\/2\.
|
||||
no_user_agent
|
||||
nomad
|
||||
nutch
|
||||
ocelli
|
||||
onetszukaj
|
||||
perman
|
||||
pioneer
|
||||
playmusic\.com
|
||||
playstarmusic\.com
|
||||
powermarks
|
||||
psbot
|
||||
python
|
||||
qihoobot
|
||||
rambler
|
||||
redalert|robozilla
|
||||
robot
|
||||
robots
|
||||
rss
|
||||
scan4mail
|
||||
scientificcommons
|
||||
scirus
|
||||
scooter
|
||||
seekbot
|
||||
seznambot
|
||||
shoutcast
|
||||
slurp
|
||||
sogou
|
||||
speedy
|
||||
spider
|
||||
spiderman
|
||||
spiderview
|
||||
sunrise
|
||||
superbot
|
||||
surveybot
|
||||
tailrank
|
||||
technoratibot
|
||||
titan
|
||||
turnitinbot
|
||||
twiceler
|
||||
ucsd
|
||||
ultraseek
|
||||
urlaliasbuilder
|
||||
urllib
|
||||
virus[_+]detector
|
||||
voila
|
||||
w3c\-checklink
|
||||
webcollage
|
||||
weblayers
|
||||
webmirror
|
||||
webreaper
|
||||
wordpress
|
||||
worm
|
||||
xenu
|
||||
y!j
|
||||
yacy
|
||||
yahoo
|
||||
yahoo\-mmcrawler
|
||||
yahoofeedseeker
|
||||
yahooseeker
|
||||
yandex
|
||||
yodaobot
|
||||
zealbot
|
||||
zeus
|
||||
zyborg
|
12
dspace/config/spiders/domains/example
Normal file
12
dspace/config/spiders/domains/example
Normal file
@@ -0,0 +1,12 @@
|
||||
# example spider filter by domain regular expressions courtesy of OSU Libraries
|
||||
# https://raw.github.com/osulibraries/DSpace/osukb/dspace/config/Spiders-DomainName.txt
|
||||
(.*)\.fastsearch\.net\.
|
||||
(.*)\.scoutjet\.com\.
|
||||
(.*)\.yahoo\.com\.
|
||||
crawl(.*)\.exabot\.com\.
|
||||
crawl-(.*)\.googlebot\.com\.
|
||||
crawler(.*)\.ask\.com\.
|
||||
discobot-(.*)\.discoveryengine\.com\.
|
||||
localhost\.
|
||||
spider(.*)\.yandex\.ru\.
|
||||
spider-(.*)\.yandex\.com\.
|
Reference in New Issue
Block a user