forked from eldy/AWStats
-
Notifications
You must be signed in to change notification settings - Fork 0
/
robots.pm
2786 lines (2760 loc) · 77.3 KB
/
robots.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# AWSTATS ROBOTS DATABASE
#-------------------------------------------------------
# If you want to add robots to extend AWStats database detection capabilities,
# you must add an entry in RobotsSearchIDOrder_listx and RobotsHashIDLib.
# The entry in RobotsSearchIDOrder_listx is a Perl regular expression
# (see http://perldoc.perl.org/perlreref.html). AWStats applies these
# expressions to the user agent string in the order given by the lists. The
# first match specifies the robot.
#
# Note: This regular expression must not contain any whitespace.
# Otherwise AWStats will produce lines in the database that
# will be misinterpreted and as a consequence the corresponding data in the
# generated HTML reports will be wrong. If you want to match whitespace in
# the user agent string, use other constructs like '\s', '[[:blank:]]',
# '\p{IsSpace}', '\x20' etc.
#
# The corresponding entry in RobotsHashIDLib contains the regular expression
# as key, followed by a string containing HTML-text. AWStats inserts this
# text into reports to describe the bot. If possible the text should contain
# a link to the bot home page. This makes it easier for sysadmins to find
# the information necessary e.g. to adapt the robots.txt file.
#
# An entry in the RobotsAffiliateLib is not necessary. An entry in this list
# contains as first part the regular expression specifying the bot. The
# second part is a string that gives the Company or product managing the bot.
# This information is not used yet.
#
# There are several sorts of bots that AWStats is not able to detect and
# therefore a considerable amount of bot generated traffic counts
# as user traffic:
#
# a) A crawler that identifies itself in the referrer string, but not in
# the user agent string. An example is the crawler from semalt.semalt.com.
#
# b) Crawlers that correctly access robots.txt but identify themselves in
# the user agent string only once or just a few times. Most of the
# time a user agent string is used that does not contain hints that
# a bot is involved. An example is the iCjobs spider.
# msnbot-UDiscovery/2.0b seems to show this behaviour too.
#
#
#
#-------------------------------------------------------
# 2023-07-04 RobC
# Removed Dalvik as native Android UI Browser User Agent
# Removed CFNetwork as native iOS and OSX Browser User Agent
# 2021-05-05 RobC
# Removed Baidu catchall because it's picking up baidu.sogo.uc.UCBrowser which is a phone browser
# Added baiduspider- catchall instead
# Newly added from 2021-05-05
# Adsbot
# BW/
# Bytespider
# CheckMarkNetwork/
# DuckDuckBot
# Foregenix Web Scan
# IonCrawl
# Linguee Bot
# Neevabot
# PetalBot
# TkBot
# vuhuvBot
# 2018-03-13 RobC
# Added 36 robots and one generic ( survey ) using v 7.7 robots file as base.
# Also moved robot "Obot" into generics so that it is singled out as an individual Robot.
#
# 2016-09-02 RobC
# Fixed a few errors and added a few missing bots from awstats 7.5 release.
#
# 2016-08-28 RobC
# Complete re-build of this file almost from scratch.
# dropped many old bots, added many new bots and reordered file.
# edited and added regex expressions to stop spaces causing problems.
# You should tune file by placing the most common robots crawling your site at top
# in List1.
#
#
# N.B. many bots need to be in correct order so don't change order without checking if
# change will cause counts to be allocated to wrong bot. Not always simple.
#
#
# 2005-08-19 Sean Carlos http://www.antezeta.com/awstats.html
# added dipsie (not tested with real data).
# added DomainsDB.net http://domainsdb.net/
# added ia_archiver-web.archive.org (was inadvertently grouped with Alexa traffic)
# added Nutch (used by looksmart (furl?))
# added rssImagesBot
# added Sqworm
# added t\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e
# added w3c css-validator
# added documentation link to bot home pages for above and selected major bots.
# In the case of international bots, choose .com page.
# Included tool tip (html "title").
# To do: parameterize to match both AWStats language and tooltips settings.
# To do: add html links for all bots based on current documentation in source
# files referenced below.
# changed '\wbot[\/\-]', to '\wbot[\/\-]' (removed comma)
# made minor grammar corrections to notes below
# 2005-08-24 added YahooSeeker-Testing
# added w3c-checklink
# updated url for ask.com
# 2005-08-24 added Girafabot http://www.girafa.com/
# 2005-08-30 added PluckFeedCrawler http://www.pluck.com/
# added Gaisbot/3.0 ([email protected]; )
# added geniebot ([email protected])
# added BecomeBot link http://www.become.com/site_owners.html
# added topicblogs http://www.topicblogs.com/
# added Powermarks; seen used by referrer spam
# added YahooSeeker
# added NG/2. http://www.exabot.com/
# 2005-09-15 added link for Walhello appie
# added bender focused_crawler
# updated YahooSeeker description (blog crawler)
# 2005-09-16 added link for http://linkchecker.sourceforge.net
# added ConveraCrawler/0.9d ( http://www.authoritativeweb.com/crawl)
# added Blogslive [email protected] intelliseek.com
# added BlogPulse (ISSpider-3.0) intelliseek.com
# 2005-09-26 added Feedfetcher-Google (http://www.google.com/feedfetcher.html)
# added EverbeeCrawler
# added Yahoo-Blogs http://help.yahoo.com/help/us/ysearch/crawling/crawling-02.html
# added link for Bloglines http://www.bloglines.com
# 2005-10-19 fixed Feedfetcher-Google (http://www.google.com/feedfetcher.html)
# added Blogshares Spiders (Synchronized V1.5.1)
# added yacy
# 2005-11-21 added Argus www.simpy.com
# added BlogsSay :: RSS Search Crawler (http://www.blogssay.com/)
# added MJ12bot http://majestic12.co.uk/bot.php
# added OpenTaggerBot (http://www.opentagger.com/opentaggerbot.htm)
# added OutfoxBot/0.3 (For internet experiments; [email protected])
# added RufusBot Rufus Web Miner http://64.124.122.252.webaroo.com/feedback.html
# added Seekbot (http://www.seekbot.net/bot.html)
# added Yahoo-MMCrawler/3.x ([email protected])
# added link for BaiDuSpider
# added link for Blogshares Spider
# added link for StackRambler http://www.rambler.ru/doc/faq.shtml
# added link for WISENutbot
# added link for ZyBorg/1.0 ([email protected]; http://www.WISEnutbot.com. Moved location to above wisenut to avoid classification as wisenut
# 2005-12-15
# added FAST Enteprise Crawler/6 (www dot fastsearch dot com). Note spelling Enteprise not Enterprise.
# added findlinks http://wortschatz.uni-leipzig.de/findlinks/
# added IBM Almaden Research Center WebFountain™ http://www.almaden.ibm.com/cs/crawler [hc3]
# added INFOMINE/8.0 VLCrawler (http://infomine.ucr.edu/useragents)
# added lmspider ([email protected]) http://www.nuance.com/
# added noxtrumbot http://www.noxtrum.com/
# added SandCrawler (Microsoft)
# added SBIder http://www.sitesell.com/sbider.html
# added SeznamBot http://fulltext.seznam.cz/
# added sohu-search http://corp.sohu.com/ (looked for //robots.txt not /robots.txt)
# added the ruffle SemanticWeb crawler v0.5 - http://www.unreach.net
# added WebVulnCrawl/1.0 libwww-perl/5.803 (looked for //robots.txt not /robots.txt)
# added Yahoo! Japan keyoshid http://www.yahoo.co.jp/
# added Y!J http://help.yahoo.co.jp/help/jp/search/indexing/indexing-15.html
# added link for GigaBot
# added link for MagpieRSS
# added link for MSIECrawler
# 2005-12-21
# added aipbot http://www.aipbot.com [email protected] [matthys70 users.sourceforge.net]
# added Everest-Vulcan Inc./0.1 (R&D project; http://everest.vulcan.com/crawlerhelp)
# added Fast-Search-Engine http://www.fast-search-engine.com/ [matthys70 users.sourceforge.net]
# added g2Crawler ([email protected]) http://crawler.instantnetworks.net/
# added Jakarta commons-httpclient http://jakarta.apache.org/commons/httpclient/ (hit robots.txt). May be used as robot or browser - a site may want to remove this entry.
# added OmniExplorer_Bot http://www.omni-explorer.com/ [matthys70 users.sourceforge.net]
# added USTC-Semantic-Group ai.ustc.edu.cn/mas/en/research/index.php ?
# 2005-12-22
# added EARTHCOM.info www.earthcom.info
# added HTTrack off-line browser 'httrack','HTTrack', http://www.httrack.com/ [Moizes Gabor]
# added KummHttp http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_g_l_301105_2\b [Moizes Gabor]
# 2006-01-01
# added Dulance http://www.dulance.com/bot.jsp
# added MojeekBot http://www.mojeek.com/bot.html
# added nicebot http://www.egghelp.org/setup.htm ?
# added Snappy http://www.urltrends.com/faq.php
# added sohu agent
# added VORTEX http://marty.anstey.ca/robots/vortex/ [matthys70 users.sourceforge.net]
# added zspider http://feedback.redkolibri.com/
# 2006-01-13
# added boitho.com-dc http://www.boitho.com/dcbot.html
# added IRLbot http://irl.cs.tamu.edu/crawler
# added virus_detector [email protected]
# added Wavefire http://www.wavefire.com; [email protected]
# added WebFilter Robot
# 2006-01-24
# added Shim-Crawler http://www.logos.ic.i.u-tokyo.ac.jp/crawler/; [email protected]
# added Exabot exabot.com
# added LetsCrawl.com http://letscrawl.com
# added ichiro http://help.goo.ne.jp/door/crawlerE.html
# 2006-01-27 additional 22 robots from a list provided by Moizes Gabor
# added ALeadSoftbot http://www.aleadsoft.com/bot.htm
# added CipinetBot http://www.cipinet.com/bot.html
# added Cuasarbot http://www.cuasar.com/
# added Dumbot http://www.dumbfind.com/
# added Extreme_Picture_Finder http://www.exisoftware.com/
# added Fooky.com/ScorpionBot/ScoutOut http://www.fooky.com/scorpionbots
# added IlTrovatore-Setaccio http://www.iltrovatore.it/aiuto/motore_di_ricerca.html [email protected]
# added InsurancoBot http://www.fastspywareremoval.com/
# added InternetArchive http://lucene.apache.org/nutch/bot.html [email protected]
# added KazoomBot http://www.kazoom.ca/bot.html [email protected]
# added Kurzor http://www.easymail.hu/ [email protected]
# added NutchCVS http://lucene.apache.org/nutch/bot.html [email protected]
# added NutchOSU-VLIB http://lucene.apache.org/nutch/bot.html [email protected]
# added Orbiter http://www.dailyorbit.com/bot.htm
# added PHP_version_tracker http://www.nexen.net/phpversion/bot.php
# added SuperBot http://www.sparkleware.com/superbot/
# added SynooBot http://www.synoo.de/bot.html [email protected]
# added TestBot http://www.agbrain.com/
# added TutorGigBot http://www.tutorgig.info/
# added WebIndexer mailto://[email protected]
# added WebMiner http://64.124.122.252/feedback.html
# 2006-02-01
# added heritrix https://sourceforge.net/forum/message.php?msg_id=3550202
# added Zeus Webster Pro https://sourceforge.net/forum/message.php?msg_id=3141164
# additional robots from a list provided by Moizes Gabor [ mojzi -a-t- free mail hu ]
# added Candlelight_Favorites_Inspector
# added DomainChecker
# added EasyDL
# added FavOrg
# added Favorites_Sweeper
# added Html_Link_Validator
# added Internet_Ninja
# added JRTwine_Software_Check_Favorites_Utility
# fixed Microsoft_URL_Control
# added miniRank
# added Missigua_Locator
# added NPBot
# added Ocelli
# added Onet.pl_SA
# added proodleBot
# added SearchGuild_DMOZ_Experiment
# added Susie
# added Website_Monitoring_Bot
# added Xenu_Link_Sleuth
# 2006-05-15
# added ASPseek http://www.aspseek.org/
# added AdamM Bot http://home.blic.net/adamm/
# added archive.org_bot http://crawls.archive.org/collections/bncf/crawl.html
# added arianna.libero.it (Italian Portal/search engine)
# added Biz360 spider http://www.biz360.com
# added BlogBridge Service http://www.blogbridge.com/
# added BlogSearch http://www.icerocket.com/
# added libcrawl
# added edgeio-retriever http://www.edgeio.com
# added FeedFlow http://feedflow.com/about
# added Biblioteca Nazionale Centrale di Firenze (Italian National Archive) http://www.bncf.firenze.sbn.it/raccolta.txt
# added Java catchall - used by many spam bots
# added lanshanbot http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_g_l_140406_1%5Cb
# added msnbot-media http://search.msn.com/msnbot.htm
# added MT::Telegraph::Agent
# added Netluchs http://www.netluchs.de/ (German SE bot)
# added oBot http://www.webmasterworld.com/forum11/1616.htm
# added Onfolio http://www.onfolio.com/ (IE Toolbar plugin) - hit rss feeds.
# added ping.blo.gs http://blo.gs/ping.php blog bot
# added Sphere Scout http://www.sphere.com/
# added sproose crawler http://www.sproose.com/bot.html
# added SyndicAPI http://syndicapi.com/bot.html
# added Yahoo! Mindset http://mindset.research.yahoo.com/
# added msrabot
# added Vagabondo & Vagabondo-WAP http://www.wise-guys.nl/Contact/index.php?botselected=webagents&lang=uk
# fixed Missigua Locator detection (Missigua_Locator -> Missigua Locator)
# changed echo to echo! to avoid conflict with the bonecho (Firefox 2.0) browser.
# This requires you to reprocess historic logs if you want EchO! to be recognized for older reports.
# 2006-05-17
# added Alpha Search Agent # 62.152.125.60 Eurologon Srl
# added Krugle http://www.krugle.com/crawler/info.html the search engine for developers
# added Octora Beta Bot http://www.octora.com/ # Blog and Rss Search Engine
# added UbiCrawler http://law.dsi.unimi.it/ubicrawler/
# added Yahoo! Slurp China http://misc.yahoo.com.cn/help.html
# You must reprocess old logs for the Yahoo! Slurp China bot to be detected in old reports
# 2006-05-20
# added 1-More Scanner http://www.myzips.com/software/1-More-Scanner.phtml
# added Accoona-AI-Agent http://www.accoona.com/
# added ActiveBookmark http://www.libmaster.com/active_bookmark.php
# added BIGLOTRON http://www.biglotron.com/robot.html
# added Bookmark-Manager http://bkm.sourceforge.net/
# added cbn00glebot
# added Cerberian Drtrs http://www.pgts.com.au/cgi-bin/psql?robot_info=25240
# added CFNetwork http://www.cocoadev.com/index.pl?CFNetwork
# added CheckWeb link validator http://p.duby.free.fr/chkweb.htm
# added Computer and Automation Research Institute Crawler http://www.ilab.sztaki.hu/~stamas/publications/p184-benczur.html
# added ConveraCrawler http://www.authoritativeweb.com/crawl/
# added ConveraMultiMediaCrawler http://www.authoritativeweb.com/crawl/
# added CSE HTML Validator Lite Online http://online.htmlvalidator.com/php/onlinevallite.php
# added Cursor http://adcenter.hu/docs/en/bot.html
# added Custo http://www.netwu.com/custo/
# added DataFountains/DMOZ Downloader http://infomine.ucr.edu/
# added Deepindex http://www.deepindex.net/faq.php
# added DNSGroup http://www.dnsgroup.com/
# added DoCoMo http://www.nttdocomo.co.jp/
# added dumm.de-Bot http://www.dumm.de/
# added ETS v http://www.freetranslation.com/help/
# added eventax http://www.eventax.de/
# added FAST Enterprise Crawler * [email protected] http://www.telekom.de/
# added FAST Enterprise Crawler http://www.fast.no/
# added FAST Enterprise Crawler * T-Info_BI_cluster [email protected] http://www.telekom.de/
# added FeedValidator http://feedvalidator.org/
# added FilmkameraBot http://www.filmkamera.at/bot.html
# added Findexa Crawler http://www.findexa.no/gulesider/article26548.ece
# added Global Fetch http://www.wesonet.com/
# added GOFORITBOT http://www.goforit.com/about/
# added GoForIt.com http://www.goforit.com/about/
# added GPU p2p crawler http://gpu.sourceforge.net/search_engine.php
# added HooWWWer http://cosco.hiit.fi/search/hoowwwer/
# added HPPrint
# added HTMLParser http://htmlparser.sourceforge.net/
# added Hundesuche.com-Bot http://www.hundesuche.com/
# added InfoBot http://www.infobot.org/
# added InfociousBot http://corp.infocious.com/tech_crawler.php
# added InternetSupervision http://internetsupervision.com/
# added isearch2006 http://www.yahoo.com.cn/
# added IUPUI_Research_Bot http://spamhuntress.com/2005/04/25/a-mail-harvester-visits/
# added KalamBot http://64.124.122.251/feedback.html
# added kamano.de NewsFeedVerzeichnis http://www.kamano.de/
# added Kevin http://dznet.com/kevin/
# added KnowItAll http://www.cs.washington.edu/research/knowitall/
# added Knowledge.com http://www.knowledge.com/
# added Kouaa Krawler http://www.kouaa.com/
# added ksibot http://ego.ms.mff.cuni.cz/
# added Link Valet Online http://www.htmlhelp.com/tools/valet/
# added lwp-request http://search.cpan.org/~gaas/libwww-perl-5.69/bin/lwp-request
# added lwp-trivial http://search.cpan.org/src/GAAS/libwww-perl-5.805/lib/LWP/Simple.pm
# added MapoftheInternet.com http://MapoftheInternet.com/
# added Matrix S.p.A. - FAST Enterprise Crawler http://tin.virgilio.it/
# added Megite http://www.megite.com/
# added Metaspinner http://index.meta-spinner.de/
# added Mini-reptile
# added Misterbot http://www.misterbot.fr/
# added Miva http://www.miva.com/
# added Mizzu Labs http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_m_141105_2\b
# added MSRBOT http://research.microsoft.com/research/sv/msrbot/
# added MS SharePoint Portal Server - MS Search 4.0 Robot http://support.microsoft.com/default.aspx?scid=kb;en-us;284022
# added Mydoyouhike http://www.doyouhike.net/my
# added NASA Search http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_140506_2\b
# added NetSprint http://www.netsprint.pl/serwis/
# added NimbleCrawler http://www.healthline.com/
# added OpenWebSpider http://www.openwebspider.org/
# added Oracle Ultra Search http://www.oracle.com/technology/products/ultrasearch/index.html
# added OSSProxy http://www.marketscore.com/FAQ.Aspx
# added passwordmaker.org http://passwordmaker.org/
# added PEAR HTTP Request class http://pear.php.net/
# added PEERbot http://www.peerbot.com/
# added PHP version tracker http://www.nexen.net/phpversion/bot.php
# added PictureOfInternet http://malfunction.org/poi/
# added plinki http://www.plinki.com/
# added Port Huron Labs http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1133\b
# added PostFavorites http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1135\b
# added ProjectWF-java-test-crawler
# added PyQuery http://sourceforge.net/projects/pyquery/
# added Schizozilla http://spamhuntress.com/2005/03/18/gizmo/
# added Scumbot
# added Sensis Web Crawler http://www.sensis.com.au/
# added snap.com beta crawler http://www.snap.com/
# added Steeler http://www.tkl.iis.u-tokyo.ac.jp/~crawler/
# added STEROID Download http://faqs.org.ru/progr/pascal/delphi_internet2.htm
# added Suchfin-Bot http://www.suchfin.de/
# added Sunrise http://www.sunrisexp.com/
# added Tagyu Agent http://www.tagyu.com/
# added Tcl http client package http://www.tcl.tk/man/tcl8.4/TclCmd/http.htm
# added TeragramCrawlerSURF http://www.teragram.com/
# added Test Crawler http://netp.ath.cx/
# added UnChaos Bot Hybrid Web Search Engine http://www.unchaos.com/
# added unido-bot http://www.unchina.org/unido/unido/our_projects/3_3.html
# added UniversalFeedParser http://feedparser.org/ (seen from md301000.inktomisearch.com)
# added updated http://www.updated.com/
# added Vermut http://vermut.aol.com
# added versus crawler from [email protected] http://www.epfl.ch/Eindex.html
# added Vespa Crawler (Yahoo Norway?) http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_t_z_030406_1%5Cb
# added VSE http://www.vivisimo.com/
# added webcrawl.net http://www.webcrawl.net/
# added Web Downloader http://www.krasu.ru/soft/chuchelo/
# added Webdup http://www.webdup.com/en/index.html
# added Wells Search http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_t_z_1484\b
# added WordPress http://wordpress.org/
# added wume crawler http://wume.cse.lehigh.edu/~xiq204/crawler/
# added Xenu's Link Sleuth (with ')
# added xirq http://www.xirq.com/
# added yoogliFetchAgent http://www.yoogli.com/
# added Z-Add Link Checker http://w3.z-add.co.uk/linkcheck/
# -- fix - some robots were reported with _ where _ should have been a space.
# changed Xenu Link Sleuth
# changed microsoft[_+\s]url[_+\s]control -> microsoft_url_control
# changed favorites_sweeper -> favorites_sweeper
# -- updates
# updated AskJeeves to Ask
# 2012-06-05 Albrecht Mueller
# added Grabber from SDSC (San Diego Supercomputer Center).
# 2013-09-30 Albrecht Mueller
# AWStats probably cannot detect this bot as it identifies itself in
# the referrer field and not in the user agent string.
#92.113.100.35 - - [29/Sep/2013:17:22:46 +0200] "GET /robots.txt HTTP/1.1" 200 516 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0" "-"
#92.113.100.35 - - [29/Sep/2013:17:22:49 +0200] "GET /tghome.htm HTTP/1.1" 200 4445 "http://extrabot.com/help/frytygativyheku.htm" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0" "-"
#92.113.100.35 - - [29/Sep/2013:17:22:51 +0200] "GET / HTTP/1.1" 200 5467 "http://extrabot.com/help/frytygativyheku.htm" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0" "-"
# to do MS Search 4.0 Robot
#package AWSROB;
# Robots list was found at http://www.robotstxt.org/wc/active/all.txt
# Other robots can be found at http://www.jafsoft.com/searchengines/webbots.html
# Rem: To avoid bad detection, some robot's ids were removed from this list:
# - Robots with ID of 3 letters only
# - Robots called 'webs' and 'tcl'
# Rem: directhit changed into direct_hit (its real id)
# Rem: calif changed into calif[^r] to avoid confusion between Tiscalifreenet browser
# Rem: fish changed into [^a]fish to avoid confusion between Madsafish browser
# Rem: roadrunner changed into road_runner
# Rem: lycos changed to lycos_ to avoid confusion with lycos-online browser
# Rem: voyager changed into ^voyager\/ to avoid excluding the voyager and amigavoyager browsers
# RobotsSearchIDOrder
# It contains all matching criteria to search for in log fields. This list is
# used to know in which order to search Robot IDs.
# Most frequent ones are in list1, used when LevelForRobotsDetection is 1 or more
# Minor robots are in list2, used when LevelForRobotsDetection is 2 or more
# Note: Robots IDs are in lower case, '_', ' ' and '+' are changed into '[_+\s]' and are quoted.
#-------------------------------------------------------
# Primary robot search list. Each element is a single-quoted Perl regular
# expression that is applied to the user agent string; the entries are tried
# in the order given here and the first match identifies the robot.
# Consequences for maintenance:
#  - A pattern that contains (or is a prefix of) another pattern must come
#    AFTER the more specific one, otherwise the specific one never matches.
#  - Patterns must not contain literal whitespace; a space in the user agent
#    is written as [\x20] throughout this list.
#  - Unless a pattern carries ^ / $ anchors it can match anywhere in the
#    user agent string.
# NOTE(review): the header states that the pattern text is also used as the
# key in RobotsHashIDLib, so editing a pattern here presumably requires the
# same edit there - confirm before changing any entry.
@RobotsSearchIDOrder_list1 = (
# Common robots (In robot file)
'bingbot/',
'bingpreview',
'MSIECrawler',
'msnbot/',
'msnbot\-media/',
'AdIdxBot/',
# 'NOT Googlebot/' has to precede 'Googlebot/': the plain pattern is
# unanchored and would match such a user agent as well.
'NOT[\x20]Googlebot/',
'Googlebot/',
'Google[\x20]Web[\x20]Preview',
'Googlebot\-Image/',
'Googlebot\-Mobile/',
'Google[\x20]Page[\x20]Speed',
'google\-sitemaps',
'Googlebot\-News',
'Googlebot\-Video/',
# The two specific AdsBot-Google variants are listed ahead of the shorter
# 'Adsbot' entry.
'AdsBot\-Google[\x20]\(',
'AdsBot\-Google\-Mobile\-Apps',
'Adsbot',
'Mediapartners-Google',
'Feedfetcher\-Google',
'Google\-Adwords\-Instant',
'Firefox/1\.5',
# 'Yahoo! Slurp' is a prefix of 'Yahoo! Slurp China', so the China variant
# comes first.
'Yahoo![\x20]Slurp[\x20]China',
'Yahoo![\x20]Slurp',
# 'Baiduspider-' is a prefix of 'Baiduspider-image', so it is placed after it
# and catches the remaining Baiduspider-* variants.
'Baiduspider/',
'Baiduspider\-image',
'Baiduspider-',
# Specific Yandex* patterns first, the short generic 'yandex' entry last.
'YandexBot/',
'YandexImages/',
'YandexImageResizer',
'YandexMetrika/',
'YandexMobileBot/',
'yandex',
'electricmonk/',
'spbot/',
'SeznamBot/',
'msie8',
'AhrefsBot/',
'007ac9[\x20]Crawler',
'2345Explorer/',
'360Spider',
'A[\x20]Simple[\x20]Crawler',
'Abrave',
'acapbot/',
'Accoona\-AI\-Agent/',
'arcemedia',
'AdnormCrawlerCatchBot/',
'adscanner',
'aiHitBot/',
'aipbot/',
'AlphaBot',
'Apache\-HttpClient/',
'Apexoo[\x20]Spider',
'Applebot/',
'archive\.org_bot',
'Babya[\x20]Discoverer',
'Barkrowler',
'BDCbot/',
'BellPagesCA/',
'BeNosey[\x20]Mohawk[\x20]Search',
'bhcBot',
'bidswitchbot',
'BigBozz/',
'BinGet/',
'bitlybot',
'bl\.uk_lddc_bot/',
'BLEXBot/',
'bnf.fr_bot',
'boitho\.com\-dc/',
'BoogleBot',
'BusinessBot:',
'BW/',
'Bytespider',
'CatchBot/',
'CB/Nutch',
'CCBot/',
'CheckMarkNetwork/',
'Cliqzbot/',
'CMS[\x20]Crawler',
'Companybook\-Crawler',
'ConveraCrawler/',
'Contacts-Crawler',
'contxbot',
'cosmos/',
'crawl/Nutch',
'crawler4j',
'CRAZYWEBCRAWLER',
'CRMNLCrawlAgent',
'CSE[\x20]HTML[\x20]Validator',
'C\-T[\x20]bot',
'CUBOT',
'Curl/PHP',
'cyencebot',
'DataCrawler/',
# 'daum' is a prefix of 'daumoa', so 'daumoa' is listed first.
'daumoa',
'daum',
'Deepnet[\x20]Explorer',
'DeuSu/',
'Digincore',
'Discordbot/',
'Dispatch/',
'DnyzBot',
'DoCoMo/',
'Domain[\x20]Re\-Animator[\x20]Bot',
'DomainCrawler/',
'DomainMacroCrawler/',
'DomainSONOCrawler/',
'DomainStatsBot/',
'DotBot/',
# 'DuckDuckBot' is a prefix of 'DuckDuckBot-Https', so the -Https variant
# comes first.
'DuckDuckBot-Https',
'DuckDuckBot',
'DuckDuckGo\-Favicons\-Bot/',
'ELinks/',
'ELinks[\x20]\(',
'EmailMarketingRobot/',
'EmeraldShield\.com[\x20]WebBot',
'envolk\[ITS\]spider/',
'eright',
'EsperanzaBot',
'Exabot/',
'ExtLinksBot',
'ExperianCrawlUK',
'facebookexternalhit/',
'fast_enterprise_crawler.*scrawleradmin\.t\-info@telekom\.de',
'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de',
'FAST\-WebCrawler/',
'Feosey[\x20]Mohk[\x20]Crawler',
'findlinks/',
'Findxbot/',
'FirePHP/',
'firstdirectory\-bot',
'flamingo',
'FlippyBearBot/',
# Anchored at both ends: matches only a user agent that consists of "foo"
# alone.
'^foo$',
'Foregenix[\x20]Web[\x20]Scan',
'FreeWebMonitoring[\x20]SiteChecker/',
'fujilabol',
'FurlBot/',
'Gaisbot/',
'Gallent[\x20]Spider',
'GarlikCrawler/',
'Getintent[\x20]Crawler',
'GetintentCrawler[\x20]getintent\.com',
'Gigabot/',
'gipo\-crawler/Nutch',
'Girafabot',
'Gluten[\x20]Free[\x20]Crawler/',
'gocrawl',
'Gowikibot',
'Go\-http\-client/',
'GrapeshotCrawler/',
'GSiteCrawler/',
'GurujiBot/',
'hadiBot',
'HaosouSpider',
'HELLO[\x20]Crawler',
'holmes/',
'houzzbot',
'HTTP_Request2/',
'HubSpot[\x20]Webcrawler',
'HyperCrawl/',
'ICC\-Crawler/',
'iconoclast',
'IDGCrawler/Nutch',
'IDG/UK',
'idmarch[\x20]Automatic\.beta/',
'InbyBot',
'Incutio[\x20]XML',
'IndeedBot',
'InfluenceBot',
'IonCrawl',
'IRLbot/',
'IssueCrawler',
'istellabot/',
'James[\x20]BOT',
'Jigsaw/',
'JobFeed',
'Jooblebot',
'KomodiaBot/',
'Konqueror/',
'laserlikebot',
'Lightspeed',
'linkapediabot',
# 'metager\-linkchecker' contains 'linkchecker', so it is listed ahead of the
# plain 'linkchecker' entry; the shorter 'LinkCheck' follows both.
'metager\-linkchecker',
'Linguee[\x20]Bot',
'linkchecker',
'LinkCheck',
'linkdexbot/',
'LinkedInBot/',
'LinkpadBot/',
'Links[\x20]\(',
'LinksManager\.com_bot',
'LWP::Simple/',
'Mail\.RU_Bot/',
'makecontact',
'mappy',
'MauiBot',
'meanpathbot/',
'Mechanize',
'Mediatoolkitbot',
'MegaIndex\.ru/',
'merzscope',
'Meta_Bot',
'mfibot/',
'microsoft.*discovery',
'missigua_locator',
'MixrankBot',
'MJ12bot/',
'MojeekBot',
'Mojolicious',
'MXT/Nutch',
'My[\x20]Nutch[\x20]Spider/',
'myse/Nutch',
'Naaraa',
'Neevabot',
'NerdyBot',
'netEstate[\x20]NE[\x20]Crawler',
'NetResearchServer/',
'Nimbostratus-Bot',
'nominet',
'NRLCorpusBuilder/Nutch',
'nutch\-1\.4/',
'nutch\-1\.8/',
'NutchCVS/',
'o\.uk[\x20]robot',
'ocrawler;',
'ODP[\x20]link[\x20]checker',
'Offline[\x20]Explorer/',
'OmniExplorer_Bot/',
'OrangeBot/',
'Orliac',
'OutclicksBot',
'PageBitesHyperBot/',
'Pcore',
'pdffillerbot/',
'peopleman',
'PetalBot',
'PhantomJS',
'PHP/5\.2\.8',
'Pinterestbot',
'PiplBot',
'Ploetz[\x20]\+[\x20]Zeller',
'Plukkie/',
'Princetonbot/',
'PrivacyAwareBot/',
'Prlog/',
'proximic',
'psbot/',
'psbot\-image',
'python_wk_crawler',
'Python\-urllib/',
'QCrawl',
'Quick-Crawler',
'ResearchBot',
'roboto',
'rogerbot/',
'RSSingBot',
'RukiCrawler/',
# The specific SafeDNS / SafeSearch entries precede the short 'safesearch'
# pattern.
'SafeDNS[\x20]search[\x20]bot/',
'SafeDNSBot',
'SafeSearch[\x20]microdata[\x20]crawler',
'safesearch',
'SBL\-BOT',
'scrapy',
'Screaming[\x20]Frog[\x20]SEO[\x20]Spider/',
'ScreenerBot[\x20]Crawler[\x20]Beta',
'Scrubby',
'Searchie/',
'SecurityResearch\.bot',
'Seekmo',
'semanticbot',
'SemrushBot/',
'SemrushBot-SI',
'seo\-audit\-check\-bot/',
'Seobility',
'SEOkicks\-Robot',
'SEOlyticsCrawler/',
'SEOstats',
'Seosys/Nutch',
'Seoterritory\.com[\x20]bot',
'serendeputy',
'Shim\-Crawler',
'SiteExplorer/',
'siteexplorer\.info',
'siteimprove',
'Slackbot\-LinkExpanding',
'SmabblerBot/',
'Sogou[\x20]web[\x20]spider/',
'special_archiver/',
'Spiderbot/',
'SpuhexBot',
'spyonweb',
'ssearch_bot',
'Streamline3Bot',
'SurdotlyBot/',
'SurveyBot/',
'taiil/Nutch',
'tbot\-nutch',
'TeeRaidBot',
'TelegramBot',
'Test/Nutch',
'Test[\x20]Spider',
'TestCrawler',
'The[\x20]Knowledge[\x20]AI',
'TkBot',
'tracemyfile',
'trendiction',
# 'TurnitinBot' (no slash) is a prefix of 'TurnitinBot/', so the form with
# the slash is tried first.
'TurnitinBot/',
'TurnitinBot',
'TweetmemeBot/',
'UCY/Nutch',
'uni-leipzig\.de',
'Uptimebot/',
'UptimeRobot/',
'URL[\x20]Checker',
'UXCrawlerBot',
'Validator\.nu/',
'vBSEO',
'vBulletin[\x20]via[\x20]PHP',
'vebidoobot',
'vegi[\x20]bot',
'Velen',
'viz/Nutch',
'VoilaBot',
'VORTEX/',
'voyager/',
'vuhuvBot',
'W3C_Validator/',
'W3C\-checklink/',
'WBSearchBot/',
'WbSrch/',
'WeSEE:Ads/PageBot',
'WeSEE:Ads/PictureBot',
'WeSEE_Bot',
'Wget/',
'Who\.is[\x20]Bot',
'wonderbot/',
'woobot/',
'Wotbox/',
'Xaldon[\x20]WebSpider',
'Xenu[\x20]Link[\x20]Sleuth',
'xenu_link_sleuth',
'XML[\x20]Sitemaps[\x20]Generator',
'XoviBot/',
'yacybot',
'Yahoo[\x20]Link[\x20]Preview',
'yak',
'YisouSpider',
'yoozBot',
'Your\-Website\-Sucks',
'zoominfobot',
'zspider/',
'ZumBot/',
# below placed at end to catch some generics
# Within this group the longer names still come before the short pattern
# they contain: 'libwww\-perl' and 'perlcrawler' before 'perl', 'javabee' and
# 'projectwf\-java\-test\-crawler' before 'java', and
# 'loocalcrawler/nutch' and 'nutchosu\-vlib' before 'nutch'.
'ng/1\.',
'ng/2\.',
'libwww\-perl',
'urllib',
'javabee',
'projectwf\-java\-test\-crawler',
'java',
'loocalcrawler/nutch',
'nutchosu\-vlib',
'nutch',
'perlcrawler',
'perl',
# old robots using firefox < version 11 not identifying themselves as a robot.
# The pattern matches "firefox/" followed either by a single digit and a dot
# (versions 0-9) or by "00." / "10." (the [0-1][0] pair). It is the last
# element, hence no trailing comma.
'(firefox/)([0-9]\.|[0-1][0]\.)'
);
@RobotsSearchIDOrder_list2 = (
# Less common robots (In robot file)
'^Mozilla$',
'^mozilla\/3\.0\s\(compatible$',
'^mozilla\/4\.0$',
'^mozilla\/4\.0\s\(compatible;\)$',
'^mozilla\/5\.0$',
'^mozilla\/5\.0\s\(compatible;$',
'^mozilla\/5\.0\s\(en\-us\)$',
'^mozilla\/5\.0\sfirefox\/3\.0\.5$',
'^Mozilla/6\.0[\x20]\(compatible\)$',
'^Mozilla/(.*)Beta[\x20]\(Windows\)',
'MSIE[\x20]2',
'MSIE[\x20]3',
'MSIE[\x20]4',
'MSIE[\x20]5',
'MSIE[\x20]6',
'MSIE\+6\.0\;',
'Windows[\x20]95',
'Windows[\x20]98',
# these could be removed to speed up processing as they are rarely seen
'a6\-indexer',
'abcdatos',
'abonti\.com',
'acme\.spider',
'activebookmark',
'adamm_bot',
'advbot',
'affectv\.co\.uk',
'ahoythehomepagefinder',
'aleadsoftbot',
'alkaline',
'allrati',
'alltop',
'almaden',
'alpha_search_agent',
'anthill',
'antibot',
'aport',
'appie',
'applesyndication',
'arachnophilia',
'arale',
'araneo',
'architext',
'archive\-de\.com',
'aretha',
'argus',
'ariadne',
'arianna\.libero\.it',
'arks',
'aspider',
'aspseek',
'asterias',
'asynchttpclient',
'atn\.txt',
'atomz',
'auresys',
'awbot',
'backlinktest\.com',
'backrub',
'bbot',
'becomebot',
'bender',
'betabot',
'bigbrother',
'biglotron',
'BingLocalSearch',
'bittorrent_bot',
'biz360[_+\s]spider',
'bjaaland',
'blackwidow',
'blindekuh',
'blogbridge[_+\s]service',
'blogged_crawl',
'bloglines',
'bloglovin',
'blogpulse',
'blogsearch',
'blogshares',
'blogslive',
'blogssay',
'bloodhound',
'bncf\.firenze\.sbn\.it/raccolta\.txt',
'bobby',
'bookmark\-manager',
'borg\-bot',
'boris',
'brightnet',
'bruinbot',
'bspider',
'bubing',
'bumblebee',
'butterfly',
'buzztracker',
'cactvschemistryspider',
'calif[^r]',
'candlelight[_+\s]favorites[_+\s]inspector',
'careerbot',
'carpathia',
'cassandra',
'catbot',
'cbn00glebot',
'cerberian_drtrs',
'cfetch',
'cgireader',
'chattertrap',
'check_http',
'checkbot',
'checkweb_link_validator',
'christcrawler',
'churl',
'cienciaficcion',
'cipinetbot',
'imagecoccoc',
'coccoc',
'coldfusion',
'collective',
'combine',
'commons\-httpclient',
'computer_and_automation_research_institute_crawler',
'conceptbot',
'contentmatch',
'converamultimediacrawler',
'coolbot',
'copubbot',
'core',
'covario',
'cruiser',
'cscrawler',
'cuasarbot',
'cursor',
'cusco',
'custo',
'cyberspyder',
'datafountains/dmoz_downloader',
'dataprovider\.com',
'daviesbot',
'daylifefeedfetcher',
'daypopbot',
'deepindex',
'desertrealm',
'deweb',
'dienstspider',
'digger',
'digout4u',
'diibot',
'dipsie\.bot',
'direct_hit',
'discobot',
'dlvr\.it',
'dnabot',
'dnsgroup',
'doccheckbot',
'domainappender',
'domainchecker',
'domainsdb\.net',
'download_express',
'dragonbot',
'dreamwidth',
'drupal',
'dulance',
'dumbot',
'dumm\.de\-bot',
'dwcp',
'e\-collector',
'earthcom\.info',
'easydl',
'ebiness',
'eccp',
'echo!',
'edgeio\-retriever',
'elfinbot',
'emacs',
'emcspider',
'enteprise',
'ernst[:blank:]2\.0',
'esther',
'ets_v',
'eventax',
'everbeecrawler',
'everest\-vulcan',
'evliyacelebi',
'exactseek',
'extreme[_+\s]picture[_+\s]finder',
'ezoom',
'ezresult',
'facebook',
'facebot',
'fast\-search\-engine',
'matrix_s\.p\.a\._\-_fast_enterprise_crawler',
'fast_enterprise_crawler',
'fastbot',
'fastcrawler',
'favicon',
'favorg',
'favorites_sweeper',
'fdse',
'feedburner',
'feedcrawl',
'feedflow',
'feedmyinbox',