-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
references.bib
executable file
·2968 lines (2854 loc) · 305 KB
/
references.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@article{Ashburner2003,
  author          = {Ashburner, M. and Mungall, C. J. and Lewis, S. E.},
  title           = {Ontologies for Biologists: A Community Model for the Annotation of Genomic Data},
  journal         = {Cold Spring Harbor Symposia on Quantitative Biology},
  volume          = {68},
  pages           = {227--235},
  year            = {2003},
  issn            = {0091-7451},
  pmid            = {15338622},
  url             = {http://www.ncbi.nlm.nih.gov/pubmed/15338622},
  mendeley-groups = {Thesis try 2}
}
@article{Zerbino2018,
  abstract = {The Ensembl project has been aggregating, processing, integrating and redistributing genomic datasets since the initial releases of the draft human genome, with the aim of accelerating genomics research through rapid open distribution of public data. Large amounts of raw data are thus transformed into knowledge, which is made available via a multitude of channels, in particular our browser (http://www.ensembl.org). Over time, we have expanded in multiple directions. First, our resources describe multiple fields of genomics, in particular gene annotation, comparative genomics, genetics and epigenomics. Second, we cover a growing number of genome assemblies; Ensembl Release 90 contains exactly 100. Third, our databases feed simultaneously into an array of services designed around different use cases, ranging from quick browsing to genome-wide bioinformatic analysis. We present here the latest developments of the Ensembl project, with a focus on managing an increasing number of assemblies, supporting efforts in genome interpretation and improving our browser.},
  author = {Zerbino, Daniel R and Achuthan, Premanand and Akanni, Wasiu and Amode, M Ridwan and Barrell, Daniel and Bhai, Jyothish and Billis, Konstantinos and Cummins, Carla and Gall, Astrid and Gir{\'o}n, Carlos Garc{\'\i}a and Gil, Laurent and Gordon, Leo and Haggerty, Leanne and Haskell, Erin and Hourlier, Thibaut and Izuogu, Osagie G and Janacek, Sophie H and Juettemann, Thomas and To, Jimmy Kiang and Laird, Matthew R and Lavidas, Ilias and Liu, Zhicheng and Loveland, Jane E and Maurel, Thomas and McLaren, William and Moore, Benjamin and Mudge, Jonathan and Murphy, Daniel N and Newman, Victoria and Nuhn, Michael and Ogeh, Denye and Ong, Chuang Kee and Parker, Anne and Patricio, Mateus and Riat, Harpreet Singh and Schuilenburg, Helen and Sheppard, Dan and Sparrow, Helen and Taylor, Kieron and Thormann, Anja and Vullo, Alessandro and Walts, Brandon and Zadissa, Amonida and Frankish, Adam and Hunt, Sarah E and Kostadima, Myrto and Langridge, Nicholas and Martin, Fergal J and Muffato, Matthieu and Perry, Emily and Ruffier, Magali and Staines, Dan M and Trevanion, Stephen J and Aken, Bronwen L and Cunningham, Fiona and Yates, Andrew and Flicek, Paul},
  doi = {10.1093/nar/gkx1098},
  issn = {1362-4962},
  journal = {Nucleic Acids Research},
  mendeley-groups = {Doctoral},
  month = jan,
  number = {D1},
  pages = {D754--D761},
  pmid = {29155950},
  title = {{Ensembl} 2018},
  url = {http://www.ncbi.nlm.nih.gov/pubmed/29155950},
  volume = {46},
  year = {2018}
}
@article{Wright2005,
  abstract = {The HGNC Comparison of Orthology Predictions search tool, HCOP (http://www.gene.ucl.ac.uk/cgi-bin/nomenclature/hcop.pl ), enables users to compare predicted human and mouse orthologs for a specified gene, or set of genes, from either species according to the ortholog assertions from the Ensembl, HGNC, Homologene, Inparanoid, MGI and PhIGs databases. Users can assess the reliability of the prediction from the number of these different sources that identify a particular orthologous pair. HCOP provides a useful one-stop resource to summarise, compare and access various sources of human and mouse orthology data.},
  author = {Wright, Mathew W and Eyre, Tina A and Lush, Michael J and Povey, Sue and Bruford, Elspeth A},
  doi = {10.1007/s00335-005-0103-2},
  issn = {0938-8990},
  journal = {Mammalian Genome},
  mendeley-groups = {Doctoral},
  month = nov,
  number = {11},
  pages = {827--828},
  pmid = {16284797},
  title = {{HCOP}: The {HGNC} Comparison of Orthology Predictions Search Tool},
  url = {http://www.ncbi.nlm.nih.gov/pubmed/16284797},
  volume = {16},
  year = {2005}
}
@article{Blake2017,
  abstract = {The Mouse Genome Database (MGD: http://www.informatics.jax.org) is the primary community data resource for the laboratory mouse. It provides a highly integrated and highly curated system offering a comprehensive view of current knowledge about mouse genes, genetic markers and genomic features as well as the associations of those features with sequence, phenotypes, functional and comparative information, and their relationships to human diseases. MGD continues to enhance access to these data, to extend the scope of data content and visualizations, and to provide infrastructure and user support that ensures effective and efficient use of MGD in the advancement of scientific knowledge. Here, we report on recent enhancements made to the resource and new features.},
  author = {Blake, Judith A. and Eppig, Janan T. and Kadin, James A. and Richardson, Joel E. and Smith, Cynthia L. and Bult, Carol J. and Anagnostopoulos, A. and Baldarelli, R. M. and Beal, J. S. and Bello, S. M. and Blodgett, O. and Butler, N. E. and Corbani, L. E. and Dene, H. and Drabkin, H. J. and Forthofer, K. L. and Giannatto, S. L. and Hale, P. and Hill, D. P. and Hutchins, L. and Knowlton, M. and Lavertu, A. and Law, M. and Lewis, J. R. and Lopez, V. and Maghini, D. and Perry, D. and McAndrews, M. and Miers, D. and Montenko, H. and Ni, L. and Onda, H. and Recla, J. M. and Reed, D. J. and Richards-Smith, B. and Sitnikov, D. and Tomczuk, M. and Wilming, L. and Zhu, Y.},
  doi = {10.1093/nar/gkw1040},
  file = {:Users/cthoyt/ownCloud/Mendeley/2017/Blake et al. - 2017 - Mouse Genome Database (MGD)-2017 Community knowledge resource for the laboratory mouse.pdf:pdf},
  issn = {1362-4962},
  journal = {Nucleic Acids Research},
  mendeley-groups = {Doctoral},
  number = {D1},
  pages = {D723--D729},
  pmid = {27899570},
  title = {{Mouse Genome Database} ({MGD})-2017: Community Knowledge Resource for the Laboratory Mouse},
  volume = {45},
  year = {2017}
}
@article{Shimoyama2015,
  abstract = {The Rat Genome Database (RGD, http://rgd.mcw.edu) provides the most comprehensive data repository and informatics platform related to the laboratory rat, one of the most important model organisms for disease studies. RGD maintains and updates datasets for genomic elements such as genes, transcripts and increasingly in recent years, sequence variations, as well as map positions for multiple assemblies and sequence information. Functional annotations for genomic elements are curated from published literature, submitted by researchers and integrated from other public resources. Complementing the genomic data catalogs are those associated with phenotypes and disease, including strains, QTL and experimental phenotype measurements across hundreds of strains. Data are submitted by researchers, acquired through bulk data pipelines or curated from published literature. Innovative software tools provide users with an integrated platform to query, mine, display and analyze valuable genomic and phenomic datasets for discovery and enhancement of their own research. This update highlights recent developments that reflect an increasing focus on: (i) genomic variation, (ii) phenotypes and diseases, (iii) data related to the environment and experimental conditions and (iv) datasets and software tools that allow the user to explore and analyze the interactions among these and their impact on disease.},
  author = {Shimoyama, Mary and {De Pons}, Jeff and Hayman, G. Thomas and Laulederkind, Stanley J. F. and Liu, Weisong and Nigam, Rajni and Petri, Victoria and Smith, Jennifer R. and Tutaj, Marek and Wang, Shur Jen and Worthey, Elizabeth and Dwinell, Melinda and Jacob, Howard},
  doi = {10.1093/nar/gku1026},
  file = {:Users/cthoyt/ownCloud/Mendeley/2015/Shimoyama et al. - 2015 - The Rat Genome Database 2015 Genomic, phenotypic and environmental variations and disease.pdf:pdf},
  issn = {1362-4962},
  journal = {Nucleic Acids Research},
  mendeley-groups = {Doctoral},
  number = {D1},
  pages = {D743--D750},
  pmid = {25355511},
  title = {The {Rat Genome Database} 2015: Genomic, Phenotypic and Environmental Variations and Disease},
  volume = {43},
  year = {2015}
}
@article{Yates2017,
  abstract = {The HUGO Gene Nomenclature Committee (HGNC) based at the European Bioinformatics Institute (EMBL-EBI) assigns unique symbols and names to human genes. Currently the HGNC database contains almost 40 000 approved gene symbols, over 19 000 of which represent protein-coding genes. In addition to naming genomic loci we manually curate genes into family sets based on shared characteristics such as homology, function or phenotype. We have recently updated our gene family resources and introduced new improved visualizations which can be seen alongside our gene symbol reports on our primary website http://www.genenames.org In 2016 we expanded our remit and formed the Vertebrate Gene Nomenclature Committee (VGNC) which is responsible for assigning names to vertebrate species lacking a dedicated nomenclature group. Using the chimpanzee genome as a pilot project we have approved symbols and names for over 14 500 protein-coding genes in chimpanzee, and have developed a new website http://vertebrate.genenames.org to distribute these data. Here, we review our online data and resources, focusing particularly on the improvements and new developments made during the last two years.},
  author = {Yates, Bethan and Braschi, Bryony and Gray, Kristian A. and Seal, Ruth L. and Tweedie, Susan and Bruford, Elspeth A.},
  doi = {10.1093/nar/gkw1033},
  file = {:Users/cthoyt/ownCloud/Mendeley/2017/Yates et al. - 2017 - Genenames.org The HGNC and VGNC resources in 2017.pdf:pdf},
  issn = {1362-4962},
  journal = {Nucleic Acids Research},
  number = {D1},
  pages = {D619--D625},
  pmid = {27799471},
  title = {{Genenames.org}: The {HGNC} and {VGNC} Resources in 2017},
  volume = {45},
  year = {2017}
}
@article{Maglott2011,
  abstract = {Entrez Gene (www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gene) is NCBI's database for gene-specific information. It does not include all known or predicted genes; instead Entrez Gene focuses on the genomes that have been completely sequenced, that have an active research community to contribute gene-specific information, or that are scheduled for intense sequence analysis. The content of Entrez Gene represents the result of curation and automated integration of data from NCBI's Reference Sequence project (RefSeq), from collaborating model organism databases, and from many other databases available from NCBI. Records are assigned unique, stable and tracked integers as identifiers. The content (nomenclature, map location, gene products and their attributes, markers, phenotypes, and links to citations, sequences, variation details, maps, expression, homologs, protein domains and external databases) is updated as new information becomes available. Entrez Gene is a step forward from NCBI's LocusLink, with both a major increase in taxonomic scope and improved access through the many tools associated with NCBI Entrez.},
  author = {Maglott, Donna and Ostell, Jim and Pruitt, Kim D. and Tatusova, Tatiana},
  doi = {10.1093/nar/gkq1237},
  file = {:Users/cthoyt/ownCloud/Mendeley/2011/Maglott et al. - 2011 - Entrez gene Gene-centered information at NCBI.pdf:pdf},
  issn = {0305-1048},
  journal = {Nucleic Acids Research},
  number = {SUPPL. 1},
  pages = {52--57},
  pmid = {15608257},
  title = {{Entrez Gene}: Gene-Centered Information at {NCBI}},
  volume = {39},
  year = {2011}
}
@article{Howe2013,
  abstract = {ZFIN, the Zebrafish Model Organism Database (http://zfin.org), is the central resource for zebrafish genetic, genomic, phenotypic and developmental data. ZFIN curators manually curate and integrate comprehensive data involving zebrafish genes, mutants, transgenics, phenotypes, genotypes, gene expressions, morpholinos, antibodies, anatomical structures and publications. Integrated views of these data, as well as data gathered through collaborations and data exchanges, are provided through a wide selection of web-based search forms. Among the vertebrate model organisms, zebrafish are uniquely well suited for rapid and targeted generation of mutant lines. The recent rapid production of mutants and transgenic zebrafish is making management of data associated with these resources particularly important to the research community. Here, we describe recent enhancements to ZFIN aimed at improving our support for mutant and transgenic lines, including (i) enhanced mutant/transgenic search functionality; (ii) more expressive phenotype curation methods; (iii) new downloads files and archival data access; (iv) incorporation of new data loads from laboratories undertaking large-scale generation of mutant or transgenic lines and (v) new GBrowse tracks for transgenic insertions, genes with antibodies and morpholinos.},
  author = {Howe, Douglas G and Bradford, Yvonne M and Conlin, Tom and Eagle, Anne E and Fashena, David and Frazer, Ken and Knight, Jonathan and Mani, Prita and Martin, Ryan and Moxon, Sierra A Taylor and Paddock, Holly and Pich, Christian and Ramachandran, Sridhar and Ruef, Barbara J and Ruzicka, Leyla and Schaper, Kevin and Shao, Xiang and Singer, Amy and Sprunger, Brock and {Van Slyke}, Ceri E and Westerfield, Monte},
  doi = {10.1093/nar/gks938},
  issn = {1362-4962},
  journal = {Nucleic Acids Research},
  mendeley-groups = {Thesis try 2},
  month = jan,
  number = {Database issue},
  pages = {D854--D860},
  pmid = {23074187},
  title = {{ZFIN}, the {Zebrafish Model Organism Database}: Increased Support for Mutants and Transgenics},
  url = {http://www.ncbi.nlm.nih.gov/pubmed/23074187},
  volume = {41},
  year = {2013}
}
@article{Thurmond2019,
  abstract = {FlyBase (flybase.org) is a knowledge base that supports the community of researchers that use the fruit fly, Drosophila melanogaster, as a model organism. The FlyBase team curates and organizes a diverse array of genetic, molecular, genomic, and developmental information about Drosophila. At the beginning of 2018, 'FlyBase 2.0' was released with a significantly improved user interface and new tools. Among these important changes are a new organization of search results into interactive lists or tables (hitlists), enhanced reference lists, and new protein domain graphics. An important new data class called 'experimental tools' consolidates information on useful fly strains and other resources related to a specific gene, which significantly enhances the ability of the Drosophila researcher to design and carry out experiments. With the release of FlyBase 2.0, there has also been a restructuring of backend architecture and a continued development of application programming interfaces (APIs) for programmatic access to FlyBase data. In this review, we describe these major new features and functionalities of the FlyBase 2.0 site and how they support the use of Drosophila as a model organism for biological discovery and translational research.},
  author = {Thurmond, Jim and Goodman, Joshua L and Strelets, Victor B and Attrill, Helen and Gramates, L Sian and Marygold, Steven J and Matthews, Beverley B and Millburn, Gillian and Antonazzo, Giulia and Trovisco, Vitor and Kaufman, Thomas C and Calvi, Brian R and {FlyBase Consortium}},
  doi = {10.1093/nar/gky1003},
  issn = {1362-4962},
  journal = {Nucleic Acids Research},
  mendeley-groups = {Thesis try 2},
  month = jan,
  number = {D1},
  pages = {D759--D765},
  pmid = {30364959},
  title = {{FlyBase} 2.0: The Next Generation},
  url = {http://www.ncbi.nlm.nih.gov/pubmed/30364959},
  volume = {47},
  year = {2019}
}
@article{Cherry2012,
  abstract = {The Saccharomyces Genome Database (SGD, http://www.yeastgenome.org) is the community resource for the budding yeast Saccharomyces cerevisiae. The SGD project provides the highest-quality manually curated information from peer-reviewed literature. The experimental results reported in the literature are extracted and integrated within a well-developed database. These data are combined with quality high-throughput results and provided through Locus Summary pages, a powerful query engine and rich genome browser. The acquisition, integration and retrieval of these data allow SGD to facilitate experimental design and analysis by providing an encyclopedia of the yeast genome, its chromosomal features, their functions and interactions. Public access to these data is provided to researchers and educators via web pages designed for optimal ease of use.},
  author = {Cherry, J Michael and Hong, Eurie L and Amundsen, Craig and Balakrishnan, Rama and Binkley, Gail and Chan, Esther T and Christie, Karen R and Costanzo, Maria C and Dwight, Selina S and Engel, Stacia R and Fisk, Dianna G and Hirschman, Jodi E and Hitz, Benjamin C and Karra, Kalpana and Krieger, Cynthia J and Miyasato, Stuart R and Nash, Rob S and Park, Julie and Skrzypek, Marek S and Simison, Matt and Weng, Shuai and Wong, Edith D},
  doi = {10.1093/nar/gkr1029},
  issn = {1362-4962},
  journal = {Nucleic Acids Research},
  mendeley-groups = {Thesis try 2},
  month = jan,
  number = {Database issue},
  pages = {D700--D705},
  pmid = {22110037},
  title = {{Saccharomyces Genome Database}: The Genomics Resource of Budding Yeast},
  url = {http://www.ncbi.nlm.nih.gov/pubmed/22110037},
  volume = {40},
  year = {2012}
}
@article{Karimi2018,
  author = {Karimi, Kamran and Fortriede, Joshua D and Lotay, Vaneet S and Burns, Kevin A and Wang, Dong Zhou and Fisher, Malcom E and Pells, Troy J and James-Zorn, Christina and Wang, Ying and Ponferrada, V G and Chu, Stanley and Chaturvedi, Praneet and Zorn, Aaron M and Vize, Peter D},
  doi = {10.1093/nar/gkx936},
  issn = {0305-1048},
  journal = {Nucleic Acids Research},
  mendeley-groups = {Thesis try 2},
  month = jan,
  number = {D1},
  pages = {D861--D868},
  title = {{Xenbase}: A Genomic, Epigenomic and Transcriptomic Model Organism Database},
  url = {http://academic.oup.com/nar/article/46/D1/D861/4559118},
  volume = {46},
  year = {2018}
}
@article{Bult2019,
  abstract = {The Mouse Genome Database (MGD; http://www.informatics.jax.org) is the community model organism genetic and genome resource for the laboratory mouse. MGD is the authoritative source for biological reference data sets related to mouse genes, gene functions, phenotypes, and mouse models of human disease. MGD is the primary outlet for official gene, allele and mouse strain nomenclature based on the guidelines set by the International Committee on Standardized Nomenclature for Mice. In this report we describe significant enhancements to MGD, including two new graphical user interfaces: (i) the Multi Genome Viewer for exploring the genomes of multiple mouse strains and (ii) the Phenotype-Gene Expression matrix which was developed in collaboration with the Gene Expression Database (GXD) and allows researchers to compare gene expression and phenotype annotations for mouse genes. Other recent improvements include enhanced efficiency of our literature curation processes and the incorporation of Transcriptional Start Site (TSS) annotations from RIKEN's FANTOM 5 initiative.},
  author = {Bult, Carol J and Blake, Judith A and Smith, Cynthia L and Kadin, James A and Richardson, Joel E and {Mouse Genome Database Group}},
  doi = {10.1093/nar/gky1056},
  issn = {1362-4962},
  journal = {Nucleic Acids Research},
  mendeley-groups = {Thesis try 2},
  month = jan,
  number = {D1},
  pages = {D801--D806},
  pmid = {30407599},
  title = {{Mouse Genome Database} ({MGD}) 2019},
  url = {http://www.ncbi.nlm.nih.gov/pubmed/30407599},
  volume = {47},
  year = {2019}
}
@article{Bachman2018,
  abstract = {For automated reading of scientific publications to extract useful information about molecular mechanisms it is critical that genes, proteins and other entities be correctly associated with uniform identifiers, a process known as named entity linking or ``grounding.'' Correct grounding is essential for resolving relationships among mined information, curated interaction databases, and biological datasets. The accuracy of this process is largely dependent on the availability of machine-readable resources associating synonyms and abbreviations commonly found in biomedical literature with uniform identifiers. In a task involving automated reading of $\sim$215,000 articles using the REACH event extraction software we found that grounding was disproportionately inaccurate for multi-protein families (e.g., ``AKT'') and complexes with multiple subunits (e.g., ``NF-$\kappa$B''). To address this problem we constructed FamPlex, a manually curated resource defining protein families and complexes as they are commonly encountered in biomedical text. In FamPlex the gene-level constituents of families and complexes are defined in a flexible format allowing for multi-level, hierarchical membership. To create FamPlex, text strings corresponding to entities were identified empirically from literature and linked manually to uniform identifiers; these identifiers were also mapped to equivalent entries in multiple related databases. FamPlex also includes curated prefix and suffix patterns that improve named entity recognition and event extraction. Evaluation of REACH extractions on a test corpus of $\sim$54,000 articles showed that FamPlex significantly increased grounding accuracy for families and complexes (from 15 to 71\%). The hierarchical organization of entities in FamPlex also made it possible to integrate otherwise unconnected mechanistic information across families, subfamilies, and individual proteins. Applications of FamPlex to the TRIPS/DRUM reading system and the Biocreative VI Bioentity Normalization Task dataset demonstrated the utility of FamPlex in other settings. FamPlex is an effective resource for improving named entity recognition, grounding, and relationship resolution in automated reading of biomedical text. The content in FamPlex is available in both tabular and Open Biomedical Ontology formats at https://github.com/sorgerlab/famplex under the Creative Commons CC0 license and has been integrated into the TRIPS/DRUM and REACH reading systems.},
  author = {Bachman, John A. and Gyori, Benjamin M. and Sorger, Peter K.},
  doi = {10.1186/s12859-018-2211-5},
  file = {:Users/cthoyt/ownCloud/Mendeley/2018/FamPlex A resource for entity recognition and relationship resolution of human protein families and complexes in biomedical text mining.pdf:pdf},
  issn = {1471-2105},
  journal = {BMC Bioinformatics},
  keywords = {Biocuration,Event extraction,Grounding,Named entity linking,Named entity recognition,Natural language processing,Protein families,Text mining},
  mendeley-groups = {Thesis try 2},
  number = {1},
  pages = {1--14},
  pmid = {29954318},
  publisher = {BioMed Central},
  title = {{FamPlex}: A Resource for Entity Recognition and Relationship Resolution of Human Protein Families and Complexes in Biomedical Text Mining},
  volume = {19},
  year = {2018}
}
@article{Hobbs1978,
  abstract = {Two approaches to the problem of resolving pronoun references are presented. The first is a naive algorithm that works by traversing the surface parse trees of the sentences of the text in a particular order looking for noun phrases of the correct gender and number. The algorithm clearly does not work in all cases, but the results of an examination of several hundred examples from published texts show that it performs remarkably well. In the second approach, it is shown how pronoun solution can be handled in a comprehensive system for semantic analysis of English texts. The system is described, and it is shown in a detailed treatment of several examples how semantic analysis locates the antecedents of most pronouns as a by-product. Included are the classic examples of Winograd and Charniak.},
  author = {Hobbs, Jerry R},
  doi = {10.1016/0024-3841(78)90006-2},
  issn = {0024-3841},
  journal = {Lingua},
  mendeley-groups = {Thesis try 2},
  number = {4},
  pages = {311--338},
  title = {Resolving Pronoun References},
  url = {http://www.sciencedirect.com/science/article/pii/0024384178900062},
  volume = {44},
  year = {1978}
}
@inproceedings{Brennan1987,
  author    = {Brennan, Susan E. and Friedman, Marilyn W. and Pollard, Carl J.},
  title     = {A Centering Approach to Pronouns},
  booktitle = {Proceedings of the 25th Annual Meeting on Association for Computational Linguistics},
  series    = {ACL '87},
  location  = {Stanford, California},
  pages     = {155--162},
  numpages  = {8},
  year      = {1987},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA, USA},
  doi       = {10.3115/981175.981197},
  url       = {https://doi.org/10.3115/981175.981197},
  acmid     = {981197},
}
@article{Lappin1994,
  author     = {Lappin, Shalom and Leass, Herbert J.},
  title      = {An Algorithm for Pronominal Anaphora Resolution},
  journal    = {Computational Linguistics},
  issue_date = {December 1994},
  volume     = {20},
  number     = {4},
  month      = dec,
  year       = {1994},
  issn       = {0891-2017},
  pages      = {535--561},
  numpages   = {27},
  url        = {http://dl.acm.org/citation.cfm?id=203987.203989},
  acmid      = {203989},
  publisher  = {MIT Press},
  address    = {Cambridge, MA, USA},
}
@article{Soon2001,
  author     = {Soon, Wee Meng and Ng, Hwee Tou and Lim, Daniel Chung Yong},
  title      = {A Machine Learning Approach to Coreference Resolution of Noun Phrases},
  journal    = {Computational Linguistics},
  issue_date = {December 2001},
  volume     = {27},
  number     = {4},
  month      = dec,
  year       = {2001},
  issn       = {0891-2017},
  pages      = {521--544},
  numpages   = {24},
  url        = {http://dl.acm.org/citation.cfm?id=972597.972602},
  acmid      = {972602},
  publisher  = {MIT Press},
  address    = {Cambridge, MA, USA},
}
@inproceedings{Ng2002,
  author    = {Ng, Vincent and Cardie, Claire},
  title     = {Identifying Anaphoric and Non-anaphoric Noun Phrases to Improve Coreference Resolution},
  booktitle = {Proceedings of the 19th International Conference on Computational Linguistics - Volume 1},
  series    = {COLING '02},
  location  = {Taipei, Taiwan},
  pages     = {1--7},
  numpages  = {7},
  year      = {2002},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA, USA},
  doi       = {10.3115/1072228.1072367},
  url       = {https://doi.org/10.3115/1072228.1072367},
  acmid     = {1072367},
}
@inproceedings{Bengtson2008,
  author    = {Bengtson, Eric and Roth, Dan},
  title     = {Understanding the Value of Features for Coreference Resolution},
  booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing},
  series    = {EMNLP '08},
  location  = {Honolulu, Hawaii},
  pages     = {294--303},
  numpages  = {10},
  year      = {2008},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA, USA},
  url       = {http://dl.acm.org/citation.cfm?id=1613715.1613756},
  acmid     = {1613756},
}
@inproceedings{Luo2004,
  author    = {Luo, Xiaoqiang and Ittycheriah, Abe and Jing, Hongyan and Kambhatla, Nanda and Roukos, Salim},
  title     = {A Mention-synchronous Coreference Resolution Algorithm Based on the {Bell} Tree},
  booktitle = {Proceedings of the 42nd Annual Meeting on Association for Computational Linguistics},
  series    = {ACL '04},
  year      = {2004},
  location  = {Barcelona, Spain},
  articleno = {135},
  url       = {https://doi.org/10.3115/1218955.1218973},
  doi       = {10.3115/1218955.1218973},
  acmid     = {1218973},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA, USA},
}
@inproceedings{Yang2004,
author = {Yang, Xiaofeng and Su, Jian and Zhou, Guodong and Tan, Chew Lim},
title = {An {NP}-cluster Based Approach to Coreference Resolution},
booktitle = {Proceedings of the 20th International Conference on Computational Linguistics},
series = {COLING '04},
year = {2004},
location = {Geneva, Switzerland},
articleno = {226},
url = {https://doi.org/10.3115/1220355.1220388},
doi = {10.3115/1220355.1220388},
acmid = {1220388},
publisher = {Association for Computational Linguistics},
address = {Stroudsburg, PA, USA},
}
@inproceedings{Yang2008,
author = {Yang, Xiaofeng and Su, Jian and Lang, Jun and Tan, Chew Lim and Liu, Ting and Li, Sheng},
title = {An Entity-Mention Model for Coreference Resolution with Inductive Logic Programming},
booktitle = {Proceedings of ACL-08: HLT},
month = jun,
year = {2008},
address = {Columbus, Ohio},
publisher = {Association for Computational Linguistics},
pages = {843--851},
url = {http://www.aclweb.org/anthology/P/P08/P08-1096}
}
@inproceedings{Lee2011,
author = {Lee, Heeyoung and Peirsman, Yves and Chang, Angel and Chambers, Nathanael and Surdeanu, Mihai and Jurafsky, Dan},
title = {{Stanford}'s Multi-pass Sieve Coreference Resolution System at the {CoNLL}-2011 Shared Task},
booktitle = {Proceedings of the Fifteenth Conference on Computational Natural Language Learning: Shared Task},
series = {CONLL Shared Task '11},
year = {2011},
isbn = {9781937284084},
location = {Portland, Oregon},
pages = {28--34},
numpages = {7},
url = {http://dl.acm.org/citation.cfm?id=2132936.2132938},
acmid = {2132938},
publisher = {Association for Computational Linguistics},
address = {Stroudsburg, PA, USA},
}
@inproceedings{Denis2007,
author = {Denis, Pascal and Baldridge, Jason},
title = {A Ranking Approach to Pronoun Resolution},
booktitle = {Proceedings of the 20th International Joint Conference on Artificial Intelligence},
series = {IJCAI'07},
year = {2007},
location = {Hyderabad, India},
pages = {1588--1593},
numpages = {6},
url = {http://dl.acm.org/citation.cfm?id=1625275.1625532},
acmid = {1625532},
publisher = {Morgan Kaufmann Publishers Inc.},
address = {San Francisco, CA, USA},
}
@inproceedings{Rahman2009,
author = {Rahman, Altaf and Ng, Vincent},
title = {Supervised Models for Coreference Resolution},
booktitle = {Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 2},
series = {EMNLP '09},
year = {2009},
isbn = {978-1-932432-62-6},
location = {Singapore},
pages = {968--977},
numpages = {10},
url = {http://dl.acm.org/citation.cfm?id=1699571.1699639},
acmid = {1699639},
publisher = {Association for Computational Linguistics},
address = {Stroudsburg, PA, USA},
}
@article{martschat2015,
title = {Latent Structures for Coreference Resolution},
author = "Martschat, Sebastian and Strube, Michael",
journal = "Transactions of the Association for Computational Linguistics",
volume = "3",
year = "2015",
url = "https://www.aclweb.org/anthology/Q15-1029",
doi = "10.1162/tacl_a_00147",
pages = "405--418",
abstract = {Machine learning approaches to coreference resolution vary greatly in the modeling of the problem: while early approaches operated on the mention pair level, current research focuses on ranking architectures and antecedent trees. We propose a unified representation of different approaches to coreference resolution in terms of the structure they operate on. We represent several coreference resolution approaches proposed in the literature in our framework and evaluate their performance. Finally, we conduct a systematic analysis of the output of these approaches, highlighting differences and similarities.},
}
@article{Rahman2011,
author = {Rahman, Altaf and Ng, Vincent},
title = {Narrowing the Modeling Gap: A Cluster-ranking Approach to Coreference Resolution},
journal = {Journal of Artificial Intelligence Research},
issue_date = {January 2011},
volume = {40},
number = {1},
month = jan,
year = {2011},
issn = {1076-9757},
pages = {469--521},
numpages = {53},
url = {http://dl.acm.org/citation.cfm?id=2016945.2016958},
acmid = {2016958},
publisher = {AI Access Foundation},
address = {USA},
}
@inproceedings{Ma2014,
address = {Stroudsburg, PA, USA},
author = {Ma, Chao and Doppa, Janardhan Rao and Orr, J. Walker and Mannem, Prashanth and Fern, Xiaoli and Dietterich, Tom and Tadepalli, Prasad},
booktitle = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
doi = {10.3115/v1/D14-1225},
pages = {2115--2126},
publisher = {Association for Computational Linguistics},
title = {{Prune-and-Score}: Learning for Greedy Coreference Resolution},
url = {http://aclweb.org/anthology/D14-1225},
year = {2014}
}
@inproceedings{Clark2016,
title = "Improving Coreference Resolution by Learning Entity-Level Distributed Representations",
author = {Clark, Kevin and Manning, Christopher D.},
booktitle = "Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2016",
address = "Berlin, Germany",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P16-1061",
doi = "10.18653/v1/P16-1061",
pages = "643--653",
}
@article{Li2018,
author = {Li, Chen and Rao, Zhiqiang and Zheng, Qinghua and Zhang, Xiangrong},
title = "{A set of domain rules and a deep network for protein coreference resolution}",
journal = {Database},
volume = {2018},
year = {2018},
month = jul,
issn = {1758-0463},
doi = {10.1093/database/bay065},
url = {https://doi.org/10.1093/database/bay065},
eprint = {http://oup.prod.sis.lan/database/article-pdf/doi/10.1093/database/bay065/27438328/bay065.pdf},
}
@article{Giorgi526244,
author = {Giorgi, John and Bader, Gary},
title = {Towards reliable named entity recognition in the biomedical domain},
elocation-id = {526244},
year = {2019},
doi = {10.1101/526244},
publisher = {Cold Spring Harbor Laboratory},
URL = {https://www.biorxiv.org/content/early/2019/01/22/526244},
eprint = {https://www.biorxiv.org/content/early/2019/01/22/526244.full.pdf},
journal = {bioRxiv}
}
@article{Hakenberg2011,
abstract = {SUMMARY Identifying mentions of named entities, such as genes or diseases, and normalizing them to database identifiers have become an important step in many text and data mining pipelines. Despite this need, very few entity normalization systems are publicly available as source code or web services for biomedical text mining. Here we present the Gnat Java library for text retrieval, named entity recognition, and normalization of gene and protein mentions in biomedical text. The library can be used as a component to be integrated with other text-mining systems, as a framework to add user-specific extensions, and as an efficient stand-alone application for the identification of gene and protein names for data analysis. On the BioCreative III test data, the current version of Gnat achieves a Tap-20 score of 0.1987. AVAILABILITY The library and web services are implemented in Java and the sources are available from http://gnat.sourceforge.net. CONTACT [email protected].},
author = {Hakenberg, J{\"{o}}rg and Gerner, Martin and Haeussler, Maximilian and Solt, Ill{\'{e}}s and Plake, Conrad and Schroeder, Michael and Gonzalez, Graciela and Nenadic, Goran and Bergman, Casey M},
doi = {10.1093/bioinformatics/btr455},
issn = {1367-4811},
journal = {Bioinformatics (Oxford, England)},
mendeley-groups = {Thesis try 2},
month = oct,
number = {19},
pages = {2769--71},
pmid = {21813477},
title = {The {GNAT} library for local and remote gene mention normalization},
url = {http://www.ncbi.nlm.nih.gov/pubmed/21813477 http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=PMC3179658},
volume = {27},
year = {2011}
}
@article{Hsu2008,
abstract = {MOTIVATION Tagging gene and gene product mentions in scientific text is an important initial step of literature mining. In this article, we describe in detail our gene mention tagger participated in BioCreative 2 challenge and analyze what contributes to its good performance. Our tagger is based on the conditional random fields model (CRF), the most prevailing method for the gene mention tagging task in BioCreative 2. Our tagger is interesting because it accomplished the highest F-scores among CRF-based methods and second over all. Moreover, we obtained our results by mostly applying open source packages, making it easy to duplicate our results. RESULTS We first describe in detail how we developed our CRF-based tagger. We designed a very high dimensional feature set that includes most of information that may be relevant. We trained bi-directional CRF models with the same set of features, one applies forward parsing and the other backward, and integrated two models based on the output scores and dictionary filtering. One of the most prominent factors that contributes to the good performance of our tagger is the integration of an additional backward parsing model. However, from the definition of CRF, it appears that a CRF model is symmetric and bi-directional parsing models will produce the same results. We show that due to different feature settings, a CRF model can be asymmetric and the feature setting for our tagger in BioCreative 2 not only produces different results but also gives backward parsing models slight but constant advantage over forward parsing model. To fully explore the potential of integrating bi-directional parsing models, we applied different asymmetric feature settings to generate many bi-directional parsing models and integrate them based on the output scores. Experimental results show that this integrated model can achieve even higher F-score solely based on the training corpus for gene mention tagging. 
AVAILABILITY Data sets, programs and an on-line service of our gene mention tagger can be accessed at http://aiia.iis.sinica.edu.tw/biocreative2.htm.},
author = {Hsu, Chun-Nan and Chang, Yu-Ming and Kuo, Cheng-Ju and Lin, Yu-Shi and Huang, Han-Shen and Chung, I-Fang},
doi = {10.1093/bioinformatics/btn183},
issn = {1367-4811},
journal = {Bioinformatics (Oxford, England)},
mendeley-groups = {Thesis try 2},
month = jul,
number = {13},
pages = {i286--94},
pmid = {18586726},
title = {Integrating high dimensional bi-directional parsing models for gene mention tagging},
url = {http://www.ncbi.nlm.nih.gov/pubmed/18586726 http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=PMC2718659},
volume = {24},
year = {2008}
}
@article{Leaman2008,
abstract = {There has been an increasing amount of research on biomedical named entity recognition, the most basic text extraction problem, resulting in significant progress by different research teams around the world. This has created a need for a freely-available, open source system implementing the advances described in the literature. In this paper we present BANNER, an open-source, executable survey of advances in biomedical named entity recognition, intended to serve as a benchmark for the field. BANNER is implemented in Java as a machine-learning system based on conditional random fields and includes a wide survey of the best techniques recently described in the literature. It is designed to maximize domain independence by not employing brittle semantic features or rule-based processing steps, and achieves significantly better performance than existing baseline systems. It is therefore useful to developers as an extensible NER implementation, to researchers as a standard for comparing innovative techniques, and to biologists requiring the ability to find novel entities in large amounts of text.},
author = {Leaman, Robert and Gonzalez, Graciela},
issn = {2335-6928},
journal = {Pacific Symposium on Biocomputing. Pacific Symposium on Biocomputing},
mendeley-groups = {Thesis try 2},
pages = {652--63},
pmid = {18229723},
title = {{BANNER}: an executable survey of advances in biomedical named entity recognition},
url = {http://www.ncbi.nlm.nih.gov/pubmed/18229723},
year = {2008}
}
@article{Wei2015,
abstract = {The automatic recognition of gene names and their associated database identifiers from biomedical text has been widely studied in recent years, as these tasks play an important role in many downstream text-mining applications. Despite significant previous research, only a small number of tools are publicly available and these tools are typically restricted to detecting only mention level gene names or only document level gene identifiers. In this work, we report GNormPlus: an end-to-end and open source system that handles both gene mention and identifier detection. We created a new corpus of 694 PubMed articles to support our development of GNormPlus, containing manual annotations for not only gene names and their identifiers, but also closely related concepts useful for gene name disambiguation, such as gene families and protein domains. GNormPlus integrates several advanced text-mining techniques, including SimConcept for resolving composite gene names. As a result, GNormPlus compares favorably to other state-of-the-art methods when evaluated on two widely used public benchmarking datasets, achieving 86.7 { \% } F1-score on the BioCreative II Gene Normalization task dataset and 50.1 { \% } F1-score on the BioCreative III Gene Normalization task dataset. The GNormPlus source code and its annotated corpus are freely available, and the results of applying GNormPlus to the entire PubMed are freely accessible through our web-based tool PubTator.},
author = {Wei, Chih-Hsuan and Kao, Hung-Yu and Lu, Zhiyong},
doi = {10.1155/2015/918710},
issn = {2314-6141},
journal = {BioMed research international},
mendeley-groups = {Thesis try 2},
pages = {918710},
pmid = {26380306},
title = {{GNormPlus}: An Integrative Approach for Tagging Genes, Gene Families, and Protein Domains},
url = {http://www.ncbi.nlm.nih.gov/pubmed/26380306 http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=PMC4561873},
volume = {2015},
year = {2015}
}
@article{Leaman2015,
abstract = {Chemical compounds and drugs are an important class of entities in biomedical research with great potential in a wide range of applications, including clinical medicine. Locating chemical named entities in the literature is a useful step in chemical text mining pipelines for identifying the chemical mentions, their properties, and their relationships as discussed in the literature. We introduce the tmChem system, a chemical named entity recognizer created by combining two independent machine learning models in an ensemble. We use the corpus released as part of the recent CHEMDNER task to develop and evaluate tmChem, achieving a micro-averaged f-measure of 0.8739 on the CEM subtask (mention-level evaluation) and 0.8745 f-measure on the CDI subtask (abstract-level evaluation). We also report a high-recall combination (0.9212 for CEM and 0.9224 for CDI). tmChem achieved the highest f-measure reported in the CHEMDNER task for the CEM subtask, and the high recall variant achieved the highest recall on both the CEM and CDI tasks. We report that tmChem is a state-of-the-art tool for chemical named entity recognition and that performance for chemical named entity recognition has now tied (or exceeded) the performance previously reported for genes and diseases. Future research should focus on tighter integration between the named entity recognition and normalization steps for improved performance. The source code and a trained model for both models of tmChem is available at: http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmChem. The results of running tmChem (Model 2) on PubMed are available in PubTator: http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/PubTator.},
author = {Leaman, Robert and Wei, Chih-Hsuan and Lu, Zhiyong},
doi = {10.1186/1758-2946-7-S1-S3},
issn = {1758-2946},
journal = {Journal of cheminformatics},
mendeley-groups = {Thesis try 2},
number = {Suppl 1 Text mining for chemistry and the CHEMDNER track},
pages = {S3},
pmid = {25810774},
title = {{tmChem}: a high performance approach for chemical named entity recognition and normalization},
url = {http://www.ncbi.nlm.nih.gov/pubmed/25810774 http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=PMC4331693},
volume = {7},
year = {2015}
}
@article{Corbett2018,
abstract = {Chemical named entity recognition (NER) has traditionally been dominated by conditional random fields (CRF)-based approaches but given the success of the artificial neural network techniques known as ``deep learning'' we decided to examine them as an alternative to CRFs. We present here several chemical named entity recognition systems. The first system translates the traditional CRF-based idioms into a deep learning framework, using rich per-token features and neural word embeddings, and producing a sequence of tags using bidirectional long short term memory (LSTM) networks---a type of recurrent neural net. The second system eschews the rich feature set---and even tokenisation---in favour of character labelling using neural character embeddings and multiple LSTM layers. The third system is an ensemble that combines the results of the first two systems. Our original BioCreative V.5 competition entry was placed in the top group with the highest F scores, and subsequent using transfer learning have achieved a final F score of 90.33 { \% } on the test data (precision 91.47 { \% } , recall 89.21 { \% } ).},
author = {Corbett, Peter and Boyle, John},
doi = {10.1186/s13321-018-0313-8},
file = {:Users/cthoyt/ownCloud/Mendeley/2018/Chemlistem chemical named entity recognition using recurrent neural networks - 2018 - Corbett, Boyle.pdf:pdf},
issn = {1758-2946},
journal = {Journal of Cheminformatics},
month = dec,
number = {1},
pages = {59},
title = {{Chemlistem}: chemical named entity recognition using recurrent neural networks},
url = {https://doi.org/10.1186/s13321-018-0313-8},
volume = {10},
year = {2018}
}
@article{Leaman2013,
abstract = {MOTIVATION Despite the central role of diseases in biomedical research, there have been much fewer attempts to automatically determine which diseases are mentioned in a text-the task of disease name normalization (DNorm)-compared with other normalization tasks in biomedical text mining research. METHODS In this article we introduce the first machine learning approach for DNorm, using the NCBI disease corpus and the MEDIC vocabulary, which combines MeSH { \textregistered } and OMIM. Our method is a high-performing and mathematically principled framework for learning similarities between mentions and concept names directly from training data. The technique is based on pairwise learning to rank, which has not previously been applied to the normalization task but has proven successful in large optimization problems for information retrieval. RESULTS We compare our method with several techniques based on lexical normalization and matching, MetaMap and Lucene. Our algorithm achieves 0.782 micro-averaged F-measure and 0.809 macro-averaged F-measure, an increase over the highest performing baseline method of 0.121 and 0.098, respectively. AVAILABILITY The source code for DNorm is available at http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/DNorm, along with a web-based demonstration and links to the NCBI disease corpus. Results on PubMed abstracts are available in PubTator: http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/PubTator .},
author = {Leaman, Robert and {Islamaj Dogan}, Rezarta and Lu, Zhiyong},
doi = {10.1093/bioinformatics/btt474},
issn = {1367-4811},
journal = {Bioinformatics (Oxford, England)},
month = nov,
number = {22},
pages = {2909--17},
pmid = {23969135},
title = {{DNorm}: disease name normalization with pairwise learning to rank},
url = {http://www.ncbi.nlm.nih.gov/pubmed/23969135 http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=PMC3810844},
volume = {29},
year = {2013}
}
@article{Gerner2010,
abstract = {BACKGROUND: The task of recognizing and identifying species names in biomedical literature has recently been regarded as critical for a number of applications in text and data mining, including gene name recognition, species-specific document retrieval, and semantic enrichment of biomedical articles. RESULTS: In this paper we describe an open-source species name recognition and normalization software system, LINNAEUS, and evaluate its performance relative to several automatically generated biomedical corpora, as well as a novel corpus of full-text documents manually annotated for species mentions. LINNAEUS uses a dictionary-based approach (implemented as an efficient deterministic finite-state automaton) to identify species names and a set of heuristics to resolve ambiguous mentions. When compared against our manually annotated corpus, LINNAEUS performs with 94 { \% } recall and 97 { \% } precision at the mention level, and 98 { \% } recall and 90 { \% } precision at the document level. Our system successfully solves the problem of disambiguating uncertain species mentions, with 97 { \% } of all mentions in PubMed Central full-text documents resolved to unambiguous NCBI taxonomy identifiers. CONCLUSIONS: LINNAEUS is an open source, stand-alone software system capable of recognizing and normalizing species name mentions with speed and accuracy, and can therefore be integrated into a range of bioinformatics and text-mining applications. The software and manually annotated corpus can be downloaded freely at http://linnaeus.sourceforge.net/.},
author = {Gerner, Martin and Nenadic, Goran and Bergman, Casey M.},
doi = {10.1186/1471-2105-11-85},
file = {:Users/cthoyt/ownCloud/Mendeley/2010/LINNAEUS A species name identification system for biomedical literature - 2010 - Gerner, Nenadic, Bergman.pdf:pdf},
issn = {14712105},
journal = {BMC Bioinformatics},
title = {{LINNAEUS}: A species name identification system for biomedical literature},
volume = {11},
year = {2010}
}
@article{Wei2012,
abstract = {As suggested in recent studies, species recognition and disambiguation is one of the most critical and challenging steps in many downstream text-mining applications such as the gene normalization task and protein-protein interaction extraction. We report SR4GN: an open source tool for species recognition and disambiguation in biomedical text. In addition to the species detection function in existing tools, SR4GN is optimized for the Gene Normalization task. As such it is developed to link detected species with corresponding gene mentions in a document. SR4GN achieves 85.42 { \% } in accuracy and compares favorably to the other state-of-the-art techniques in benchmark experiments. Finally, SR4GN is implemented as a standalone software tool, thus making it convenient and robust for use in many text-mining applications. SR4GN can be downloaded at: http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/downloads/SR4GN.},
author = {Wei, Chih-Hsuan and Kao, Hung-Yu and Lu, Zhiyong},
doi = {10.1371/journal.pone.0038460},
file = {:Users/cthoyt/ownCloud/Mendeley/2012/SR4GN A Species Recognition Software Tool for Gene Normalization - 2012 - Wei, Kao, Lu.PDF:PDF},
issn = {19326203},
journal = {PLoS ONE},
number = {6},
pages = {7--11},
title = {{SR4GN}: A species recognition software tool for gene normalization},
volume = {7},
year = {2012}
}
@article{Lee2015,
abstract = {Disease plays a central role in many areas of biomedical research and healthcare. However, the rapid growth of disease and treatment research creates barriers to the knowledge aggregation of PubMed database. Thus, a framework of disease mention recognition and normalization has become increasingly important for biomedical text mining. In this work, we utilize conditional random fields (CRFs) to develop a recognition system and optimize the results by customizing several post-processing steps, such as abbreviation resolution and consistency improvement. At the DNER subtask of BioCreative V CDR task, the system performance of disease normalization is 0.8646 of F-measure, especially a high precision (0.8963) on the normalization task.},
author = {Lee, Hsin-Chun and Hsu, Yi-Yu and Kao, Hung-Yu},
file = {:Users/cthoyt/ownCloud/Mendeley/2015/An enhanced CRF-based system for disease name entity recognition and normalization on BioCreative V DNER Task - 2015 - Lee, Hsu, Kao.pdf:pdf},
journal = {Proceedings of the Fifth BioCreative Challenge Evaluation Workshop},
keywords = {and normalization,biomedical text mining,conditional,disease name entity recognition,random fields},
pages = {226--233},
title = {An enhanced {CRF}-based system for disease name entity recognition and normalization on {BioCreative V DNER} Task},
year = {2015}
}
@article{Davis2012,
abstract = {The Comparative Toxicogenomics Database (CTD) is a public resource that promotes understanding about the effects of environmental chemicals on human health. CTD biocurators manually curate a triad of chemical-gene, chemical-disease and gene-disease relationships from the scientific literature. The CTD curation paradigm uses controlled vocabularies for chemicals, genes and diseases. To curate disease information, CTD first had to identify a source of controlled terms. Two resources seemed to be good candidates: the Online Mendelian Inheritance in Man (OMIM) and the 'Diseases' branch of the National Library of Medicine's Medical Subject Headers (MeSH). To maximize the advantages of both, CTD biocurators undertook a novel initiative to map the flat list of OMIM disease terms into the hierarchical nature of the MeSH vocabulary. The result is CTD's 'merged disease vocabulary' (MEDIC), a unique resource that integrates OMIM terms, synonyms and identifiers with MeSH terms, synonyms, definitions, identifiers and hierarchical relationships. MEDIC is both a deep and broad vocabulary, composed of 9700 unique diseases described by more than 67 000 terms (including synonyms). It is freely available to download in various formats from CTD. While neither a true ontology nor a perfect solution, this vocabulary has nonetheless proved to be extremely successful and practical for our biocurators in generating over 2.5 million disease-associated toxicogenomic relationships in CTD. Other external databases have also begun to adopt MEDIC for their disease vocabulary. Here, we describe the construction, implementation, maintenance and use of MEDIC to raise awareness of this resource and to offer it as a putative scaffold in the formal construction of an official disease ontology. DATABASE URL: http://ctd.mdibl.org/voc.go?type=disease.},
author = {Davis, Allan Peter and Wiegers, Thomas C. and Rosenstein, Michael C. and Mattingly, Carolyn J.},
doi = {10.1093/database/bar065},
issn = {17580463},
journal = {Database},
pages = {1--9},
title = {{MEDIC}: A practical disease vocabulary used at the {Comparative Toxicogenomics Database}},
volume = {2012},
year = {2012}
}
@article{Kuo2009,
abstract = {Background: To automatically process large quantities of biological literature for knowledge discovery and information curation, text mining tools are becoming essential. Abbreviation recognition is related to NER and can be considered as a pair recognition task of a terminology and its corresponding abbreviation from free text. The successful identification of abbreviation and its corresponding definition is not only a prerequisite to index terms of text databases to produce articles of related interests, but also a building block to improve existing gene mention tagging and gene normalization tools. Results: Our approach to abbreviation recognition (AR) is based on machine-learning, which exploits a novel set of rich features to learn rules from training data. Tested on the AB3P corpus, our system demonstrated a F-score of 89.90 { \% } with 95.86 { \% } precision at 84.64 { \% } recall, higher than the result achieved by the existing best AR performance system. We also annotated a new corpus of 1200 PubMed abstracts which was derived from BioCreative II gene normalization corpus. On our annotated corpus, our system achieved a F-score of 86.20 { \% } with 93.52 { \% } precision at 79.95 { \% } recall, which also outperforms all tested systems. Conclusion: By applying our system to extract all short form-long form pairs from all available PubMed abstracts, we have constructed BIOADI. Mining BIOADI reveals many interesting trends of bio-medical research. Besides, we also provide an off-line AR software in the download section on http://bioagent.iis.sinica.edu.tw/BIOADI/. ? 2009 Kuo et al; licensee BioMed Central Ltd.},
author = {Kuo, Cheng Ju and Ling, Maurice H.T. and Lin, Kuan Ting and Hsu, Chun Nan},
doi = {10.1186/1471-2105-10-S15-S7},
file = {:Users/cthoyt/ownCloud/Mendeley/2009/and definitions in biological literature - 2009 - Kuo et al.pdf:pdf},
isbn = {1471210510},
issn = {14712105},
journal = {BMC Bioinformatics},
number = {SUPPL. 15},
pages = {1--10},
title = {{BIOADI}: A machine learning approach to identifying abbreviations and definitions in biological literature},
volume = {10},
year = {2009}
}
@article{Mikolov2013,
archivePrefix = {arXiv},
arxivId = {1301.3781v3},
author = {Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
eprint = {1301.3781v3},
file = {:Users/cthoyt/ownCloud/Mendeley/2013/Efficient Estimation of Word Representations in Vector Space - 2013 - Mikolov et al.pdf:pdf},
pages = {1--12},
title = {Efficient Estimation of Word Representations in Vector Space},
year = {2013}
}
@article{Pennington2014,
author = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher D.},
file = {:Users/cthoyt/ownCloud/Mendeley/2014/Glove Global vectors for word representation - 2014 - Jeffrey Pennington, Socher, Manning.pdf:pdf},
journal = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
title = {{GloVe}: Global Vectors for Word Representation},
year = {2014}
}
@article{Lample2016,
abstract = {State-of-the-art named entity recognition systems rely heavily on hand-crafted features and domain-specific knowledge in order to learn effectively from the small, supervised training corpora that are available. In this paper, we introduce two new neural architectures---one based on bidirectional LSTMs and conditional random fields, and the other that constructs and labels segments using a transition-based approach inspired by shift-reduce parsers. Our models rely on two sources of information about words: character-based word representations learned from the supervised corpus and unsupervised word representations learned from unannotated corpora. Our models obtain state-of-the-art performance in NER in four languages without resorting to any language-specific knowledge or resources such as gazetteers.},
archivePrefix = {arXiv},
arxivId = {1603.01360},
author = {Lample, Guillaume and Ballesteros, Miguel and Subramanian, Sandeep and Kawakami, Kazuya and Dyer, Chris},
eprint = {1603.01360},
file = {:Users/cthoyt/ownCloud/Mendeley/2016/Neural Architectures for Named Entity Recognition - 2016 - Lample et al.pdf:pdf},
pages = {260--270},
title = {Neural Architectures for Named Entity Recognition},
url = {http://arxiv.org/abs/1603.01360},
year = {2016}
}
@article{Kim2003,
abstract = {MOTIVATION: Natural language processing (NLP) methods are regarded as being useful to raise the potential of text mining from biological literature. The lack of an extensively annotated corpus of this literature, however, causes a major bottleneck for applying NLP techniques. GENIA corpus is being developed to provide reference materials to let NLP techniques work for bio-textmining. RESULTS: GENIA corpus version 3.0 consisting of 2000 MEDLINE abstracts has been released with more than 400,000 words and almost 100,000 annotations for biological terms.},
author = {Kim, J. D. and Ohta, T. and Tateisi, Y. and Tsujii, J.},
doi = {10.1093/bioinformatics/btg1023},
file = {:Users/cthoyt/ownCloud/Mendeley/2003/for bio-textmining - 2003 - Ohta, Tateisi, Tsujii.pdf:pdf},
issn = {13674803},
journal = {Bioinformatics},
keywords = {Computational Molecular Biology,Corpus,Information Extraction,Natural Language Processing,Text Mining},
number = {SUPPL. 1},
pages = {180--182},
title = {{ GENIA corpus - A semantically annotated corpus for bio-textmining }},
volume = {19},
year = {2003}
}
@article{Cote2006,
abstract = {BACKGROUND: With the vast amounts of biomedical data being generated by high-throughput analysis methods, controlled vocabularies and ontologies are becoming increasingly important to annotate units of information for ease of search and retrieval. Each scientific community tends to create its own locally available ontology. The interfaces to query these ontologies tend to vary from group to group. We saw the need for a centralized location to perform controlled vocabulary queries that would offer both a lightweight web-accessible user interface as well as a consistent, unified SOAP interface for automated queries. RESULTS: The Ontology Lookup Service (OLS) was created to integrate publicly available biomedical ontologies into a single database. All modified ontologies are updated daily. A list of currently loaded ontologies is available online. The database can be queried to obtain information on a single term or to browse a complete ontology using AJAX. Auto-completion provides a user-friendly search mechanism. An AJAX-based ontology viewer is available to browse a complete ontology or subsets of it. A programmatic interface is available to query the webservice using SOAP. The service is described by a WSDL descriptor file available online. A sample Java client to connect to the webservice using SOAP is available for download from SourceForge. All OLS source code is publicly available under the open source Apache Licence. CONCLUSION: The OLS provides a user-friendly single entry point for publicly available ontologies in the Open Biomedical Ontology (OBO) format. It can be accessed interactively or programmatically at http://www.ebi.ac.uk/ontology-lookup/.},
author = {Cote, RG and Jones, P and Apweiler, R and Hermjakob, H},
doi = {10.1186/1471-2105-7-97},
file = {:Users/cthoyt/ownCloud/Mendeley/2006/The Ontology Lookup Service, a lightweight cross-platform tool for controlled vocabulary queries. - 2006 - Cote et al.pdf:pdf;:Users/cthoyt/ownCloud/Mendeley/2006/The Ontology Lookup Service, a lightweight cross-platform tool for controlled vocabulary queries. - 2006 - Cote et al(2).pdf:pdf},
issn = {1471-2105},
journal = {BMC Bioinformatics},
mendeley-groups = {Thesis,Paper Resources/PyBEL Application Note},
pages = {1--7},
pmid = {16507094},
title = {{ The Ontology Lookup Service, a lightweight cross-platform tool for controlled vocabulary queries. }},
volume = {7},
year = {2006}
}
@article{Laibe2007,
abstract = {BACKGROUND The Minimal Information Requested In the Annotation of biochemical Models (MIRIAM) is a set of guidelines for the annotation and curation processes of computational models, in order to facilitate their exchange and reuse. An important part of the standard consists in the controlled annotation of model components, based on Uniform Resource Identifiers. In order to enable interoperability of this annotation, the community has to agree on a set of standard URIs, corresponding to recognised data types. MIRIAM Resources are being developed to support the use of those URIs. RESULTS MIRIAM Resources are a set of on-line services created to catalogue data types, their URIs and the corresponding physical URLs (or resources), whether data types are controlled vocabularies or primary data resources. MIRIAM Resources are composed of several components: MIRIAM Database stores the information, MIRIAM Web Services allows to programmatically access the database, MIRIAM Library provides an access to the Web Services and MIRIAM Web Application is a way to access the data (human browsing) and also to edit or add entries. CONCLUSIONS The project MIRIAM Resources allows an easy access to MIRIAM URIs and the associated information and is therefore crucial to foster a general use of MIRIAM annotations in computational models of biological processes.},
author = {Laibe, Camille and Le Nov{\`{e}}re, Nicolas},
doi = {10.1186/1752-0509-1-58},
file = {:Users/cthoyt/ownCloud/Mendeley/2007/MIRIAM Resources tools to generate and resolve robust cross-references in Systems Biology. - 2007 - Laibe, Le Nov { \` { e } } re.pdf:pdf},
isbn = {1752-0509},
issn = {1752-0509},
journal = {BMC systems biology},
month = {dec},
pages = {58},
pmid = {18078503},
title = {{ MIRIAM Resources: tools to generate and resolve robust cross-references in Systems Biology. }},
url = {http://www.ncbi.nlm.nih.gov/pubmed/18078503 http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=PMC2259379},
volume = {1},
year = {2007}
}
@article{Taylor2008,
author = {Taylor, Chris F and Field, Dawn and Sansone, Susanna-Assunta and Aerts, Jan and Apweiler, Rolf and Ashburner, Michael and Ball, Catherine A and Binz, Pierre-Alain and Bogue, Molly and Booth, Tim and Brazma, Alvis and Brinkman, Ryan R and { Michael Clark } , Adam and Deutsch, Eric W and Fiehn, Oliver and Fostel, Jennifer and Ghazal, Peter and Gibson, Frank and Gray, Tanya and Grimes, Graeme and Hancock, John M and Hardy, Nigel W and Hermjakob, Henning and Julian, Randall K and Kane, Matthew and Kettner, Carsten and Kinsinger, Christopher and Kolker, Eugene and Kuiper, Martin and { Le Nov { \` { e } } re } , Nicolas and Leebens-Mack, Jim and Lewis, Suzanna E and Lord, Phillip and Mallon, Ann-Marie and Marthandan, Nishanth and Masuya, Hiroshi and McNally, Ruth and Mehrle, Alexander and Morrison, Norman and Orchard, Sandra and Quackenbush, John and Reecy, James M and Robertson, Donald G and Rocca-Serra, Philippe and Rodriguez, Henry and Rosenfelder, Heiko and Santoyo-Lopez, Javier and Scheuermann, Richard H and Schober, Daniel and Smith, Barry and Snape, Jason and Stoeckert, Christian J and Tipton, Keith and Sterk, Peter and Untergasser, Andreas and Vandesompele, Jo and Wiemann, Stefan},
doi = {10.1038/nbt.1411},
issn = {1546-1696},
journal = {Nature biotechnology},
mendeley-groups = {Thesis try 2},
month = {aug},
number = {8},
pages = {889--96},
pmid = {18688244},
title = {{ Promoting coherent minimum reporting guidelines for biological and biomedical investigations: the MIBBI project. }},
url = {http://www.ncbi.nlm.nih.gov/pubmed/18688244 http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=PMC2771753},
volume = {26},
year = {2008}
}
@article{Marchetti2008,
author = {Marchetti, A and Ronzano, F},
file = {:Users/cthoyt/Dropbox/Mendeley/2008/Formalizing Knowledge by Ontologies OWL and KIF - 2008 - Marchetti, Ronzano.pdf:pdf},
journal = {Relat{\'{o}}rio apresentado L'{\ldots}},
number = {003},
title = {{ Formalizing Knowledge by Ontologies: OWL and KIF }},
url = {http://weblab.iit.cnr.it/kyoto/www2.let.vu.nl/twiki/pub/Kyoto/TechnicalPapers/WP02{\_}TR003{\_}V2{\_}OWL{\_}KIF.pdf},
year = {2008}
}
@article{Allen2008,
abstract = {We describe a graphical logical formas a semantic representation for text understanding. This representation was designed to bridge the gap be- tween highly expressive "deep" representations of logical forms andmore shallow semantic encodings such as word senses and semantic relations. It preserves rich semantic content while allowing for compact ambigu- ity encoding and viable partial representations. We describe our system for semantic text processing, which has the TRIPS parser at the core, augmented with statistical preprocessing techniques and online lexical lookup. We also present an evaluation metric for the representation and use it to evaluate the performance of the TRIPS parser on the common task paragraphs. 343},
author = {Allen, James F and Swift, Mary and { De Beaumont } , Will},
doi = {10.3115/1626481.1626508},
file = {:Users/cthoyt/Dropbox/Mendeley/2008/Deep semantic analysis of text - 2008 - Allen, Swift, De Beaumont.pdf:pdf},
journal = {Proceedings of the 2008 Conference on Semantics in Text Processing STEP 08},
keywords = {2,allen 1,james f,p semantic analysis of,text},
pages = {343--354},
title = {{ Deep semantic analysis of text }},
url = {http://portal.acm.org/citation.cfm?doid=1626481.1626508},
volume = {1},
year = {2008}
}
@article{Alon2007,
abstract = {Transcription regulation networks control the expression of genes. The transcription networks of well-studied microorganisms appear to be made up of a small set of recurring regulation patterns, called network motifs. The same network motifs have recently been found in diverse organisms from bacteria to humans, suggesting that they serve as basic building blocks of transcription networks. Here I review network motifs and their functions, with an emphasis on experimental studies. Network motifs in other biological networks are also mentioned, including signalling and neuronal networks.},
author = {Alon, Uri},
doi = {10.1038/nrg2102},
file = {:Users/cthoyt/Dropbox/Mendeley/2007/Network motifs theory and experimental approaches. - 2007 - Alon.pdf:pdf},
isbn = {1471-0056 (Print)$\backslash$r1471-0056 (Linking)},
issn = {1471-0056},
journal = {Nature reviews. Genetics},
keywords = {Animals,Bacteria,Bacteria: genetics,Bacteria: metabolism,Evolution,Fungi,Fungi: genetics,Fungi: metabolism,Gene Expression Regulation,Genetic,Homeostasis,Humans,Models,Regulon,Regulon: genetics,Transcription,Transcription Factors,Transcription Factors: genetics,Transcription Factors: metabolism},
number = {6},
pages = {450--61},
pmid = {17510665},
title = {{ Network motifs: theory and experimental approaches. }},
url = {http://www.ncbi.nlm.nih.gov/pubmed/17510665},
volume = {8},
year = {2007}
}
@article{Ashburner2000,
author = {Ashburner, M and Ball, C A and Blake, J A and Botstein, D and Butler, H and Cherry, J M and Davis, A P and Dolinski, K and Dwight, S S and Eppig, J T and Harris, M A and Hill, D P and Issel-Tarver, L and Kasarskis, A and Lewis, S and Matese, J C and Richardson, J E and Ringwald, M and Rubin, G M and Sherlock, G},
doi = {10.1038/75556},
issn = {1061-4036},
journal = {Nature genetics},
month = {may},
number = {1},
pages = {25--9},
pmid = {10802651},
title = {{ Gene ontology: tool for the unification of biology. The Gene Ontology Consortium. }},
url = {http://www.ncbi.nlm.nih.gov/pubmed/10802651 http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=PMC3037419},
volume = {25},
year = {2000}
}
@article{Bateman2017,
abstract = {{ \textcopyright } 2016 The Author(s). The UniProt knowledgebase is a large resource of protein sequences and associated detailed annotation. The database contains over 60 million sequences, of which over half a million sequences have been curated by experts who critically review experimental and predicted data for each protein. The remainder are automatically annotated based on rule systems that rely on the expert curated knowledge. Since our last update in 2014, we have more than doubled the number of reference proteomes to 5631, giving a greater coverage of taxonomic diversity. We implemente d a pipeline to remove redundant highly similar proteomes that were causing excessive redundancy in UniProt. The initial run of this pipeline reduced the number of sequences in UniProt by 47 million. For our users interested in the accessory proteomes, we have made available sets of pan proteome sequences that cover the diversity of sequences for each species that is found in its strains and sub-strains. To help interpretation of genomic variants, we provide tracks of detailed protein information for the major genome browsers. We provide a SPARQL endpoint that allows complex queries of the more than 22 billion triples of data in UniProt (http://sparql.uniprot.org/). UniProt resources can be accessed via the website at http://www.uniprot.org/.},
author = {Bateman, Alex and Martin, Maria Jesus and O'Donovan, Claire and Magrane, Michele and Alpi, Emanuele and Antunes, Ricardo and Bely, Benoit and Bingley, Mark and Bonilla, Carlos and Britto, Ramona and Bursteinas, Borisas and Bye-AJee, Hema and Cowley, Andrew and { Da Silva } , Alan and { De Giorgi } , Maurizio and Dogan, Tunca and Fazzini, Francesco and Castro, Leyla Garcia and Figueira, Luis and Garmiri, Penelope and Georghiou, George and Gonzalez, Daniel and Hatton-Ellis, Emma and Li, Weizhong and Liu, Wudong and Lopez, Rodrigo and Luo, Jie and Lussi, Yvonne and MacDougall, Alistair and Nightingale, Andrew and Palka, Barbara and Pichler, Klemens and Poggioli, Diego and Pundir, Sangya and Pureza, Luis and Qi, Guoying and Rosanoff, Steven and Saidi, Rabie and Sawford, Tony and Shypitsyna, Aleksandra and Speretta, Elena and Turner, Edward and Tyagi, Nidhi and Volynkin, Vladimir and Wardell, Tony and Warner, Kate and Watkins, Xavier and Zaru, Rossana and Zellner, Hermann and Xenarios, Ioannis and Bougueleret, Lydie and Bridge, Alan and Poux, Sylvain and Redaschi, Nicole and Aimo, Lucila and ArgoudPuy, Ghislaine and Auchincloss, Andrea and Axelsen, Kristian and Bansal, Parit and Baratin, Delphine and Blatter, Marie Claude and Boeckmann, Brigitte and Bolleman, Jerven and Boutet, Emmanuel and Breuza, Lionel and Casal-Casas, Cristina and { De Castro } , Edouard and Coudert, Elisabeth and Cuche, Beatrice and Doche, Mikael and Dornevil, Dolnide and Duvaud, Severine and Estreicher, Anne and Famiglietti, Livia and Feuermann, Marc and Gasteiger, Elisabeth and Gehant, Sebastien and Gerritsen, Vivienne and Gos, Arnaud and Gruaz-Gumowski, Nadine and Hinz, Ursula and Hulo, Chantal and Jungo, Florence and Keller, Guillaume and Lara, Vicente and Lemercier, Philippe and Lieberherr, Damien and Lombardot, Thierry and Martin, Xavier and Masson, Patrick and Morgat, Anne and Neto, Teresa and Nouspikel, Nevila and Paesano, Salvo and Pedruzzi, Ivo and Pilbout, Sandrine and Pozzato, 
Monica and Pruess, Manuela and Rivoire, Catherine and Roechert, Bernd and Schneider, Michel and Sigrist, Christian and Sonesson, Karin and Staehli, Sylvie and Stutz, Andre and Sundaram, Shyamala and Tognolli, Michael and Verbregue, Laure and Veuthey, Anne Lise and Wu, Cathy H. and Arighi, Cecilia N. and Arminski, Leslie and Chen, Chuming and Chen, Yongxing and Garavelli, John S. and Huang, Hongzhan and Laiho, Kati and McGarvey, Peter and Natale, Darren A. and Ross, Karen and Vinayaka, C. R. and Wang, Qinghua and Wang, Yuqi and Yeh, Lai Su and Zhang, Jian},
doi = {10.1093/nar/gkw1099},
file = {:Users/cthoyt/Dropbox/Mendeley/2017/UniProt The universal protein knowledgebase - 2017 - Bateman et al.pdf:pdf},
issn = {13624962},
journal = {Nucleic Acids Research},
number = {D1},
pages = {D158--D169},
title = {{ UniProt: The universal protein knowledgebase }},
volume = {45},
year = {2017}
}
@misc{Beckett2014,
author = {Beckett, Dave},
howpublished = {W3C Recommendation},
title = {{ RDF/XML Syntax Specification }},
url = {https://www.w3.org/TR/REC-rdf-syntax/},
urldate = {2017-08-19},
year = {2014}
}
@article{Bellazzi2014,
abstract = {Big data are receiving an increasing attention in biomedicine and healthcare. It is therefore important to understand the reason why big data are assuming a crucial role for the biomedical informatics community. The capability of handling big data is becoming an enabler to carry out unprecedented research studies and to implement new models of healthcare delivery. Therefore, it is first necessary to deeply understand the four elements that constitute big data, namely Volume, Variety, Velocity, and Veracity, and their meaning in practice. Then, it is mandatory to understand where big data are present, and where they can be beneficially collected. There are research fields, such as translational bioinformatics, which need to rely on big data technologies to withstand the shock wave of data that is generated every day. Other areas, ranging from epidemiology to clinical care, can benefit from the exploitation of the large amounts of data that are nowadays available, from personal monitoring to primary care. However, building big data-enabled systems carries on relevant implications in terms of reproducibility of research studies and management of privacy and data access; proper actions should be taken to deal with these issues. An interesting consequence of the big data scenario is the availability of new software, methods, and tools, such as map-reduce, cloud computing, and concept drift machine learning algorithms, which will not only contribute to big data research, but may be beneficial in many biomedical informatics applications. The way forward with the big data opportunity will require properly applied engineering principles to design studies and applications, to avoid preconceptions or over-enthusiasms, to fully exploit the available technologies, and to improve data processing and data management regulations.},
author = {Bellazzi, R},
doi = {10.15265/IY-2014-0024},
file = {:Users/cthoyt/Dropbox/Mendeley/2014/Big data and biomedical informatics a challenging opportunity. - 2014 - Bellazzi.pdf:pdf},
issn = {2364-0502},
journal = {Yearbook of medical informatics},
keywords = {big data,cloud,data analytics,nosql,research reproducibility},
pages = {8--13},
pmid = {24853034},
title = {{ Big data and biomedical informatics: a challenging opportunity. }},
url = {http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=4287065{\&}tool=pmcentrez{\&}rendertype=abstract},
volume = {9},
year = {2014}
}
@article{Belleau2008,
author = {Belleau, Fran{\c{c}}ois and Nolin, Marc-Alexandre and Tourigny, Nicole and Rigault, Philippe and Morissette, Jean},
doi = {10.1016/j.jbi.2008.03.004},
file = {:Users/cthoyt/Dropbox/Mendeley/2008/Bio2RDF Towards a mashup to build bioinformatics knowledge systems - 2008 - Belleau et al.pdf:pdf},
issn = {15320464},
journal = {Journal of Biomedical Informatics},
keywords = {Data Integration,Folder - RDF and Semantic Web,Linked Biological Data,RDF},
language = {en},
mendeley-tags = {Data Integration,Folder - RDF and Semantic Web,Linked Biological Data,RDF},
month = {oct},
number = {5},
pages = {706--716},
shorttitle = {Bio2RDF},
title = {{ Bio2RDF: Towards a mashup to build bioinformatics knowledge systems }},
url = {http://linkinghub.elsevier.com/retrieve/pii/S1532046408000415},
volume = {41},
year = {2008}
}
@article{Bento2014,
abstract = {ChEMBL is an open large-scale bioactivity database (https://www.ebi.ac.uk/chembl), previously described in the 2012 Nucleic Acids Research Database Issue. Since then, a variety of new data sources and improvements in functionality have contributed to the growth and utility of the resource. In particular, more comprehensive tracking of compounds from research stages through clinical development to market is provided through the inclusion of data from United States Adopted Name applications; a new richer data model for representing drug targets has been developed; and a number of methods have been put in place to allow users to more easily identify reliable data. Finally, access to ChEMBL is now available via a new Resource Description Framework format, in addition to the web-based interface, data downloads and web services.},
author = {Bento, A. Patr{\'{i}}cia and Gaulton, Anna and Hersey, Anne and Bellis, Louisa J. and Chambers, Jon and Davies, Mark and Kr{\"{u}}ger, Felix A. and Light, Yvonne and Mak, Lora and McGlinchey, Shaun and Nowotka, Michal and Papadatos, George and Santos, Rita and Overington, John P.},
doi = {10.1093/nar/gkt1031},
file = {:Users/cthoyt/Dropbox/Mendeley/2014/The ChEMBL bioactivity database An update - 2014 - Bento et al.pdf:pdf},
issn = {03051048},
journal = {Nucleic Acids Research},
number = {D1},
pages = {1083--1090},
pmid = {24214965},
title = {{ The ChEMBL bioactivity database: An update }},
volume = {42},
year = {2014}
}
@article{Bernabo2014,
abstract = {Cellular signal transduction is a complex phenomenon, which plays a central role in cell surviving and adaptation. The great amount of molecular data to date present in literature, together with the adoption of high throughput technologies, on the one hand, made available to scientists an enormous quantity of information, on the other hand, failed to provide a parallel increase in the understanding of biological events. In this context, a new discipline arose, the systems biology, aimed to manage the information with a computational modeling-based approach. In particular, the use of biological networks has allowed the making of huge progress in this field. Here we discuss two possible application of the use of biological networks to explore cell signaling: the study of the architecture of signaling systems that cooperate in determining the acquisition of a complex cellular function (as it is the case of the process of activation of spermatozoa) and the organization of a single specific signaling systems expressed by different cells in different tissues (i.e. the endocannabinoid system). In both the cases we have found that the networks follow a scale free and small world topology, likely due to the evolutionary advantage of robustness against random damages, fastness and specific of information processing, and easy navigability.},
author = {Bernab{\`{o}}, Nicola and Barboni, Barbara and Maccarrone, Mauro},
doi = {10.1016/j.csbj.2014.09.002},
file = {:Users/cthoyt/Dropbox/Mendeley/2014/The biological networks in studying cell signal transduction complexity The examples of sperm capacitation and of endocannabinoid system.pdf:pdf},
isbn = {2001-0370},
issn = {20010370},
journal = {Computational and Structural Biotechnology Journal},
keywords = {Biological networks,Endocannabinoid system,Network topology,Signal transduction,Spermatozoa,Systems biology},
number = {18},
pages = {11--21},
pmid = {25379139},
publisher = {Elsevier B.V.},
title = {{ The biological networks in studying cell signal transduction complexity: The examples of sperm capacitation and of endocannabinoid system }},
url = {http://dx.doi.org/10.1016/j.csbj.2014.09.002},
volume = {11},
year = {2014}
}
@article{Blalock2011,
archivePrefix = {arXiv},
arxivId = {NIHMS150003},
author = {Blalock, Eric M. and Buechel, Heather M. and Popovic, Jelena and Geddes, James W. and Landfield, Philip W.},
doi = {10.1007/s12020-009-9266-z},
eprint = {NIHMS150003},
file = {:Users/cthoyt/Dropbox/Mendeley/2011/Microarray analyses of laser-captured hippocampus reveal distinct gray and white matter signatures associated with incipient Alzheimer.pdf:pdf},
isbn = {6176321972},
issn = {15378276},
number = {1},
pages = {62--70},
pmid = {1000000221},
title = {{ Microarray analyses of laser-captured hippocampus reveal distinct gray and white matter signatures associated with incipient Alzheimer's disease }},
volume = {37},
year = {2011}
}
@article{Bodenreider2008,
author = {Bodenreider, O},
file = {:Users/cthoyt/Dropbox/Mendeley/2008/Biomedical Ontologies in Action Role in Knowledge Management , Data Integration and Decision Support - 2008 - Bodenreider.pdf:pdf},
keywords = {biomedical ontologies,data integration,knowledge management},
pages = {67--79},
title = {{Biomedical Ontologies in Action: Role in Knowledge Management, Data Integration and Decision Support}},
year = {2008}
}
@misc{Bostock,
author = {Bostock, Mike},
title = {{ D3.js }},
url = {http://d3js.org}
}
@article{Catlett2013,
abstract = {BACKGROUND: Gene expression profiling and other genome-scale measurement technologies provide comprehensive information about molecular changes resulting from a chemical or genetic perturbation, or disease state. A critical challenge is the development of methods to interpret these large-scale data sets to identify specific biological mechanisms that can provide experimentally verifiable hypotheses and lead to the understanding of disease and drug action.$\backslash$n$\backslash$nRESULTS: We present a detailed description of Reverse Causal Reasoning (RCR), a reverse engineering methodology to infer mechanistic hypotheses from molecular profiling data. This methodology requires prior knowledge in the form of small networks that causally link a key upstream controller node representing a biological mechanism to downstream measurable quantities. These small directed networks are generated from a knowledge base of literature-curated qualitative biological cause-and-effect relationships expressed as a network. The small mechanism networks are evaluated as hypotheses to explain observed differential measurements. We provide a simple implementation of this methodology, Whistle, specifically geared towards the analysis of gene expression data and using prior knowledge expressed in Biological Expression Language (BEL). We present the Whistle analyses for three transcriptomic data sets using a publically available knowledge base. The mechanisms inferred by Whistle are consistent with the expected biology for each data set.$\backslash$n$\backslash$nCONCLUSIONS: Reverse Causal Reasoning yields mechanistic insights to the interpretation of gene expression profiling data that are distinct from and complementary to the results of analyses using ontology or pathway gene sets. This reverse engineering algorithm provides an evidence-driven approach to the development of models of disease, drug action, and drug toxicity.},
author = {Catlett, Natalie L and Bargnesi, Anthony J and Ungerer, Stephen and Seagaran, Toby and Ladd, William and Elliston, Keith O and Pratt, Dexter},
doi = {10.1186/1471-2105-14-340},
file = {:Users/cthoyt/Dropbox/Mendeley/2013/Reverse causal reasoning applying qualitative causal knowledge to the interpretation of high-throughput data. - 2013 - Catlett et al.pdf:pdf},
issn = {1471-2105},
journal = {BMC bioinformatics},
keywords = {Algorithms,Animals,Breast,Breast: cytology,Endothelium, Vascular,Endothelium, Vascular: cytology,Epithelial Cells,Epithelial Cells: cytology,Gene Expression Profiling,Gene Expression Profiling: methods,Genome, Human,High-Throughput Nucleotide Sequencing,High-Throughput Nucleotide Sequencing: methods,Histone-Lysine N-Methyltransferase,Histone-Lysine N-Methyltransferase: genetics,Humans,Insulin Resistance,Insulin Resistance: genetics,Knowledge Bases,Mice,Microarray Analysis,Molecular Probes,Molecular Probes: genetics,Nuclear Proteins,Nuclear Proteins: genetics},
number = {1},
pages = {340},
pmid = {24266983},
title = {{ Reverse causal reasoning: applying qualitative causal knowledge to the interpretation of high-throughput data. }},
url = {http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=4222496 { \& } tool=pmcentrez { \& } rendertype=abstract},
volume = {14},
year = {2013}
}
@article{Cerami2011,
abstract = {Pathway Commons (http://www.pathwaycommons.org) is a collection of publicly available pathway data from multiple organisms. Pathway Commons provides a web-based interface that enables biologists to browse and search a comprehensive collection of pathways from multiple sources represented in a common language, a download site that provides integrated bulk sets of pathway information in standard or convenient formats and a web service that software developers can use to conveniently query and access all data. Database providers can share their pathway data via a common repository. Pathways include biochemical reactions, complex assembly, transport and catalysis events and physical interactions involving proteins, DNA, RNA, small molecules and complexes. Pathway Commons aims to collect and integrate all public pathway data available in standard formats. Pathway Commons currently contains data from nine databases with over 1400 pathways and 687,000 interactions and will be continually expanded and updated.},
author = {Cerami, Ethan G. and Gross, Benjamin E. and Demir, Emek and Rodchenkov, Igor and Babur, {\"{O}}zg{\"{u}}n and Anwar, Nadia and Schultz, Nikolaus and Bader, Gary D. and Sander, Chris},
doi = {10.1093/nar/gkq1039},
file = {:Users/cthoyt/Dropbox/Mendeley/2011/Pathway Commons, a web resource for biological pathway data - 2011 - Cerami et al.pdf:pdf},
issn = {03051048},
journal = {Nucleic Acids Research},
number = {SUPPL. 1},
pages = {685--690},
pmid = {21071392},
title = {{ Pathway Commons, a web resource for biological pathway data }},
volume = {39},
year = {2011}
}
@article{Chindelevitch2012,
abstract = {MOTIVATION: The interpretation of high-throughput datasets has remained one of the central challenges of computational biology over the past decade. Furthermore, as the amount of biological knowledge increases, it becomes more and more difficult to integrate this large body of knowledge in a meaningful manner. In this article, we propose a particular solution to both of these challenges.$\backslash$n$\backslash$nMETHODS: We integrate available biological knowledge by constructing a network of molecular interactions of a specific kind: causal interactions. The resulting causal graph can be queried to suggest molecular hypotheses that explain the variations observed in a high-throughput gene expression experiment. We show that a simple scoring function can discriminate between a large number of competing molecular hypotheses about the upstream cause of the changes observed in a gene expression profile. We then develop an analytical method for computing the statistical significance of each score. This analytical method also helps assess the effects of random or adversarial noise on the predictive power of our model.$\backslash$n$\backslash$nRESULTS: Our results show that the causal graph we constructed from known biological literature is extremely robust to random noise and to missing or spurious information. We demonstrate the power of our causal reasoning model on two specific examples, one from a cancer dataset and the other from a cardiac hypertrophy experiment. We conclude that causal reasoning models provide a valuable addition to the biologist's toolkit for the interpretation of gene expression data.$\backslash$n$\backslash$nAVAILABILITY AND IMPLEMENTATION: R source code for the method is available upon request.},
author = {Chindelevitch, Leonid and Ziemek, Daniel and Enayetallah, Ahmed and Randhawa, Ranjit and Sidders, Ben and Brockel, Christoph and Huang, Enoch S.},
doi = {10.1093/bioinformatics/bts090},
issn = {13674803},
journal = {Bioinformatics},
pmid = {22355083},
title = {{ Causal reasoning on biological networks: Interpreting transcriptional changes }},
year = {2012}
}
@article{Chou2016,
abstract = {MicroRNAs (miRNAs) are small non-coding RNAs of approximately 22 nucleotides, which negatively regulate the gene expression at the post-transcriptional level. This study describes an update of the miRTarBase (http://miRTarBase.mbc.nctu.edu.tw/) that provides information about experimentally validated miRNA-target interactions (MTIs). The latest update of the miRTarBase expanded it to identify systematically Argonaute-miRNA-RNA interactions from 138 crosslinking and immunoprecipitation sequencing (CLIP-seq) data sets that were generated by 21 independent studies. The database contains 4966 articles, 7439 strongly validated MTIs (using reporter assays or western blots) and 348 007 MTIs from CLIP-seq. The number of MTIs in the miRTarBase has increased around 7-fold since the 2014 miRTarBase update. The miRNA and gene expression profiles from The Cancer Genome Atlas (TCGA) are integrated to provide an effective overview of this exponential growth in the miRNA experimental data. These improvements make the miRTarBase one of the more comprehensively annotated, experimentally validated miRNA-target interactions databases and motivate additional miRNA research efforts.},
author = {Chou, Chih Hung and Chang, Nai Wen and Shrestha, Sirjana and Hsu, Sheng Da and Lin, Yu Ling and Lee, Wei Hsiang and Yang, Chi Dung and Hong, Hsiao Chin and Wei, Ting Yen and Tu, Siang Jyun and Tsai, Tzi Ren and Ho, Shu Yi and Jian, Ting Yan and Wu, Hsin Yi and Chen, Pin Rong and Lin, Nai Chieh and Huang, Hsin Tzu and Yang, Tzu Ling and Pai, Chung Yuan and Tai, Chun San and Chen, Wen Liang and Huang, Chia Yen and Liu, Chun Chi and Weng, Shun Long and Liao, Kuang Wen and Hsu, Wen Lian and Huang, Hsien Da},
doi = {10.1093/nar/gkv1258},
file = {:Users/cthoyt/Dropbox/Mendeley/2016/miRTarBase 2016 Updates to the experimentally validated miRNA-target interactions database - 2016 - Chou et al.pdf:pdf},
issn = {1362-4962},
journal = {Nucleic Acids Research},
number = {D1},
pages = {D239--D247},
pmid = {26590260},
title = {{miRTarBase} 2016: Updates to the experimentally validated {miRNA}-target interactions database},
volume = {44},
year = {2016}
}
@article{Davidson1995,
abstract = {Scientific data of importance to biologists reside in a number of different data sources, such as GenBank, GSDB, SWISS-PROT, EMBL, and OMIM, among many others. Some of these data sources are conventional databases implemented using database management systems (DBMSs) and others are structured files maintained in a number of different formats (e.g., ASN.1 and ACE). In addition, software packages such as sequence analysis packages (e.g., BLAST and FASTA) produce data and can therefore be viewed as data sources. To counter the increasing dispersion and heterogeneity of data, different approaches to integrating these data sources are appearing throughout the bioinformatics community. This paper surveys the technical challenges to integration, classifies the approaches, and critiques the available tools and methodologies.},
author = {Davidson, Susan B. and Overton, G. Christian and Buneman, Peter},
doi = {10.1089/cmb.1995.2.557},
issn = {1066-5277},
journal = {Journal of Computational Biology},
keywords = {Chromosomes, Artificial, Yeast,Data Interpretation, Statistical,Database Management Systems,Databases, Factual,Humans,Mathematics,Models, Genetic,Molecular Biology,Polymerase Chain Reaction,Repetitive Sequences, Nucleic Acid,Sequence Tagged Sites,Software},
language = {eng},
number = {4},
pages = {557--572},
pmid = {8634908},
title = {Challenges in integrating biological data sources},
volume = {2},
year = {1995}
}
@article{Davis2017,
abstract = {The Comparative Toxicogenomics Database (CTD; http://ctdbase.org/) provides information about interactions between environmental chemicals and gene products and their relationships to diseases. Chemical-gene, chemical-disease and gene-disease interactions manually curated from the literature are integrated to generate expanded networks and predict many novel associations between different data types. CTD now contains over 15 million toxicogenomic relationships. To navigate this sea of data, we added several new features, including DiseaseComps (which finds comparable diseases that share toxicogenomic profiles), statistical scoring for inferred gene-disease and pathway-chemical relationships, filtering options for several tools to refine user analysis and our new Gene Set Enricher (which provides biological annotations that are enriched for gene sets). To improve data visualization, we added a Cytoscape Web view to our ChemComps feature, included color-coded interactions and created a 'slim list' for our MEDIC disease vocabulary (allowing diseases to be grouped for meta-analysis, visualization and better data management). CTD continues to promote interoperability with external databases by providing content and cross-links to their sites. Together, this wealth of expanded chemical-gene-disease data, combined with novel ways to analyze and view content, continues to help users generate testable hypotheses about the molecular mechanisms of environmental diseases.},
author = {Davis, Allan Peter and Grondin, Cynthia J. and Johnson, Robin J. and Sciaky, Daniela and King, Benjamin L. and McMorran, Roy and Wiegers, Jolene and Wiegers, Thomas C. and Mattingly, Carolyn J.},
doi = {10.1093/nar/gkw838},
file = {:Users/cthoyt/Dropbox/Mendeley/2017/The Comparative Toxicogenomics Database Update 2017 - 2017 - Davis et al.pdf:pdf},
issn = {1362-4962},
journal = {Nucleic Acids Research},
pmid = {27651457},
internal-note = {NOTE(review): pmid corrected from 23093600 (that PMID is the 2013 CTD update); 27651457 matches doi 10.1093/nar/gkw838 -- verify against PubMed},
number = {D1},
pages = {D972--D978},
title = {The {Comparative Toxicogenomics Database}: Update 2017},
volume = {45},
year = {2017}
}