-
Notifications
You must be signed in to change notification settings - Fork 15
/
adstex.py
578 lines (502 loc) · 17.7 KB
/
adstex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
"""
adstex: Automated generation of NASA ADS bibtex entries
from citation keys (identifiers, author+year) in your TeX source files.
Project website: https://github.com/yymao/adstex
The MIT License (MIT)
Copyright (c) 2015-2024 Yao-Yuan Mao (yymao)
http://opensource.org/licenses/MIT
"""
from __future__ import absolute_import, print_function
import os
import re
import sys
import warnings
from argparse import ArgumentParser
from builtins import input
from collections import defaultdict
from datetime import date
from shutil import copyfile
from joblib import Parallel, delayed
import ads
import bibtexparser
import packaging.version
import requests
try:
from urllib.parse import unquote
except ImportError:
from urllib import unquote
# Package version; main() compares it against the latest release listed on PyPI.
__version__ = "0.6.0"
# Two-digit year and century of today's date; _y2toy4 uses them to expand
# two-digit years found in citation keys.
_this_year = date.today().year % 100
_this_cent = date.today().year // 100
# A TeX comment: an unescaped "%" up to the end of the line. The lookahead
# requires a line break right after the comment, which is left in place.
_re_comment = re.compile(r"(?<!\\)%.*(?=[\r\n])")
# \bibliography{...} or \nobibliography{...} (optionally starred); group 1 is
# the text between the braces. Whitespace before "{" may not span a blank line.
_re_bib = re.compile(r"\\(?:no)?bibliography\*?(?:(?!\n{2,})\s)*{((?:(?!\n{2,})[^{}])+)}")
# \bibentry or a \cite-like command (optionally starred), followed by any number
# of [...] / <...> optional arguments and then {keys}; group 1 is the text
# between the braces. No part of a match may span a blank line.
# NOTE(review): the suffix class `[a-zA]` admits lowercase letters and a capital
# "A" only -- confirm that `[a-zA-Z]` was not intended.
_re_cite = re.compile(
r"\\(?:bibentry|[cC]ite[a-zA]{0,7})\*?(?:(?!\n{2,})\s)*(?:(?<!\\)[\[<](?:(?!\n{2,}).)*?(?<!\\)[\]>](?:(?!\n{2,})\s)*)*{((?:(?!\n{2,})[^{}])+)}",
re.S,
)
# "First author + year" citation key: group 1 is the leading run of letters,
# "-" and ":" (":" separates co-authors, see _split_authors); an optional
# separator may follow; group 2 is a two- or four-digit year.
_re_fayear = re.compile(r"([A-Za-z-:]+)(?:(?=[\W_])[^\s\d,]+)?((?:\d{2})?\d{2})")
# Patterns used by id2bibcode / extract_bibcode to spot an identifier inside a
# string, keyed by identifier type.
_re_id = {}
# DOI: "10." + at least four digits, optional ".digits" groups, "/" and a suffix.
_re_id["doi"] = re.compile(r"\b10\.\d{4,}(?:\.\d+)*\/(?:(?!['\"&<>])\S)+\b")
# Bibcode: four digits, one non-digit, 13 non-space characters and a final
# uppercase letter, "." or ":".
_re_id["bibcode"] = re.compile(r"\b\d{4}\D\S{13}[A-Z.:]\b")
# arXiv id: "NNNN.NNNN" / "NNNN.NNNNN", or "archive[.subject]/NNNNNNN".
_re_id["arxiv"] = re.compile(r"\b(?:\d{4}\.\d{4,5}|[a-z-]+(?:\.[A-Za-z-]+)?\/\d{7})\b")
# Surname particles that _match_name_prefix tries to split off a name written
# without spaces.
_name_prefix = (
"van",
"di",
"de",
"den",
"der",
"van de",
"van den",
"van der",
"von der",
)
# Longest first, so that multi-word particles are tried before their shorter
# beginnings (e.g. "van der" before "van").
_name_prefix = sorted(_name_prefix, key=len, reverse=True)
# global configs
# These defaults are overridden by main() according to the command-line flags:
# _DATABASE is inserted into the author/year ADS query, _DISABLE_SSL is read by
# the fixedAds*Query helpers, and _USE_COAUTHORS is read by _split_authors.
_DATABASE = "astronomy"
_DISABLE_SSL = False
_USE_COAUTHORS = False
def fixedAdsSearchQuery(*args, **kwargs):
    """Create an ``ads.SearchQuery`` with this module's session tweaks applied.

    All arguments are forwarded to ``ads.SearchQuery``. The "Content-Type"
    header is dropped from the query's session, and certificate verification
    is turned off on that session when the module-level ``_DISABLE_SSL`` flag
    is set.

    Returns:
        The configured query object.
    """
    query = ads.SearchQuery(*args, **kwargs)
    session = query.session
    session.headers.pop("Content-Type", None)
    if _DISABLE_SSL:
        session.verify = False
    return query
def fixedAdsExportQuery(*args, **kwargs):
    """Create an ``ads.ExportQuery``, honouring the ``_DISABLE_SSL`` flag.

    All arguments are forwarded to ``ads.ExportQuery``. When the module-level
    ``_DISABLE_SSL`` flag is set, certificate verification is turned off on
    the query's session.

    Returns:
        The configured query object.
    """
    query = ads.ExportQuery(*args, **kwargs)
    if not _DISABLE_SSL:
        return query
    query.session.verify = False
    return query
def get_bparser():
    """Return a new ``BibTexParser`` configured the way adstex reads bib files.

    The parser is created with ``common_strings=True`` and
    ``ignore_nonstandard_types=False``, and the string ``june`` is defined as
    "June" in its database (presumably because the common strings do not
    cover that spelling -- confirm against bibtexparser).
    """
    parser = bibtexparser.bparser.BibTexParser(
        ignore_nonstandard_types=False,
        common_strings=True,
    )
    known_strings = parser.bib_database.strings
    known_strings["june"] = "June"
    return parser
def _match_name_prefix(name):
    """Split a known surname particle off a name written without spaces.

    The particles in ``_name_prefix`` are tried in order with their internal
    spaces removed, case-insensitively. On the first hit the particle (as
    listed) and the rest of ``name`` are returned joined by one space.

    Returns:
        The re-spaced name, or ``None`` when no particle matches.
    """
    lowered = name.lower()
    for prefix in _name_prefix:
        squeezed = prefix.replace(" ", "")
        if not lowered.startswith(squeezed):
            continue
        remainder = name[len(squeezed):]
        return " ".join((prefix, remainder))
    return None
def _y2toy4(y2):
    """Expand a two-digit year to a four-digit year string.

    A value greater than the current two-digit year is placed in the previous
    century; anything else is placed in the current century.
    """
    year = int(y2)
    century = _this_cent - 1 if year > _this_year else _this_cent
    return str(century * 100 + year)
def _split_authors(fa):
    """Split the author part of a citation key on ":".

    Leading and trailing colons are ignored.

    Returns:
        ``(first_author, coauthors)``. ``coauthors`` is the list of remaining
        names when the module-level ``_USE_COAUTHORS`` flag is set and at
        least one is present; otherwise it is ``None``.
    """
    names = fa.strip(':').split(':')
    first_author = names[0]
    coauthors = names[1:]
    if not (_USE_COAUTHORS and coauthors):
        return first_author, None
    return first_author, coauthors
def _is_like_string(s):
try:
s + ""
except TypeError:
return False
return True
def _headerize(msg, extraline=True):
return "{2}{0}\n{1}\n{0}".format("-" * 60, msg, "\n" if extraline else "")
def search_keys(files, find_bib=False):
    """Collect citation keys (and optionally bib file names) from TeX sources.

    TeX comments are removed from each file before it is scanned.

    Args:
        files: A path, or an iterable of paths, of TeX files to read.
        find_bib: When true, also look for a ``\\bibliography{...}`` command;
            the first file that names at least one bib file decides the
            result.

    Returns:
        ``(keys, bib)``. ``keys`` is the set of citation keys found, stripped
        of surrounding whitespace. ``bib`` is ``None`` when no bibliography
        command was looked for or found; otherwise it is a list of bib file
        paths, each joined to the directory of the TeX file that named it and
        given a ``.bib`` suffix if it lacked one.
    """
    if _is_like_string(files):
        files = [files]
    bib = None
    keys = set()
    for f in files:
        with open(f) as fp:
            text = fp.read()
        text = _re_comment.sub("", text)
        if find_bib and not bib:
            m = _re_bib.search(text)
            if m:
                dirpath = os.path.dirname(f)
                bib = []
                for b in m.group(1).split(","):
                    b = b.strip()
                    if not b:
                        # A stray comma ("refs,") must not yield a bare ".bib" path.
                        continue
                    if not b.lower().endswith(".bib"):
                        b += ".bib"
                    bib.append(os.path.join(dirpath, b))
        for m in _re_cite.finditer(text):
            for k in m.group(1).split(","):
                k = k.strip()
                if k:
                    # Skip empty items ("\cite{a,}") so that no empty key is
                    # carried into the lookup stage.
                    keys.add(k)
    return keys, bib
def format_author(authors, max_char):
    """Join author names with "; " into a string of limited width.

    Names are appended in order while the running length stays within the
    budget (``max_char`` minus room for the " et al." suffix). At the first
    name that does not fit, " et al." is appended and the rest are dropped.
    The first author is always shown in full.

    Args:
        authors: Sequence of author names; may be empty or ``None``.
        max_char: Target maximum width of the result.

    Returns:
        The formatted author string, or "<no author>" when there are no
        authors.
    """
    if not authors:
        # Mirrors the "<no title>" placeholder used by format_ads_entry, so a
        # record without authors does not raise on authors[0].
        return u"<no author>"
    shown = authors[0]
    for author in authors[1:]:
        if len(shown) + len(author) + 2 >= max_char - 7:
            return shown + u" et al."
        shown = u"{}; {}".format(shown, author)
    return shown
def format_ads_entry(i, entry, max_char=78):
    """Render one ADS search result as a three-line menu item.

    Args:
        i: Number shown in brackets in front of the entry.
        entry: Object providing ``title``, ``bibcode``, ``citation_count``
            and ``author`` attributes.
        max_char: Line width; authors and title are limited to four
            characters less.

    Returns:
        The entry text: a header line with number, bibcode and citation
        count, then the author line, then the title line ("<no title>" when
        the entry has no title).
    """
    width = max_char - 4
    if entry.title:
        title = entry.title[0][:width]
    else:
        title = "<no title>"
    template = u"[{}] {} (cited {} times)\n {}\n {}"
    return template.format(
        i,
        entry.bibcode,
        entry.citation_count,
        format_author(entry.author, width),
        title,
    )
def id2bibcode(id_this, possible_id_types=("bibcode", "doi", "arxiv")):
    """Resolve an identifier embedded in ``id_this`` to an ADS bibcode.

    Each identifier type is tried in the given order: when its pattern from
    ``_re_id`` is found in ``id_this``, ADS is queried for that identifier
    and the bibcode of the first result is returned. A query that yields no
    result or raises an ADS API response error is skipped and the next type
    is tried.

    Args:
        id_this: Text that may contain an identifier.
        possible_id_types: One type name, or an iterable of type names, out
            of the keys of ``_re_id``.

    Returns:
        The bibcode, or ``None`` if nothing could be resolved.
    """
    id_types = possible_id_types
    if _is_like_string(id_types):
        id_types = [id_types]
    for id_type in id_types:
        found = _re_id[id_type].search(id_this)
        if not found:
            continue
        query = fixedAdsSearchQuery(
            q="identifier:\"{}\"".format(found.group()),
            fl=["bibcode"],
        )
        try:
            return next(query).bibcode
        except (StopIteration, ads.exceptions.APIResponseError):
            continue
    return None
def authoryear2bibcode(author, year, key, coauthors=None):
    """Let the user pick the ADS entry matching a first author and a year.

    ADS is searched for entries by ``author`` (as first author) in ``year``
    within the configured database, optionally restricted to entries that
    also list every name in ``coauthors``. Up to 20 results, sorted by
    citation count, are printed with the most cited entry last (numbered 1),
    and the user is prompted until a valid answer is given.

    If the search returns nothing and ``author`` contains no space, the
    search is retried once with a surname particle split off (see
    ``_match_name_prefix``), keeping the same co-author restriction.

    Args:
        author: First author's name.
        year: Four-digit publication year.
        key: The citation key being resolved (used in the prompts only).
        coauthors: Optional list of co-author names.

    Returns:
        The chosen bibcode, the bibcode of an identifier typed at the prompt,
        or ``None`` when the user enters 0 or nothing was found.
    """
    # Keep `coauthors` itself untouched: the prefix retry below needs the list.
    coauthor_terms = (
        " ".join('author:"{}"'.format(name) for name in coauthors) if coauthors else ""
    )
    q = 'first_author:"{}" {} year:{} database:{}'.format(
        author, coauthor_terms, year, _DATABASE
    )
    entries = list(
        fixedAdsSearchQuery(
            q=q,
            fl=["id", "author", "bibcode", "title", "citation_count"],
            sort="citation_count desc",
            rows=20,
            max_pages=0,
        )
    )
    if entries:
        total = len(entries)
        print(
            _headerize(
                "Choose one entry from below for <{}> (most cited at the end)".format(
                    key
                )
            )
        )
        print(
            u"\n\n".join(
                format_ads_entry(total - i, e) for i, e in enumerate(reversed(entries))
            )
        )
        print(
            _headerize(
                "Choose one entry from above for <{}>".format(key),
                extraline=False,
            )
        )
        choices = list(range(0, total + 1))
        c = -1
        while c not in choices:
            c = input(
                "ENTER choice (if no matches, ENTER 0 to skip or ENTER an identifier): "
            )
            # An identifier typed at the prompt takes precedence over a number.
            bibcode = id2bibcode(c)
            if bibcode:
                return bibcode
            try:
                c = int(c)
            except (TypeError, ValueError):
                # Not a number: `c` stays a string and the prompt repeats.
                pass
        if not c:
            return None
        return entries[c - 1].bibcode
    if " " not in author:
        new_author = _match_name_prefix(author)
        if new_author:
            # Forward the co-authors so the retry is no broader than the
            # original search.
            return authoryear2bibcode(new_author, year, key, coauthors=coauthors)
    return None
def find_bibcode_interactive(key):
    """Resolve a citation key to a bibcode with the user's help.

    If the key starts like "author + year" (``_re_fayear``), an author/year
    search with interactive selection is tried first. Failing that, the user
    is asked to type an identifier; the prompt repeats until an identifier
    resolves or an empty line is entered.

    Returns:
        The bibcode, or ``None`` if the user skipped the key.
    """
    parsed = _re_fayear.match(key)
    if parsed:
        author_part, year = parsed.groups()
        first_author, coauthors = _split_authors(author_part)
        if len(year) == 2:
            year = _y2toy4(year)
        bibcode = authoryear2bibcode(first_author, year, key, coauthors=coauthors)
        if bibcode:
            return bibcode
    print(_headerize("ENTER an identifier (bibcode, arxiv, doi) for <{}>".format(key)))
    while True:
        answer = input("Identifier (or press ENTER to skip): ")
        bibcode = id2bibcode(answer)
        if bibcode:
            return bibcode
        if not answer:
            return None
def extract_bibcode(entry):
    """Return the bibcode contained in an entry's ``adsurl`` field.

    The field value is URL-unquoted before the bibcode pattern is applied.
    No ADS query is made.

    Returns:
        The bibcode text, or ``None`` if the field is missing or holds no
        bibcode.
    """
    adsurl = unquote(entry.get("adsurl", ""))
    found = _re_id["bibcode"].search(adsurl)
    return found.group() if found else None
def entry2bibcode(entry):
    """Look up the current ADS bibcode for a bibtex entry.

    The entry's fields are inspected in a fixed order (``adsurl``, ``doi``,
    ``eprint``, ``url``, ``pages``), each for the identifier types it may
    carry. The first field whose URL-unquoted value resolves through
    ``id2bibcode`` decides the result.

    Returns:
        The bibcode, or ``None`` if no field resolves.
    """
    lookup_plan = (
        ("adsurl", "bibcode"),
        ("doi", "doi"),
        ("eprint", "arxiv"),
        ("url", ("bibcode", "doi", "arxiv")),
        ("pages", "arxiv"),
    )
    for field_name, id_types in lookup_plan:
        if field_name not in entry:
            continue
        bibcode = id2bibcode(unquote(entry[field_name]), id_types)
        if bibcode:
            return bibcode
    return None
def update_bib(b1, b2):
    """Merge the entries of ``b2`` into ``b1`` and return ``b1``.

    Entries are keyed by their ``'ID'``. An entry of ``b2`` replaces an entry
    of ``b1`` with the same ID (keeping the latter's position); new IDs are
    appended. ``b1.entries`` is replaced by the merged list, and
    ``b1._entries_dict`` is emptied if it exists.
    """
    merged = {}
    for source in (b1, b2):
        for entry in source.entries:
            merged[entry['ID']] = entry
    b1.entries = list(merged.values())
    try:
        # Empty the ID lookup held by b1 so it no longer reflects the old list.
        b1._entries_dict.clear()
    except AttributeError:
        pass
    return b1
def main():
    """Command-line entry point of adstex.

    Steps, in order:

    1. Parse the command line; unless ``--ignore-env-args`` is given, the
       arguments in the ``ADSTEX_ARGS`` environment variable are appended and
       the command line is parsed again.
    2. Apply the global switches (database, SSL verification, co-authors).
    3. Determine the citation keys and the bib files: either "bib update
       mode" (the single input is a ``.bib`` file, all of its keys are
       used), an explicit ``--output``, or auto-detection from the
       ``\\bibliography`` command in the TeX sources.
    4. Classify every key (update / existing / merge / new / needs user
       input), optionally in parallel threads, then resolve the remaining
       keys interactively.
    5. Export the required entries from ADS, merge them into the main bib
       database and write it out (after making a ``.bak`` copy unless
       ``--no-backup`` is given). The file is also written when entries were
       only merged from the other bib files.
    6. Best-effort check on PyPI for a newer adstex release.
    """
    global _DATABASE, _DISABLE_SSL, _USE_COAUTHORS

    parser = ArgumentParser()
    parser.add_argument(
        "files", metavar="TEX", nargs="+", help="tex files to search citation keys"
    )
    parser.add_argument(
        "-o",
        "--output",
        metavar="BIB",
        help="main bibtex file; new entries will be added to this file, existing entries may be updated",
    )
    parser.add_argument(
        "-r",
        "--other",
        nargs="+",
        metavar="BIB",
        help="other bibtex files that contain existing references (read-only)",
    )
    parser.add_argument(
        "--no-update",
        dest="update",
        action="store_false",
        help="for existing entries, do not check ADS for updates",
    )
    parser.add_argument(
        "--force-regenerate",
        action="store_true",
        help="for all existing entries, regenerate the bibtex with the latest version from ADS if found",
    )
    parser.add_argument(
        "--merge-other",
        action="store_true",
        help="merge the entries from other bibtex files",
    )  # thanks to syrte for adding this option
    parser.add_argument(
        "--include-physics",
        action="store_true",
        help="include physics database when searching ADS",
    )
    parser.add_argument(
        "--no-backup",
        dest="backup",
        action="store_false",
        help="do not back up the output file before overwriting it",
    )
    parser.add_argument(
        "--disable-ssl-verification",
        action="store_true",
        help="disable SSL verification (it will render your API key vulnerable)",
    )
    parser.add_argument(
        "--use-coauthors",
        action="store_true",
        help="include coauthors (in the format of 'fa:ca1:ca2:year') in ADS search",
    )  # thanks to birnstiel for making this suggestion
    parser.add_argument(
        "--parallel",
        "-P",
        "-p",
        action="store_true",
        help="enable parallel ADS update queries",
    )  # thanks to dwijn for adding this option
    parser.add_argument(
        "--threads",
        default=8,
        type=int,
        help="specify the number of threads used when --parallel is set (default: 8)",
    )  # thanks to dwijn for adding this option
    parser.add_argument(
        "--ignore-env-args",
        action="store_true",
        help="ignore the arguments set in ADSTEX_ARGS environment variable",
    )  # thanks to birnstiel for making this suggestion
    parser.add_argument(
        "--version",
        action="version",
        version="%(prog)s {version}".format(version=__version__),
    )

    # The first parse is needed to learn whether the environment arguments
    # are to be ignored; if not, parse again with them appended.
    args = parser.parse_args()
    env_args = os.getenv("ADSTEX_ARGS")
    if env_args and not args.ignore_env_args:
        args = parser.parse_args(sys.argv[1:] + env_args.strip().split())

    if args.include_physics:
        _DATABASE = '("astronomy" OR "physics")'

    if args.disable_ssl_verification:
        ans = input("You have chosen to disable SSL verification. This will render your API key vulnerable. Do you want to continue? [y/N] ")
        if ans in ("y", "Y", "yes", "Yes", "YES"):
            _DISABLE_SSL = True
            warnings.filterwarnings("ignore", "Unverified HTTPS request is being made", Warning)
        else:
            print("OK, abort!")
            return

    if args.use_coauthors:
        _USE_COAUTHORS = True

    if len(args.files) == 1 and args.files[0].lower().endswith(".bib"):  # bib update mode
        if args.output or args.other:
            parser.error(
                "Input file is a bib file, not tex file. This will enter bib update mode. Do not specify `--output` or `--other` in this mode."
            )
        if not args.update:
            parser.error(
                "Input file is a bib file, not tex file. This will enter bib update mode. Must not specify --no-update"
            )
        if not os.path.isfile(args.files[0]):
            parser.error("Cannot locate input bib file {}".format(args.files[0]))
        keys = None  # filled in from the bib file once it has been loaded
        args.output = args.files[0]
    elif args.output:  # bib output is specified
        keys, _ = search_keys(args.files, find_bib=False)
    else:  # bib output is missing, auto-identify
        keys, bib = search_keys(args.files, find_bib=True)
        if not bib:
            parser.error(
                "Cannot identify bibtex file from the tex source. Use -o to specify a bibtex file as output."
            )
        # The first bib file named in the TeX source becomes the output file;
        # any further ones are treated as read-only sources.
        args.output = bib.pop(0)
        if args.other:
            args.other.extend(bib)
        else:
            args.other = bib
        msg = "Auto-identifying bibtex files...\n"
        msg += "Main bibtex source (output file): {}\n".format(args.output)
        if args.other:
            msg += "Additional bibtex sources: {}\n".format(", ".join(args.other))
        print(_headerize(msg))

    if os.path.isfile(args.output):
        with open(args.output) as fp:
            bib = bibtexparser.load(fp, parser=get_bparser())
    else:
        bib = bibtexparser.loads(" ", parser=get_bparser())

    bib_other = bibtexparser.loads(" ", parser=get_bparser())
    if args.other:
        for f in args.other:
            with open(f) as fp:
                bib_other = update_bib(
                    bib_other, bibtexparser.load(fp, parser=get_bparser())
                )

    if keys is None:  # bib update mode
        keys = list(bib.entries_dict)

    interactive = set()  # keys that need the user's help
    not_found = set()  # keys the user could not resolve either
    to_retrieve = set()  # bibcodes whose bibtex must be exported from ADS
    all_entries = defaultdict(list)  # bibcode -> citation keys referring to it
    merged = []  # keys copied from the other bib files into the main one

    def update(key):
        """Classify one citation key and record what has to be done for it."""
        key_exists = key in bib.entries_dict
        key_exists_in_others = key in bib_other.entries_dict

        if args.update:
            if key_exists:
                bibcode = extract_bibcode(bib.entries_dict[key])
                bibcode_new = entry2bibcode(bib.entries_dict[key])
            elif key_exists_in_others and args.merge_other:
                bibcode = extract_bibcode(bib_other.entries_dict[key])
                bibcode_new = entry2bibcode(bib_other.entries_dict[key])
            else:
                bibcode_new = None
            if bibcode_new:
                all_entries[bibcode_new].append(key)
                if bibcode_new != bibcode or args.force_regenerate:
                    to_retrieve.add(bibcode_new)
                    print(
                        "{}:{} UPDATE => {}".format(
                            key,
                            "" if key_exists else " FOUND IN SECONDARY BIB SOURCES,",
                            bibcode_new,
                        )
                    )
                    return

        if key_exists:
            print("{}: EXISTING".format(key))
            return

        if key_exists_in_others and args.merge_other:
            bib.entries.append(bib_other.entries_dict[key])
            merged.append(key)
            print("{}: FOUND IN OTHER BIB SOURCE, MERGED".format(key))
            return

        if key_exists_in_others:
            print("{}: FOUND IN OTHER BIB SOURCE, IGNORED".format(key))
            return

        bibcode = id2bibcode(key)
        if bibcode:
            to_retrieve.add(bibcode)
            all_entries[bibcode].append(key)
            print("{}: NEW ENTRY => {}".format(key, bibcode))
            return

        # if all above failed
        interactive.add(key)

    if args.parallel:
        Parallel(n_jobs=args.threads, prefer="threads")(delayed(update)(key) for key in keys)
    else:
        for key in keys:
            update(key)

    if interactive:
        print(_headerize("Resolving keys that do not contain identifiers..."))
        for key in interactive:
            bibcode = find_bibcode_interactive(key)
            if bibcode:
                to_retrieve.add(bibcode)
                all_entries[bibcode].append(key)
                print("{}: NEW ENTRY => {}".format(key, bibcode))
            else:
                not_found.add(key)
                print("{}: NOT FOUND".format(key))

    if not_found:
        print(_headerize("Please check the following keys"))
        for key in not_found:
            print(key)

    repeated_keys = [t for t in all_entries.items() if len(t[1]) > 1]
    if repeated_keys:
        print(_headerize("The following keys refer to the same entry"))
        for b, k in repeated_keys:
            print(
                "{1} has been referred as the following keys; please keep only one:\n{0}\n".format(
                    " ".join(k), b
                )
            )

    if to_retrieve:
        print(_headerize("Building new bibtex file, please wait..."))
        bib_new = bibtexparser.loads(
            fixedAdsExportQuery(list(to_retrieve), "bibtex").execute(), parser=get_bparser()
        )
        for entry in bib_new.entries:
            print(entry["ID"])
            # Exported entries carry the bibcode as ID; rename each to the
            # first citation key that refers to it. An ID that was not
            # requested is left unchanged instead of raising IndexError.
            keys_for_entry = all_entries.get(entry["ID"])
            if keys_for_entry:
                entry["ID"] = keys_for_entry[0]
        bib = update_bib(bib, bib_new)

    if to_retrieve or merged:
        # Write also when entries were only merged from the other bib files;
        # otherwise those merges would be lost.
        bib_dump_str = bibtexparser.dumps(bib).encode("utf8")
        if args.backup and os.path.isfile(args.output):
            copyfile(args.output, args.output + ".bak")
        with open(args.output, "wb") as fp:
            fp.write(bib_dump_str)
    else:
        print('Nothing to write/update.')

    print(_headerize("Done!"))

    # check version
    # Best effort only: any network or parsing problem is ignored on purpose,
    # and the short timeout keeps the tool from hanging on it.
    try:
        latest_version = packaging.version.parse(
            requests.get(
                "https://pypi.python.org/pypi/adstex/json", timeout=0.1,
            ).json()["info"]["version"]
        )
    except (requests.RequestException, KeyError, ValueError):
        pass
    else:
        if latest_version > packaging.version.parse(__version__):
            msg = "A newer version of adstex (v{}) is now available!\n".format(
                latest_version
            )
            msg += "Please consider updating it by running:\n\n"
            msg += "pip install adstex=={}".format(latest_version)
            print(_headerize(msg))
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C (also at one of the interactive prompts): exit with a short
        # notice instead of a traceback.
        print(_headerize("Abort! adstex interrupted by a keyboard signal!"))