-
Notifications
You must be signed in to change notification settings - Fork 27
/
fetch_papers.py
39 lines (36 loc) · 1.61 KB
/
fetch_papers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
'''
Author: doodhwala
Python3 script to fetch the top 100 papers
'''
import os, re, requests, codecs
# Root folder that receives one sub-folder per README section.
directory = 'papers'

# Characters that are stripped from headings and paper titles before they are
# used as path components (the set Windows rejects in file names, plus '^').
WIN_RESTRICTED_CHARS = re.compile(r'[\^\/\\\:\*\?\"<>\|]')

# The stars ensure you pick up only the top 100 papers (the bold titles).
# Modify the expression if you want to fetch all other papers as well.
PAPER_PATTERN = re.compile(r'\*\*(.*?)\*\*.*?\[\[pdf\]\]\((.*?)\)')

# Seconds to wait on a silent server before giving up on a download.
REQUEST_TIMEOUT = 60


def sanitize_name(name):
    '''Return `name` with every restricted file-name character removed.'''
    return WIN_RESTRICTED_CHARS.sub("", name)


def parse_paper_line(line):
    '''
    Extract a paper from one README line.

    Returns a `(title, url)` tuple, with the title already sanitized for use
    as a file name, or None when the line has no `[[pdf]]` link or no bold
    title in front of it.
    '''
    if '[[pdf]]' not in line:
        return None
    result = PAPER_PATTERN.search(line)
    if result is None:
        return None
    paper, url = result.groups()
    return sanitize_name(paper), url


def download_paper(url, target_path):
    '''
    Download `url` and store the response body at `target_path`.

    Request failures (connection errors, timeouts, HTTP error statuses) are
    printed and reported by returning False; nothing is written in that case.
    Returns True when the file was written.
    '''
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Without this an HTML error page would be saved as the PDF and the
        # auto-resume check would then skip the paper on every later run.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print("Error: {}".format(e))
        return False
    with open(target_path, 'wb') as pdf_file:
        pdf_file.write(response.content)
    return True


def main(readme_path='README.md', output_dir=directory):
    '''
    Fetch every bold-titled paper listed in the README.

    Each line containing `###` starts a new section folder below
    `output_dir`; each paper is saved as `<title>.pdf` in the folder of the
    most recent section. Papers whose file already exists are skipped, so an
    interrupted run can simply be restarted.
    '''
    os.makedirs(output_dir, exist_ok=True)

    with open(readme_path, encoding='utf-8') as readme:
        lines = readme.read().split('\n')

    # Papers listed before the first heading go straight into output_dir.
    section_path = output_dir
    for line in lines:
        if '###' in line:
            # The text after the first '###' is used as-is (only restricted
            # characters are dropped), so folder names match earlier runs.
            heading = sanitize_name(line.strip().split('###')[1])
            section_path = os.path.join(output_dir, heading)
            os.makedirs(section_path, exist_ok=True)

        parsed = parse_paper_line(line)
        if parsed is None:
            continue
        paper, url = parsed
        target_path = os.path.join(section_path, paper + '.pdf')
        # Auto - resume functionality
        if os.path.exists(target_path):
            continue
        print('Fetching', paper)
        download_paper(url, target_path)


if __name__ == '__main__':
    main()