items.py
import logging
import os
import re
import subprocess

from factories import Factory


class Item:
    """A single download item: a provider yields URLs, a downloader fetches
    them into dest_dir, and a cache remembers URLs already processed."""

    default_cache_spec = {
        "type": "NullCache"
    }

    def __init__(self, spec,
                 downloader_factory=Factory("downloaders"),
                 provider_factory=Factory("providers"),
                 cache_factory=Factory("caches")):
        self.downloader_factory = downloader_factory
        self.provider_factory = provider_factory
        self.cache_factory = cache_factory
        self.cache = self._init_cache(spec)
        self.provider = self._init_provider(spec)
        self.downloader = self._init_downloader(spec)
        self.name = spec.get("name")
        self.dest_dir = spec.get("dest_dir", ".")
        self.global_pre_script = self._parse_script(spec.get("global_pre_script"))
        self.global_post_script = self._parse_script(spec.get("global_post_script"))
        self.pre_download_script = self._parse_script(spec.get("pre_download_script"))
        self.post_download_script = self._parse_script(spec.get("post_download_script"))
        self.enabled = spec.get("enabled", True)

    @staticmethod
    def _parse_script(script):
        # Accept either a pre-split argument list or a whitespace-separated string.
        if script is not None and not isinstance(script, list):
            return re.split(r"\s+", script)
        return script

    def _init_downloader(self, spec):
        downloader_spec = spec["downloader"]
        return self.downloader_factory.create(downloader_spec)

    def _init_provider(self, spec):
        provider_spec = spec["provider"]
        return self.provider_factory.create(provider_spec)

    def _init_cache(self, spec):
        cache_spec = spec.get("cache", self.default_cache_spec)
        return self.cache_factory.create(cache_spec)

    def _filter_urls_in_cache(self, urls):
        return [url for url in urls if url not in self.cache]

    def get_urls_to_download(self):
        urls = self.provider.get_urls()
        return self._filter_urls_in_cache(urls)

    def download_new_elements(self, skip_download=False):
        if not self.enabled:
            return
        try:
            urls_to_download = self.get_urls_to_download()
            self._run_script(self.global_pre_script)
            for url in urls_to_download:
                try:
                    self._run_script(self.pre_download_script, AUTODOWNLOADER_URL=url)
                    file_name = self.downloader.download(url, self.dest_dir, skip_download)
                    self._run_script(self.post_download_script,
                                     AUTODOWNLOADER_URL=url,
                                     AUTODOWNLOADER_FILENAME=file_name)
                    self.cache.store(url)
                    self.cache.save()
                except Exception as e:
                    # A failure on one URL must not stop the remaining downloads.
                    logging.exception(e)
            self._run_script(self.global_post_script)
        except Exception as e:
            logging.exception(e)

    def _run_script(self, script, **extra_env):
        if script is None:
            return
        env = self._extend_environment(**extra_env)
        script_expanded = self._expand_variables(script, env)
        return subprocess.call(script_expanded, cwd=self.dest_dir, env=env)

    @staticmethod
    def _expand_variables(script, env):
        # os.path.expandvars only reads os.environ, so temporarily patch it
        # with the extended environment and restore it afterwards.
        env_copy = os.environ.copy()
        os.environ.update(env)
        script_expanded = [os.path.expandvars(fragment) for fragment in script]
        os.environ = env_copy
        return script_expanded

    @staticmethod
    def _extend_environment(**extra_env):
        extended_env = os.environ.copy()
        extended_env.update(extra_env)
        return extended_env
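

# Illustrative only: a hedged sketch of the spec mapping that Item.__init__
# consumes, based on the keys read above. The concrete "type" values inside
# the downloader/provider/cache sub-specs depend on what the respective
# Factory registries contain; the names used here are assumptions.
#
#     spec = {
#         "name": "my-feed",
#         "dest_dir": "downloads",
#         "enabled": True,
#         "provider": {"type": "SomeProvider"},       # hypothetical type name
#         "downloader": {"type": "SomeDownloader"},   # hypothetical type name
#         "cache": {"type": "NullCache"},             # optional; this is the default
#         "post_download_script": "echo $AUTODOWNLOADER_FILENAME",
#     }
#
# Scripts may be given either as a single string (split on whitespace) or as
# an argument list; they run in dest_dir with AUTODOWNLOADER_URL and, for the
# post-download script, AUTODOWNLOADER_FILENAME added to their environment.
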
class LoggingItem(Item):
    """Item subclass that logs progress, script results and cache filtering."""

    def __init__(self, spec, *args, **kwargs):
        super().__init__(spec, *args, **kwargs)

    def download_new_elements(self, skip_download=False):
        logging.info("start processing item '{}'".format(self.name))
        if not self.enabled:
            logging.info("{} is disabled".format(self.name))
        super().download_new_elements(skip_download=skip_download)
        logging.info("end processing item '{}'".format(self.name))

    def _run_script(self, script, **extra_env):
        if script is None:
            return
        script_str = " ".join(script)
        logging.info("running script {}".format(script_str))
        return_code = super()._run_script(script, **extra_env)
        if return_code != 0:
            logging.warning("script {} failed, return code was: {}".format(script_str, return_code))
        else:
            logging.info("script {} terminated with return code 0".format(script_str))

    def _filter_urls_in_cache(self, urls):
        logging.info("filtering urls excluding the cached ones...")
        urls = super()._filter_urls_in_cache(urls)
        if len(urls) == 0:
            logging.info("no new urls found: nothing to be done")
        else:
            logging.info("{} urls remaining after the filtering".format(len(urls)))
            logging.debug(urls)
        return urls
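

# Minimal usage sketch, assuming factory registrations exist for the
# hypothetical "SomeProvider"/"SomeDownloader" types shown above; substitute
# real type names from the downloaders/providers registries before running.
#
#     if __name__ == "__main__":
#         logging.basicConfig(level=logging.INFO)
#         spec = {
#             "name": "example",
#             "dest_dir": ".",
#             "provider": {"type": "SomeProvider"},      # hypothetical
#             "downloader": {"type": "SomeDownloader"},  # hypothetical
#         }
#         item = LoggingItem(spec)
#         item.download_new_elements(skip_download=True)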