forked from pascalweiss/LSFEventScraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
LSFEventParser.py
105 lines (86 loc) · 4.1 KB
/
LSFEventParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from LSFEventType import LSFEventType
__author__ = 'pascal'
from datetime import datetime
from bs4 import BeautifulSoup
from LSFEvent import LSFEvent
from LSFTextUtils import LSFTextUtils
class LSFEventParser:
    """Turn one LSF overview HTML page into a list of ``LSFEvent`` objects.

    The page is parsed once in ``__init__``; ``extract_events`` then reads
    the page date, selects the table rows and converts every row with the
    extractor that matches the configured event type.
    """

    def __init__(self, html, event_type=LSFEventType.normal_event):
        """Parse *html* and remember which kind of page it is.

        :param html: markup handed straight to ``BeautifulSoup``.
        :param event_type: ``LSFEventType.normal_event`` selects the
            normal-event layout; any other value selects the
            cancelled-event layout.
        """
        # NOTE(review): no parser is named here, so Beautiful Soup picks
        # whichever one is installed. The fixed row/cell offsets used below
        # were presumably tuned against one specific parser -- confirm
        # before pinning one explicitly.
        self.dom = BeautifulSoup(html, from_encoding='utf-8')
        self._event_type = event_type

    def extract_events(self):
        """Return the events found on the page.

        Rows that fail to parse are reported on stdout (page date followed
        by the exception) and skipped, so one malformed row does not abort
        the whole page.

        :return: list of ``LSFEvent`` objects, in row order.
        """
        events = []
        date_str = self.extract_date()
        print(date_str)

        # Choose the row extractor once instead of re-testing per row.
        if self._event_type == LSFEventType.normal_event:
            extract = LSFEventParser.extract_normal_event
        else:
            extract = LSFEventParser.extract_cancelled_event

        for row in self.extract_rows():
            try:
                event = extract(row, date_str)
            except Exception as e:
                # Deliberate best-effort boundary: log and continue with the
                # next row. Both layouts report the date so a failure can be
                # traced back to its page.
                print('Exception: ' + date_str)
                print(e)
            else:
                events.append(event)
        return events

    def extract_date(self):
        """Return the date string cut out of the page heading.

        The heading is the first ``td`` matched by
        ``find('td', 'hd_darkgreen')``. A fixed 10-character slice is taken
        from its text; the offset depends on the event type. The result is
        later parsed with ``'%d.%m.%Y'``.

        :raises AttributeError: if the heading cell is not present.
        """
        date_tag = self.dom.find('td', 'hd_darkgreen')
        # Compared with ``==`` to match the check in ``extract_events``.
        if self._event_type == LSFEventType.normal_event:
            return date_tag.text[29:39]
        return date_tag.text[41:51]

    def extract_rows(self):
        """Return the ``tr`` tags that are treated as event rows.

        All ``tr`` tags of the first ``table`` matched by
        ``find('table', 'inside')`` are collected; the first seven and the
        last one are dropped (presumably header and footer rows -- verify
        against a real page).

        :raises AttributeError: if the table is not present.
        """
        inside = self.dom.find('table', 'inside')
        tr = inside.find_all('tr')
        return tr[7:-1]

    @staticmethod
    def _fill_common_fields(event, td_tags, date_str):
        """Set begin, end, id, title and event link from cells 0-3."""
        # Only the begin cell is passed through ``remove_spaces``; the end
        # cell goes to ``correct_time_string`` unchanged, as before.
        begin_str = LSFTextUtils.correct_time_string(
            LSFTextUtils.remove_spaces(td_tags[0].text))
        end_str = LSFTextUtils.correct_time_string(td_tags[1].text)
        event.begin = datetime.strptime(date_str + ' ' + begin_str, '%d.%m.%Y %H:%M')
        event.end = datetime.strptime(date_str + ' ' + end_str, '%d.%m.%Y %H:%M')
        event.id = td_tags[2].text
        event.title = LSFTextUtils.remove_new_line_and_tab(td_tags[3].text)
        event.event_link = td_tags[3].find('a')['href']

    @staticmethod
    def extract_normal_event(row, date_str=''):
        """Build an ``LSFEvent`` from one row of a normal-event page.

        Cells read: 0 begin, 1 end, 2 id, 3 title and link, 4 campus and
        building, 5 room and link, 6 student group, 7 lecturer.

        :param row: tag whose ``td`` descendants hold the event data.
        :param date_str: page date in ``dd.mm.YYYY`` form; it is combined
            with the cell times and parsed via ``'%d.%m.%Y %H:%M'``.
        :return: the populated event. A row without any ``td`` cell yields
            the ``LSFEvent`` exactly as constructed, with nothing assigned.
        :raises Exception: whatever the cell access or parsing raises, e.g.
            ``IndexError`` for a missing cell, ``TypeError`` for a missing
            link, ``ValueError`` for an unparsable date/time.
        """
        event = LSFEvent()
        td_tags = row.find_all('td')
        if not td_tags:
            return event

        campus_and_building = LSFTextUtils.split_string_at_nth_space(td_tags[4].text, 1)
        LSFEventParser._fill_common_fields(event, td_tags, date_str)
        event.campus = LSFTextUtils.remove_spaces_at_end(campus_and_building[0])
        event.building = LSFTextUtils.rename_TGS(campus_and_building[1])
        room_parts = LSFTextUtils.split_string_at_last_space(td_tags[5].text)
        event.room = LSFTextUtils.remove_new_line_and_tab(room_parts[1])
        event.room_link = td_tags[5].find('a')['href']
        event.student_group = LSFTextUtils.remove_spaces_at_beginning(td_tags[6].text)
        event.lecturer = LSFTextUtils.remove_spaces_at_beginning(td_tags[7].text)
        return event

    @staticmethod
    def extract_cancelled_event(row, date_str=''):
        """Build an ``LSFEvent`` from one row of a cancelled-event page.

        Cells read: 0 begin, 1 end, 2 id, 3 title and link, 4 student
        group, 5 lecturer, 6 cancellation note.

        :param row: tag whose ``td`` descendants hold the event data.
        :param date_str: page date in ``dd.mm.YYYY`` form; it is combined
            with the cell times and parsed via ``'%d.%m.%Y %H:%M'``.
        :return: the populated event. A row without any ``td`` cell yields
            the ``LSFEvent`` exactly as constructed, with nothing assigned.
        :raises Exception: whatever the cell access or parsing raises, e.g.
            ``IndexError`` for a missing cell, ``TypeError`` for a missing
            link, ``ValueError`` for an unparsable date/time.
        """
        event = LSFEvent()
        td_tags = row.find_all('td')
        if not td_tags:
            return event

        LSFEventParser._fill_common_fields(event, td_tags, date_str)
        event.student_group = LSFTextUtils.remove_spaces_at_beginning(td_tags[4].text)
        event.lecturer = LSFTextUtils.remove_spaces_at_beginning(td_tags[5].text)
        event.cancel_note = td_tags[6].text
        return event