-
Notifications
You must be signed in to change notification settings - Fork 3
/
utils.py
283 lines (242 loc) · 8.47 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
import os
import re
import typing
from pathlib import Path
import gdown
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages
from tqdm import tqdm
import docx2txt
import emoji
def clean_text(text: str) -> str:
    """Drop every character of ``text`` that cannot be encoded as ASCII.

    Args:
        text (str): input text
    Returns:
        str: ``text`` with all non-ASCII characters removed
    """
    # Encoding with errors="ignore" silently discards anything outside ASCII.
    return text.encode("ascii", "ignore").decode()
def strip_emoji_and_dots(text: str) -> str:
    """Remove emoji, ellipses, apostrophes and non-ASCII characters from text.

    The files have a lot of emojis, which cause errors, and "..." makes the
    sentences grow longer, so both are removed before the text is laid out.

    Args:
        text (str): input text
    Returns:
        str: de-emojised, single-line, ASCII-only text
    """
    # emoji >= 2.0 removed get_emoji_regexp(); replace_emoji() is its
    # successor. Keep the old call as a fallback for older installs.
    if hasattr(emoji, "replace_emoji"):
        new_text = emoji.replace_emoji(text, replace="")
    else:
        new_text = re.sub(emoji.get_emoji_regexp(), r"", text)

    # Applied in order. The original also repeated the "..." replacement and
    # replaced " '", "' " and " ' " after every "'" was already gone; those
    # calls could never match and are omitted.
    replacements = (
        ("...", ". "),
        (" . ", ". "),
        (" .", ". "),
        ("\ufeff", ""),
        ("\n", " "),
        ("'", ""),
    )
    for old, new in replacements:
        new_text = new_text.replace(old, new)

    return clean_text(new_text)
def get_files_from_gdrive(url: str, fname: str) -> None:
    """Download a Google Drive file to ``fname`` using gdown.

    Two share-link shapes are understood:

    * ``https://drive.google.com/open?id=<id>`` - the query string of the
      fourth "/"-separated segment is reused as-is;
    * ``https://drive.google.com/file/d/<id>/view...`` - the id is taken from
      the path segment following ``/d/``.

    Args:
        url (str): gdrive share url
        fname (str): destination path of the downloaded file
    Raises:
        ValueError: if no file id can be located in ``url``
    """
    segments = url.split("/")
    if len(segments) > 3 and "?" in segments[3]:
        # e.g. "open?id=<id>" -> "id=<id>"
        query = segments[3].split("?")[1]
    else:
        match = re.search(r"/d/([\w-]+)", url)
        if match is None:
            raise ValueError(f"Cannot find a Google Drive file id in {url!r}")
        query = f"id={match.group(1)}"
    gdown.download(f"https://drive.google.com/uc?{query}", fname, quiet=True)
def get_autos(
    filepath: str = "YearbookENTC",
    details_file: str = "docs/details.csv",
    download_image: bool = True,
) -> list:
    """Returns a list of dictionaries of autographs, one per person.

    Every sub-directory of ``filepath`` whose name is present in the index of
    the details dataframe yields one dictionary with the keys ``Name``,
    ``Quote``, ``Image``, ``flask_image`` and ``autographs``. Directories
    without a matching details row are skipped.

    Args:
        filepath (str, optional): Dir path to all the autographs. Defaults to "YearbookENTC".
        details_file (str, optional): details for name and quotes and photos. Defaults to "docs/details.csv".
        download_image (bool, optional): do you want to download the images again. Defaults to True.
    Returns:
        list: one details dict per person
    """
    image_column = "filename of your image (With extension .jpg or .png)"
    df = clean_details(details_file)
    autos = []
    filepath = Path(filepath)
    file_list = file_list_from_dir(filepath)
    for f in tqdm(file_list):
        # The directory name doubles as the lookup key into the details index.
        name = f.name
        if name not in df.index:
            continue

        details = {}
        details["Name"] = extract_full_name(df, name)
        details["Quote"] = extract_quote(df, name)
        yearbook_image = df.loc[name]["Year Book Image"]
        image_filename = df.loc[name][image_column]
        yearbook_image_filename = f"src/static/{image_filename}"
        if download_image:
            get_files_from_gdrive(yearbook_image, yearbook_image_filename)
        details["Image"] = yearbook_image_filename
        details["flask_image"] = f"{image_filename}"
        details["autographs"] = {}

        # The person's own text and photo live next to the autographs and must
        # not be treated as autographs. Comparing file names (rather than a
        # hand-built backslash path) makes this work on every platform.
        own_files = {f"{name}.txt", f"{name}.jpg", f"{name}.png"}
        for x in f.iterdir():
            if x.name in own_files:
                continue
            output, pname = extract_autographs_and_pname(filepath, name, x, df)
            details["autographs"][pname] = output
        autos.append(details)
    return autos
def extract_quote(df: pd.DataFrame, name: str) -> str:
    """Look up a person's yearbook quote and return it in cleaned form.

    Args:
        df (pd.DataFrame): details dataframe
        name (str): index label of the person whose quote is wanted
    Returns:
        str: the "Quote for yearbook" value, stringified and passed through
            strip_emoji_and_dots
    """
    person = df.loc[name]
    raw_quote = str(person["Quote for yearbook"])
    return strip_emoji_and_dots(raw_quote)
def extract_full_name(df: pd.DataFrame, name: str) -> str:
    """Based on the queryname returns full name with space.

    Args:
        df (pd.DataFrame): details dataframe
        name (str): index label (query name) of the person
    Returns:
        str: "First Name" and "Last Name" of that row joined by one space
    Raises:
        KeyError: if ``name`` is not in the dataframe index
    """
    return df.loc[name]["First Name"] + " " + df.loc[name]["Last Name"]
def extract_autographs_and_pname(filepath, name, x, df):
    """extracts autographs from folder and the name of the person writing the autograph

    Args:
        filepath : path to the main dir where all autograph dirs are there
        name : queryname of the person
        x : Pathlib file to the file in concern
        df : details dataframe
    Returns:
        output : the autograph text, wrapped every 10 words, or an error
            message string when the file could not be read
        pname : name of the person who wrote that autograph; when the name
            cannot be resolved through ``df`` the raw file-name fragment
            followed by a newline is returned instead
    """
    try:
        f = check_for_txt_docx(x)
        f = strip_emoji_and_dots(f)
        output = split_paragraph(f, 10)
    except Exception:
        # Second chance: let docx2txt try the file regardless of extension.
        try:
            f = docx2txt.process(x)
            f = strip_emoji_and_dots(f)
            output = split_paragraph(f, 10)
        except Exception:
            output = "Input Error due to try block 2"

    # Offset of the file name inside str(x): "<filepath>/<name>/" precedes it.
    l = len(str(filepath)) + len(name) + 2
    # File name with the 10-character "autograph?" prefix and a 4-character
    # extension removed; used both as lookup key and as fallback label.
    raw_name = str(x)[l + 10 : -4]
    try:
        if str(x).lower()[l : l + 9] == "autograph":
            # Previously this result was immediately overwritten by the plain
            # lookup below, so only 4-character extensions ever resolved.
            pname = extract_name(x, df, l)
        else:
            pname = extract_full_name(df, raw_name)
    except Exception:
        pname = f"{raw_name}\n"
    return output, pname
def extract_name(x, df, l):
    """Resolve the full name of an autograph's author from its file name.

    The query name is the part of ``str(x)`` that starts 10 characters after
    offset ``l`` and ends where the recognised suffix begins.

    Args:
        x : Pathlib file to the file in concern
        df : details dataframe
        l : length of the string till "autograph" begins
    Returns:
        str: name of the person (firstname lastname)
    Raises:
        ValueError: if the file name ends in none of the recognised suffixes
        KeyError: if the derived query name is not in ``df``
    """
    path = str(x)
    # (suffix, number of trailing characters to strip). Longer, more specific
    # suffixes come first so ".txt.gdoc" and "..txt" are not swallowed by the
    # generic "gdoc" / ".txt" rules.
    suffix_widths = (
        (".txt.docx", 9),
        (".txt.gdoc", 9),
        ("..txt", 5),
        (".txt", 4),
        ("docx", 5),
        ("gdoc", 5),
    )
    for suffix, width in suffix_widths:
        if path.endswith(suffix):
            return extract_full_name(df, path[l + 10 : -width])
    raise ValueError(f"Unrecognised autograph file extension: {path}")
def check_for_txt_docx(x):
    """Read the text content of a ``.txt`` or ``docx`` file.

    Args:
        x : path (str or Pathlib) of the file to read
    Returns:
        str: the text of the file
    Raises:
        ValueError: if the path ends in neither "docx" nor ".txt"
    """
    path = str(x)
    if path[-4:] == "docx":
        return docx2txt.process(path)
    if path[-4:] == ".txt":
        # Context manager so the handle is closed instead of leaked.
        with open(x, "r") as handle:
            return handle.read()
    raise ValueError(f"Unsupported autograph file type: {path}")
def file_list_from_dir(filepath):
    """List the immediate sub-directories of ``filepath``.

    Args:
        filepath : Pathlib path of the directory to scan
    Returns:
        list: Pathlib paths of every direct child that is a directory
    Raises:
        NotADirectoryError: if ``filepath`` is not an existing directory
    """
    # Explicit raise instead of assert: asserts vanish under ``python -O``.
    if not filepath.is_dir():
        raise NotADirectoryError(f"{filepath} is not a directory")
    return [entry for entry in filepath.iterdir() if entry.is_dir()]
def clean_details(details_file):
    """Load the details CSV and index it by lower-cased first+last name.

    Args:
        details_file : path or buffer accepted by ``pd.read_csv``
    Returns:
        pd.DataFrame: the CSV rows indexed by ``query_name``, which is
            "First Name" and "Last Name" concatenated without a separator
            and lower-cased
    """
    details = pd.read_csv(details_file)
    joined_names = details["First Name"] + details["Last Name"]
    details["query_name"] = joined_names.apply(lambda full: full.lower())
    return details.set_index("query_name")
def get_display_img(imgpath):
    """Return ``imgpath`` as a string if it is an existing file.

    Args:
        imgpath : path of the wanted image
    Returns:
        str: ``str(imgpath)`` when the file exists, otherwise "unknown.png"
    """
    return str(imgpath) if Path(imgpath).is_file() else "unknown.png"
def add_image(autos, plt, sno):
    """Draw the image of entry ``sno`` on ``plt`` and hide the axes.

    Falls back to "unknown.png" when the entry or its image cannot be loaded
    or shown.

    Args:
        autos : list of details dicts; ``autos[sno]["Image"]`` is read
        plt : object providing ``imshow`` and ``axis`` (presumably
            matplotlib.pyplot or an Axes - verify against caller)
        sno : index into ``autos``
    """
    try:
        imgpath = autos[sno]["Image"]
        plt.imshow(mpimg.imread(get_display_img(imgpath)))
    except Exception:
        # Best effort: a missing entry or unreadable image gets the
        # placeholder, but KeyboardInterrupt/SystemExit still propagate.
        plt.imshow(mpimg.imread("unknown.png"))
    plt.axis("off")
def add_quote(autos, plt, sno):
    """Write the quote of entry ``sno`` as centred text, titled with the name.

    The quote is wrapped every 4 words. If the quote or the name cannot be
    obtained, the placeholder strings "Wrong formatting for quote" and
    "error" are drawn instead.

    Args:
        autos : list of details dicts; the "Quote" and "Name" keys are read
        plt : object providing ``text``, ``title`` and ``axis`` (presumably
            matplotlib.pyplot or an Axes - verify against caller)
        sno : index into ``autos``
    """
    try:
        output = split_paragraph(autos[sno]["Quote"], 4)
    except Exception:
        # Best effort: keep rendering the page even when the entry is broken,
        # without swallowing KeyboardInterrupt/SystemExit.
        output = "Wrong formatting for quote"
    plt.text(
        0.5,
        0.5,
        output,
        horizontalalignment="center",
        verticalalignment="center",
        fontdict={
            "family": "serif",
            "weight": "normal",
            "size": 20,
        },
    )

    try:
        title = autos[sno]["Name"]
    except Exception:
        title = "error"
    plt.title(
        title,
        fontdict={
            "family": "serif",
            "weight": "normal",
            "size": 24,
        },
    )
    plt.axis("off")
def split_paragraph(para, n):
    """Re-wrap ``para`` so that every line holds at most ``n`` words.

    Words are the whitespace-separated tokens of ``para``; words within a
    line are joined by single spaces and lines by a newline character.
    """
    words = para.split()
    lines = []
    for start in range(0, len(words), n):
        lines.append(" ".join(words[start : start + n]))
    return "\n".join(lines)