-
Notifications
You must be signed in to change notification settings - Fork 0
/
transforms.py
240 lines (193 loc) · 10.3 KB
/
transforms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
import random
import torch
import torchvision.transforms.functional as FT
def resize(image, boxes, dims=(300, 300), return_percent_coords=True):
    """
    Resize an image and rescale its bounding boxes to match.

    Boxes are first normalised by the source image size (fractional coordinates). They are then either
    returned as such or scaled up to the pixel grid of the resized image.

    :param image: image, a PIL Image (its `width` and `height` attributes are read)
    :param boxes: bounding boxes in boundary coordinates, a tensor of dimensions (n_objects, 4)
    :param dims: target size passed straight to torchvision's `resize`, which reads a sequence as
                 (height, width); the box scaling below uses the same order (dims[0] = height, dims[1] = width)
    :param return_percent_coords: if True, boxes are returned as fractional coordinates; if False, they are
                                  returned in pixel coordinates of the resized image
    :return: resized image, updated bounding boxes
    """
    resized_image = FT.resize(image, dims)

    # Normalise boxes by the source size; the (1, 4) divisor broadcasts over all objects
    src_w, src_h = image.width, image.height
    fractional_boxes = boxes / torch.FloatTensor([[src_w, src_h, src_w, src_h]])

    if return_percent_coords:
        return resized_image, fractional_boxes

    # Scale the fractional coordinates up to the resized image's pixel grid
    dst_h, dst_w = dims[0], dims[1]
    return resized_image, fractional_boxes * torch.FloatTensor([[dst_w, dst_h, dst_w, dst_h]])
def find_intersection(set_1, set_2):
    """
    Compute the pairwise intersection area between two sets of boxes in boundary coordinates.

    :param set_1: set 1, a tensor of dimensions (n1, 4)
    :param set_2: set 2, a tensor of dimensions (n2, 4)
    :return: intersection area of every box in set 1 with every box in set 2, a tensor of dimensions (n1, n2)
    """
    # Insert singleton axes so that the element-wise ops below broadcast over all (n1, n2) pairs
    boxes_a = set_1.unsqueeze(1)  # (n1, 1, 4)
    boxes_b = set_2.unsqueeze(0)  # (1, n2, 4)

    top_left = torch.max(boxes_a[:, :, :2], boxes_b[:, :, :2])  # (n1, n2, 2)
    bottom_right = torch.min(boxes_a[:, :, 2:], boxes_b[:, :, 2:])  # (n1, n2, 2)

    # Boxes that do not overlap give a negative extent; clamp it to zero
    extent = (bottom_right - top_left).clamp(min=0)  # (n1, n2, 2)
    return extent[:, :, 0] * extent[:, :, 1]  # (n1, n2)
def find_jaccard_overlap(set_1, set_2):
    """
    Compute the pairwise Jaccard overlap (IoU) between two sets of boxes in boundary coordinates.

    :param set_1: set 1, a tensor of dimensions (n1, 4)
    :param set_2: set 2, a tensor of dimensions (n2, 4)
    :return: IoU of every box in set 1 with every box in set 2, a tensor of dimensions (n1, n2)
    """

    def _box_areas(box_set):
        # width * height of every box in the set
        widths = box_set[:, 2] - box_set[:, 0]
        heights = box_set[:, 3] - box_set[:, 1]
        return widths * heights

    overlap_area = find_intersection(set_1, set_2)  # (n1, n2)

    # union = area_1 + area_2 - intersection, broadcast over all (n1, n2) pairs
    combined_area = _box_areas(set_1).unsqueeze(1) + _box_areas(set_2).unsqueeze(0) - overlap_area  # (n1, n2)

    return overlap_area / combined_area  # (n1, n2)
def random_crop(image, boxes, labels, difficulties):
    """
    Performs a random crop in the manner stated in the paper. Helps to learn to detect larger and partial objects.

    Note that some objects may be cut out entirely.

    Adapted from https://github.com/amdegroot/ssd.pytorch/blob/master/utils/augmentations.py

    If there are no bounding boxes at all, no crop is attempted and the inputs are returned unchanged.

    :param image: image, a tensor of dimensions (3, original_h, original_w)
    :param boxes: bounding boxes in boundary coordinates, a tensor of dimensions (n_objects, 4)
    :param labels: labels of objects, a tensor of dimensions (n_objects)
    :param difficulties: difficulties of detection of these objects, a tensor of dimensions (n_objects)
    :return: cropped image, updated bounding box coordinates, updated labels, updated difficulties
    """
    # Without any boxes, no crop can satisfy the overlap / center criteria below, and taking the
    # maximum of an empty overlap tensor would raise - so return the inputs untouched
    if boxes.numel() == 0:
        return image, boxes, labels, difficulties

    original_h = image.size(1)
    original_w = image.size(2)

    # Try up to 50 times for each choice of minimum overlap
    # This isn't mentioned in the paper, of course, but 50 is chosen in paper authors' original Caffe repo
    max_trials = 50

    # Crop dimensions must be in [0.3, 1] of original dimensions
    # Note - it's [0.1, 1] in the paper, but actually [0.3, 1] in the authors' repo
    min_scale = 0.3

    # Keep choosing a minimum overlap until a successful crop is made
    while True:
        # Randomly draw the value for minimum overlap
        min_overlap = random.choice([0., .1, .3, .5, .7, .9, None])  # 'None' refers to no cropping

        # If not cropping
        if min_overlap is None:
            return image, boxes, labels, difficulties

        for _ in range(max_trials):
            scale_h = random.uniform(min_scale, 1)
            scale_w = random.uniform(min_scale, 1)
            new_h = int(scale_h * original_h)
            new_w = int(scale_w * original_w)

            # On very small images the truncation above can give a zero-sized crop; skip it
            # rather than dividing by zero below
            if new_h == 0 or new_w == 0:
                continue

            # Aspect ratio has to be in [0.5, 2]
            aspect_ratio = new_h / new_w
            if not 0.5 < aspect_ratio < 2:
                continue

            # Crop coordinates (origin at top-left of image)
            left = random.randint(0, original_w - new_w)
            right = left + new_w
            top = random.randint(0, original_h - new_h)
            bottom = top + new_h
            crop = torch.FloatTensor([left, top, right, bottom])  # (4)

            # Calculate Jaccard overlap between the crop and the bounding boxes
            overlap = find_jaccard_overlap(crop.unsqueeze(0), boxes)  # (1, n_objects)
            overlap = overlap.squeeze(0)  # (n_objects)

            # If not a single bounding box has a Jaccard overlap of greater than the minimum, try again
            if overlap.max().item() < min_overlap:
                continue

            # Crop image
            new_image = image[:, top:bottom, left:right]  # (3, new_h, new_w)

            # Find centers of original bounding boxes
            bb_centers = (boxes[:, :2] + boxes[:, 2:]) / 2.  # (n_objects, 2)

            # Find bounding boxes whose centers are strictly inside the crop
            centers_in_crop = (
                (bb_centers[:, 0] > left)
                & (bb_centers[:, 0] < right)
                & (bb_centers[:, 1] > top)
                & (bb_centers[:, 1] < bottom)
            )  # (n_objects), a mask Tensor that can be used for indexing

            # If not a single bounding box has its center in the crop, try again
            if not centers_in_crop.any():
                continue

            # Discard bounding boxes that don't meet this criterion
            # (mask indexing yields new tensors, so the in-place edits below do not touch the inputs)
            new_boxes = boxes[centers_in_crop, :]
            new_labels = labels[centers_in_crop]
            new_difficulties = difficulties[centers_in_crop]

            # Clip the boxes to the crop, then shift them so the crop's top-left corner is the origin
            new_boxes[:, :2] = torch.max(new_boxes[:, :2], crop[:2])  # crop[:2] is [left, top]
            new_boxes[:, :2] -= crop[:2]
            new_boxes[:, 2:] = torch.min(new_boxes[:, 2:], crop[2:])  # crop[2:] is [right, bottom]
            new_boxes[:, 2:] -= crop[:2]

            return new_image, new_boxes, new_labels, new_difficulties
def photometric_distort(image):
    """
    Randomly perturb brightness, contrast, saturation and hue.

    The four adjustments are visited in a shuffled order and each one is applied with a 50% chance.

    :param image: image, a PIL Image
    :return: distorted image
    """
    candidates = [FT.adjust_brightness,
                  FT.adjust_contrast,
                  FT.adjust_saturation,
                  FT.adjust_hue]
    random.shuffle(candidates)

    distorted = image
    for transform in candidates:
        # Skip this adjustment half of the time
        if random.random() >= 0.5:
            continue

        # Hue: the Caffe repo uses a 'hue_delta' of 18, divided by 255 because PyTorch needs a normalized value
        # Others: the Caffe repo uses 'lower' and 'upper' values of 0.5 and 1.5
        if transform.__name__ == 'adjust_hue':
            low, high = -18 / 255., 18 / 255.
        else:
            low, high = 0.5, 1.5

        distorted = transform(distorted, random.uniform(low, high))

    return distorted
def flip(image, boxes):
    """
    Flip image horizontally.

    The input `boxes` tensor is left unmodified; the flipped coordinates are written to a copy.

    :param image: image, a PIL Image
    :param boxes: bounding boxes in boundary coordinates, a tensor of dimensions (n_objects, 4)
    :return: flipped image, updated bounding box coordinates
    """
    # Flip image
    new_image = FT.hflip(image)

    # Flip boxes on a copy so the caller's tensor is not mutated through an alias
    new_boxes = boxes.clone()

    # Mirroring x gives (width - x - 1); the old right edge becomes the new left edge and vice versa,
    # which keeps x_min <= x_max in the result
    new_boxes[:, 0] = image.width - boxes[:, 2] - 1
    new_boxes[:, 2] = image.width - boxes[:, 0] - 1

    return new_image, new_boxes
def expand(image, boxes: torch.Tensor, filler: list) -> tuple:
    """
    Zoom out by pasting the image at a random position on a larger canvas made of filler material.

    Helps to learn to detect smaller objects.

    :param image: image, a tensor of dimensions (3, original_h, original_w)
    :param boxes: bounding boxes in boundary coordinates, a tensor of dimensions (n_objects, 4)
    :param filler: RGB values of the filler material, a list like [R, G, B]
    :return: expanded image, updated bounding box coordinates
    """
    max_scale = 4
    original_h, original_w = image.size(1), image.size(2)

    # Draw one zoom-out factor and apply it to both dimensions
    scale = random.uniform(1, max_scale)
    new_h, new_w = int(scale * original_h), int(scale * original_w)

    # Build a canvas filled with the filler colour
    # Multiplying a freshly allocated tensor (rather than using Tensor.expand) gives every pixel its own memory,
    # so writing the image into the canvas below cannot affect other pixels
    fill_color = torch.FloatTensor(filler).unsqueeze(1).unsqueeze(1)  # (3, 1, 1)
    canvas = torch.ones((3, new_h, new_w), dtype=torch.float) * fill_color  # (3, new_h, new_w)

    # Pick where the original image's top-left corner lands on the canvas (origin at top-left)
    left = random.randint(0, new_w - original_w)
    top = random.randint(0, new_h - original_h)
    canvas[:, top:top + original_h, left:left + original_w] = image

    # Shift every box by the same offset as the image
    offset = torch.FloatTensor([[left, top, left, top]])  # (1, 4), broadcasts over n_objects
    return canvas, boxes + offset