Optimize prediction on long images and deduplicate similar boxes with multiple labels (#11366)

* Handle conflict where a box is simultaneously recognized as multiple labels

* Recursively split images with a large height and process each part with overlap to enhance performance

* Fix error when dt_box result is empty

* Add split operation along the horizontal side

* Sliding horizontally may break text-line completeness, so add a stricter condition.

* Optimize recognition of overlap boxes.
pull/11416/head
marswen 2023-12-21 10:32:42 +08:00 committed by GitHub
parent c708180ce9
commit 0382bfb02d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 107 additions and 2 deletions

View File

@ -78,6 +78,24 @@ def area_of(left_top, right_bottom):
return hw[..., 0] * hw[..., 1]
def calculate_containment(boxes0, boxes1):
    """
    Calculate the containment of the boxes.

    Containment is the intersection area divided by the smaller of the two
    box areas, so it reaches 1.0 when one box fully covers the other
    (unlike IoU, which also penalizes size differences).

    Args:
        boxes0 (N, 4): ground truth boxes, corner form (x1, y1, x2, y2).
        boxes1 (N or 1, 4): predicted boxes, broadcast against boxes0.
    Returns:
        containment (N): containment values in [0, 1].
    """
    overlap_left_top = np.maximum(boxes0[..., :2], boxes1[..., :2])
    overlap_right_bottom = np.minimum(boxes0[..., 2:], boxes1[..., 2:])
    overlap_area = area_of(overlap_left_top, overlap_right_bottom)
    area0 = area_of(boxes0[..., :2], boxes0[..., 2:])
    area1 = area_of(boxes1[..., :2], boxes1[..., 2:])
    # Guard against degenerate (zero-area) boxes: without the eps the
    # division emits NaN/inf; with it, a degenerate pair yields 0 overlap
    # ratio. Normal boxes are unaffected (area >> 1e-10).
    denominator = np.minimum(area0, np.expand_dims(area1, axis=0))
    return overlap_area / np.maximum(denominator, 1e-10)
class PicoDetPostProcess(object):
"""
Args:
@ -245,6 +263,24 @@ class PicoDetPostProcess(object):
# Fragment of PicoDetPostProcess result post-processing; the enclosing method
# definition is outside this view. dt layout is [class_id, score, x1, y1, ...]
# per the unpacking below.
for dt in out_boxes_list:
clsid, bbox, score = int(dt[0]), dt[2:], dt[1]
label = self.labels[clsid]
# NOTE(review): the next two assignments are the removed/added sides of a
# diff hunk rendered together; the second dict (with 'score') supersedes.
result = {'bbox': bbox, 'label': label}
result = {'bbox': bbox, 'label': label, 'score': score}
results.append(result)
# Handle conflict where a box is simultaneously recognized as multiple labels.
# Use IoU to find similar boxes. Prioritize labels as table, text, and others when deduplicate similar boxes.
bboxes = np.array([x['bbox'] for x in results])
duplicate_idx = list()
for i in range(len(results)):
# Skip boxes already absorbed into an earlier keeper.
if i in duplicate_idx:
continue
# Containment of every box against box i; > 0.5 counts as "same region".
containments = calculate_containment(bboxes, bboxes[i, ...])
overlaps = np.where(containments > 0.5)[0]
if len(overlaps) > 1:
# Prefer 'table' candidates; within the preferred set keep the one
# with the highest score and mark the rest as duplicates.
table_box = [x for x in overlaps if results[x]['label'] == 'table']
if len(table_box) > 0:
keep = sorted([(x, results[x]) for x in table_box], key=lambda x: x[1]['score'], reverse=True)[0][0]
else:
keep = sorted([(x, results[x]) for x in overlaps], key=lambda x: x[1]['score'], reverse=True)[0][0]
duplicate_idx.extend([x for x in overlaps if x != keep])
# Rebuild the result list without the suppressed duplicates.
results = [x for i, x in enumerate(results) if i not in duplicate_idx]
return results

View File

@ -217,7 +217,7 @@ class TextDetector(object):
dt_boxes = np.array(dt_boxes_new)
return dt_boxes
def __call__(self, img):
def predict(self, img):
ori_im = img.copy()
data = {'image': img}
@ -283,6 +283,75 @@ class TextDetector(object):
et = time.time()
return dt_boxes, et - st
def __call__(self, img):
    """Detect text boxes in *img*.

    For images with an extreme aspect ratio (posters, long screenshots),
    the image is split into overlapping strips, each strip is run through
    ``self.predict``, and the per-strip boxes are shifted back into
    full-image coordinates and merged. Otherwise ``self.predict`` runs
    once on the whole image.

    Returns:
        tuple: (dt_boxes, elapse) where dt_boxes is an (N, 4, 2) float32
        array of quadrilateral boxes and elapse is the accumulated
        prediction time.
    """
    # For image like poster with one side much greater than the other side,
    # splitting recursively and processing with overlap to enhance performance.
    MIN_BOUND_DISTANCE = 50  # pixel gap treated as a "safe" place to cut
    dt_boxes = np.zeros((0, 4, 2), dtype=np.float32)
    elapse = 0
    if img.shape[0] / img.shape[1] > 2 and img.shape[0] > self.args.det_limit_side_len:
        # Tall image: slide a window of height (width * 3 // 4) downward.
        start_h = 0
        end_h = 0
        while end_h <= img.shape[0]:
            end_h = start_h + img.shape[1] * 3 // 4
            subimg = img[start_h: end_h, :]
            if len(subimg) == 0:
                break
            sub_dt_boxes, sub_elapse = self.predict(subimg)
            offset = start_h
            # To prevent text blocks from being cut off, roll back a certain buffer area.
            # NOTE(review): the gap test compares against img.shape[1] (the width the
            # strip height is derived from) rather than the actual strip height —
            # confirm this is intended; x[-1][1] presumably reads the y of the box's
            # last (bottom-left) corner.
            if len(sub_dt_boxes) == 0 or img.shape[1] - max([x[-1][1] for x in sub_dt_boxes]) > MIN_BOUND_DISTANCE:
                start_h = end_h
            else:
                # Sort boxes by bottom y (corner index 2 — presumably bottom-right;
                # TODO confirm corner order), drop the lowest box, and restart the
                # next window just above it so the dropped box is re-detected whole.
                sorted_indices = np.argsort(sub_dt_boxes[:, 2, 1])
                sub_dt_boxes = sub_dt_boxes[sorted_indices]
                bottom_line = 0 if len(sub_dt_boxes) <= 1 else int(np.max(sub_dt_boxes[:-1, 2, 1]))
                if bottom_line > 0:
                    start_h += bottom_line
                    sub_dt_boxes = sub_dt_boxes[sub_dt_boxes[:, 2, 1] <= bottom_line]
                else:
                    start_h = end_h
            if len(sub_dt_boxes) > 0:
                # Shift strip-local boxes back into full-image (y) coordinates.
                if dt_boxes.shape[0] == 0:
                    dt_boxes = sub_dt_boxes + np.array([0, offset], dtype=np.float32)
                else:
                    dt_boxes = np.append(dt_boxes,
                                         sub_dt_boxes + np.array([0, offset], dtype=np.float32),
                                         axis=0)
            elapse += sub_elapse
    elif img.shape[1] / img.shape[0] > 3 and img.shape[1] > self.args.det_limit_side_len * 3:
        # Wide image: mirror of the tall-image path, sliding horizontally.
        # Stricter ratio (> 3) because horizontal cuts can split text lines.
        start_w = 0
        end_w = 0
        while end_w <= img.shape[1]:
            end_w = start_w + img.shape[0] * 3 // 4
            subimg = img[:, start_w: end_w]
            if len(subimg) == 0:
                break
            sub_dt_boxes, sub_elapse = self.predict(subimg)
            offset = start_w
            # NOTE(review): same caveat as above — the gap test uses img.shape[0],
            # not the actual strip width.
            if len(sub_dt_boxes) == 0 or img.shape[0] - max([x[-1][0] for x in sub_dt_boxes]) > MIN_BOUND_DISTANCE:
                start_w = end_w
            else:
                # Sort by right x (corner index 1 — presumably top-right; TODO
                # confirm), drop the rightmost box, restart just before it.
                sorted_indices = np.argsort(sub_dt_boxes[:, 2, 0])
                sub_dt_boxes = sub_dt_boxes[sorted_indices]
                right_line = 0 if len(sub_dt_boxes) <= 1 else int(np.max(sub_dt_boxes[:-1, 1, 0]))
                if right_line > 0:
                    start_w += right_line
                    sub_dt_boxes = sub_dt_boxes[sub_dt_boxes[:, 1, 0] <= right_line]
                else:
                    start_w = end_w
            if len(sub_dt_boxes) > 0:
                # Shift strip-local boxes back into full-image (x) coordinates.
                if dt_boxes.shape[0] == 0:
                    dt_boxes = sub_dt_boxes + np.array([offset, 0], dtype=np.float32)
                else:
                    dt_boxes = np.append(dt_boxes,
                                         sub_dt_boxes + np.array([offset, 0], dtype=np.float32),
                                         axis=0)
            elapse += sub_elapse
    else:
        # Normal aspect ratio: a single prediction over the whole image.
        dt_boxes, elapse = self.predict(img)
    return dt_boxes, elapse
if __name__ == "__main__":
args = utility.parse_args()