mirror of https://github.com/open-mmlab/mmocr.git
[Feature] Add synthtext converter and update docs (#351)
* Add synthtext converter and update docs
* minor docs fix

pull/355/head
parent 4fcff1f613
commit 68df4fbe80
@@ -267,22 +267,28 @@ The structure of the text detection dataset directory is organized as follows.
- For `SynthText`:
  - Step1: Download `SynthText.zip` from [homepage](https://www.robots.ox.ac.uk/~vgg/data/scenetext/)
  - Step2: Download [shuffle_labels.txt](https://download.openmmlab.com/mmocr/data/mixture/SynthText/shuffle_labels.txt)
  - Step3: Download [instances_train.txt](https://download.openmmlab.com/mmocr/data/mixture/SynthText/instances_train.txt)
  - Step4:

    ```bash
    mkdir SynthText && cd SynthText
    mv /path/to/SynthText.zip .
    unzip SynthText.zip
    mv SynthText synthtext
    mv /path/to/shuffle_labels.txt .

    # create soft link
    cd /path/to/mmocr/data/mixture
    ln -s /path/to/SynthText SynthText
    ```

  - Step5:

    Generate cropped images and labels:

    ```bash
    cd /path/to/mmocr

    python tools/data/textrecog/synthtext_converter.py data/mixture/SynthText/gt.mat data/mixture/SynthText/ data/mixture/SynthText/synthtext/SynthText_patch_horizontal --n_proc 8
    ```

- For `SynthAdd`:
  - Step1: Download `SynthText_Add.zip` from [SynthAdd](https://pan.baidu.com/s/1uV0LtoNmcxbO-0YA7Ch4dg) (code:627x)
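For reference, the converter added below writes, for every word, one cropped patch image plus a small label file: the word on the first line, then eight integer coordinates per character box on the following lines. A minimal sketch for inspecting one generated sample; the patch name `8/ballet_106_0_0` is hypothetical, and the directory is the output path from the command above:

```python
# Minimal inspection sketch (hypothetical patch name, assumed output layout
# <out_dir>/<sub_dir>/<img_name>_<word_idx>.{png,txt}).
import mmcv

out_dir = 'data/mixture/SynthText/synthtext/SynthText_patch_horizontal'
patch = f'{out_dir}/8/ballet_106_0_0'  # assumed sample

img = mmcv.imread(patch + '.png')
with open(patch + '.txt') as f:
    word = f.readline().strip()                                   # the cropped word
    char_boxes = [[int(v) for v in line.split()] for line in f]   # 8 ints per character
print(word, img.shape, len(char_boxes))
```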
@@ -0,0 +1,143 @@ tools/data/textrecog/synthtext_converter.py (new file)
import argparse
import os
from functools import partial

import mmcv
import numpy as np
from scipy.io import loadmat


def parse_args():
    parser = argparse.ArgumentParser(
        description='Crop images in SynthText-style dataset in '
        'preparation for MMOCR\'s use')
    parser.add_argument(
        'anno_path', help='Path to gold annotation data (gt.mat)')
    parser.add_argument('img_path', help='Path to images')
    parser.add_argument('out_dir', help='Path of output images and labels')
    parser.add_argument(
        '--n_proc',
        default=1,
        type=int,
        help='Number of processes to run with')
    args = parser.parse_args()
    return args


def load_gt_datum(datum):
    """Parse one record from gt.mat.

    Returns (img_path, words, word_bboxes, char_bbox_grps), or None if the
    annotation is inconsistent.
    """
    img_path, txt, wordBB, charBB = datum
    words = []
    word_bboxes = []
    char_bboxes = []

    # when there's only one word in txt
    # scipy will load it as a string
    if type(txt) is str:
        words = txt.split()
    else:
        for line in txt:
            words += line.split()

    # From (2, 4, num_boxes) to (num_boxes, 4, 2)
    if len(wordBB.shape) == 2:
        wordBB = wordBB[:, :, np.newaxis]
    cur_wordBB = wordBB.transpose(2, 1, 0)
    for box in cur_wordBB:
        word_bboxes.append(
            [max(round(coord), 0) for pt in box for coord in pt])

    # Validate word bboxes.
    if len(words) != len(word_bboxes):
        return

    # From (2, 4, num_boxes) to (num_boxes, 4, 2)
    cur_charBB = charBB.transpose(2, 1, 0)
    for box in cur_charBB:
        char_bboxes.append(
            [max(round(coord), 0) for pt in box for coord in pt])

    char_bbox_idx = 0
    char_bbox_grps = []

    for word in words:
        temp_bbox = char_bboxes[char_bbox_idx:char_bbox_idx + len(word)]
        char_bbox_idx += len(word)
        char_bbox_grps.append(temp_bbox)

    # Validate char bboxes.
    # If the length of the last char bbox is correct, then
    # all the previous bboxes are also valid
    if len(char_bbox_grps[len(words) - 1]) != len(words[-1]):
        return

    return img_path, words, word_bboxes, char_bbox_grps


def load_gt_data(filename, n_proc):
    mat_data = loadmat(filename, simplify_cells=True)
    imnames = mat_data['imnames']
    txt = mat_data['txt']
    wordBB = mat_data['wordBB']
    charBB = mat_data['charBB']
    return mmcv.track_parallel_progress(
        load_gt_datum, list(zip(imnames, txt, wordBB, charBB)), nproc=n_proc)


def process(data, img_path_prefix, out_dir):
    """Crop every word in one image and write a patch image plus a label
    file (word + per-character boxes) for each."""
    if data is None:
        return
    # Dirty hack for multi-processing
    img_path, words, word_bboxes, char_bbox_grps = data
    img_dir, img_name = os.path.split(img_path)
    img_name = os.path.splitext(img_name)[0]
    input_img = mmcv.imread(os.path.join(img_path_prefix, img_path))

    output_sub_dir = os.path.join(out_dir, img_dir)
    if not os.path.exists(output_sub_dir):
        try:
            os.makedirs(output_sub_dir)
        except FileExistsError:
            pass  # occurs when multi-processing

    for i, word in enumerate(words):
        output_image_patch_name = f'{img_name}_{i}.png'
        output_label_name = f'{img_name}_{i}.txt'
        output_image_patch_path = os.path.join(output_sub_dir,
                                               output_image_patch_name)
        output_label_path = os.path.join(output_sub_dir, output_label_name)
        if os.path.exists(output_image_patch_path) and os.path.exists(
                output_label_path):
            continue

        word_bbox = word_bboxes[i]
        min_x, max_x = min(word_bbox[::2]), max(word_bbox[::2])
        min_y, max_y = min(word_bbox[1::2]), max(word_bbox[1::2])
        cropped_img = input_img[min_y:max_y, min_x:max_x]
        if cropped_img.shape[0] <= 0 or cropped_img.shape[1] <= 0:
            continue

        char_bbox_grp = np.array(char_bbox_grps[i])
        char_bbox_grp[:, ::2] -= min_x
        char_bbox_grp[:, 1::2] -= min_y

        mmcv.imwrite(cropped_img, output_image_patch_path)
        with open(output_label_path, 'w') as output_label_file:
            output_label_file.write(word + '\n')
            for cbox in char_bbox_grp:
                output_label_file.write('%d %d %d %d %d %d %d %d\n' %
                                        tuple(cbox.tolist()))


def main():
    args = parse_args()
    print('Loading annotation data...')
    data = load_gt_data(args.anno_path, args.n_proc)
    process_with_outdir = partial(
        process, img_path_prefix=args.img_path, out_dir=args.out_dir)
    print('Creating cropped images and gold labels...')
    mmcv.track_parallel_progress(process_with_outdir, data, nproc=args.n_proc)
    print('Done')


if __name__ == '__main__':
    main()
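A hypothetical, minimal check of `load_gt_datum` (not part of the committed file), assuming the script can be imported as `synthtext_converter` and feeding it a fake one-word record in the gt.mat layout `(2, 4, num_boxes)` described in the comments above:

```python
# Hypothetical usage sketch: fake one-word record for load_gt_datum.
import numpy as np

from synthtext_converter import load_gt_datum  # assumes the script is importable

xs = [0., 10., 10., 0.]                         # x-coordinates of the 4 corners
ys = [0., 0., 5., 5.]                           # y-coordinates of the 4 corners
box = np.array([xs, ys])                        # (2, 4): one box
word_bb = box[:, :, np.newaxis]                 # (2, 4, 1): one word box
char_bb = np.stack([box, box], axis=2)          # (2, 4, 2): one box per character of 'hi'

img_path, words, word_bboxes, char_bbox_grps = load_gt_datum(
    ('8/fake_img.jpg', 'hi', word_bb, char_bb))
assert words == ['hi'] and len(word_bboxes[0]) == 8 and len(char_bbox_grps[0]) == 2
```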