mirror of
https://github.com/open-mmlab/mmocr.git
synced 2025-06-03 21:54:47 +08:00
[Feature] Add CurvedSyntext150k Converter (#719)
* [Feature] Add bezier_to_polygon to box_util * Add num_sample to parameter * add sort_point util * update docstring * Add curvedsyntext converter
This commit is contained in:
parent
3110ab7863
commit
ac4462f374
@ -32,6 +32,10 @@ The structure of the text detection dataset directory is organized as follows.
|
||||
│ ├── imgs
|
||||
│ ├── instances_test.json
|
||||
│ └── instances_training.json
|
||||
├── CurvedSynText150k
|
||||
│ ├── syntext_word_eng
|
||||
│ ├── emcs_imgs
|
||||
│ └── instances_training.json
|
||||
```
|
||||
|
||||
|Dataset|Images| | Annotation Files | | |
|
||||
@ -43,6 +47,7 @@ The structure of the text detection dataset directory is organized as follows.
|
||||
| Synthtext | [homepage](https://www.robots.ox.ac.uk/~vgg/data/scenetext/) | instances_training.lmdb ([data.mdb](https://download.openmmlab.com/mmocr/data/synthtext/instances_training.lmdb/data.mdb), [lock.mdb](https://download.openmmlab.com/mmocr/data/synthtext/instances_training.lmdb/lock.mdb)) | - | - |
|
||||
| TextOCR | [homepage](https://textvqa.org/textocr/dataset) | - | - | - |
|
||||
| Totaltext | [homepage](https://github.com/cs-chan/Total-Text-Dataset) | - | - | - |
|
||||
| CurvedSynText150k | [homepage](https://github.com/aim-uofa/AdelaiDet/blob/master/datasets/README.md) \| [Part1](https://drive.google.com/file/d/1OSJ-zId2h3t_-I7g_wUkrK-VqQy153Kj/view?usp=sharing) \| [Part2](https://drive.google.com/file/d/1EzkcOlIgEp5wmEubvHb7-J5EImHExYgY/view?usp=sharing) | [instances_training.json](https://download.openmmlab.com/mmocr/data/curvedsyntext/instances_training.json) | - | - |
|
||||
|
||||
## Important Note
|
||||
|
||||
@ -149,3 +154,27 @@ mv Polygon/Test ../annotations/test
|
||||
```bash
|
||||
python tools/data/textdet/totaltext_converter.py /path/to/totaltext -o /path/to/totaltext --split-list training test
|
||||
```
|
||||
|
||||
### CurvedSynText150k
|
||||
|
||||
- Step1: Download [syntext1.zip](https://drive.google.com/file/d/1OSJ-zId2h3t_-I7g_wUkrK-VqQy153Kj/view?usp=sharing) and [syntext2.zip](https://drive.google.com/file/d/1EzkcOlIgEp5wmEubvHb7-J5EImHExYgY/view?usp=sharing) to `CurvedSynText150k/`.
|
||||
- Step2:
|
||||
|
||||
```bash
|
||||
unzip -q syntext1.zip
|
||||
mv train.json train1.json
|
||||
unzip images.zip
|
||||
rm images.zip
|
||||
|
||||
unzip -q syntext2.zip
|
||||
mv train.json train2.json
|
||||
unzip images.zip
|
||||
rm images.zip
|
||||
```
|
||||
|
||||
- Step3: Download [instances_training.json](https://download.openmmlab.com/mmocr/data/curvedsyntext/instances_training.json) to `CurvedSynText150k/`
|
||||
- Or, generate `instances_training.json` with the following command:
|
||||
|
||||
```bash
|
||||
python tools/data/common/curvedsyntext_converter.py PATH/TO/CurvedSynText150k --nproc 4
|
||||
```
|
||||
|
129
tools/data/common/curvedsyntext_converter.py
Normal file
129
tools/data/common/curvedsyntext_converter.py
Normal file
@ -0,0 +1,129 @@
|
||||
# Copyright (c) OpenMMLab. All rights reserved.
|
||||
import argparse
|
||||
import os.path as osp
|
||||
from functools import partial
|
||||
|
||||
import mmcv
|
||||
import numpy as np
|
||||
|
||||
from mmocr.utils import bezier_to_polygon, sort_points
|
||||
|
||||
# The default dictionary used by CurvedSynthText: the 95 printable ASCII
# characters from ' ' (0x20) to '~' (0x7E).
dict95 = [chr(code) for code in range(32, 127)]
# Index one past the dictionary marks an unknown character.
UNK = len(dict95)
# End-of-sequence marker.
EOS = UNK + 1


def digit2text(rec):
    """Decode a sequence of integer character codes into a string.

    Args:
        rec (list[int]): Character indices into ``dict95``. ``UNK`` marks an
            unknown character and ``EOS`` terminates the sequence.

    Returns:
        str: The decoded text.
    """
    res = []
    for d in rec:
        assert d <= EOS
        if d == EOS:
            break
        if d == UNK:
            print('Warning: Has a UNK character')
            # Substitute a placeholder not in the target dict. Skip the
            # normal lookup below: dict95[UNK] would raise IndexError
            # because UNK == len(dict95).
            res.append('口')
            continue
        res.append(dict95[d])
    return ''.join(res)
|
||||
|
||||
|
||||
def modify_annotation(ann, num_sample, start_img_id=0, start_ann_id=0):
    """Convert one annotation in place to the COCO-style format.

    Args:
        ann (dict): Annotation dict carrying ``rec`` (character codes) and
            ``bezier_pts`` (Bezier control points).
        num_sample (int): Number of points sampled on each Bezier curve.
        start_img_id (int): Offset added to the annotation's image id.
        start_ann_id (int): Offset added to the annotation's own id.

    Returns:
        dict: The modified annotation.
    """
    ann['text'] = digit2text(ann.pop('rec'))
    # Sample the Bezier control points into polygon segmentation points.
    polygon_pts = bezier_to_polygon(ann['bezier_pts'], num_sample=num_sample)
    ordered = sort_points(polygon_pts)
    ann['segmentation'] = np.asarray(ordered).reshape(1, -1).tolist()
    ann['image_id'] += start_img_id
    ann['id'] += start_ann_id
    return ann
|
||||
|
||||
|
||||
def modify_image_info(image_info, path_prefix, start_img_id=0):
    """Prefix the file name and offset the id of one COCO image record.

    Args:
        image_info (dict): COCO image record with ``file_name`` and ``id``.
        path_prefix (str): Directory prefix joined onto ``file_name``.
        start_img_id (int): Offset added to the image id.

    Returns:
        dict: The modified image record.
    """
    prefixed = osp.join(path_prefix, image_info['file_name'])
    image_info['file_name'] = prefixed
    image_info['id'] += start_img_id
    return image_info
|
||||
|
||||
|
||||
def parse_args():
    """Parse the command line arguments of the converter.

    Returns:
        argparse.Namespace: Parsed arguments with ``root_path``, ``out_dir``,
        ``num_sample`` and ``nproc`` attributes.
    """
    arg_parser = argparse.ArgumentParser(
        description='Convert CurvedSynText150k to COCO format')
    arg_parser.add_argument('root_path', help='CurvedSynText150k root path')
    arg_parser.add_argument('-o', '--out-dir', help='Output path')
    arg_parser.add_argument(
        '-n',
        '--num-sample',
        type=int,
        default=4,
        help='Number of sample points at each Bezier curve.')
    arg_parser.add_argument(
        '--nproc', default=1, type=int, help='Number of processes')
    return arg_parser.parse_args()
|
||||
|
||||
|
||||
def convert_annotations(data,
                        path_prefix,
                        num_sample,
                        nproc,
                        start_img_id=0,
                        start_ann_id=0):
    """Convert a raw CurvedSynText150k split into COCO text-detection format.

    Args:
        data (dict): Loaded annotation dict with ``images`` and
            ``annotations`` lists.
        path_prefix (str): Directory prefix joined onto every image file name.
        num_sample (int): Number of points sampled on each Bezier curve.
        nproc (int): Number of worker processes; ``1`` runs sequentially.
        start_img_id (int): Offset added to every image id.
        start_ann_id (int): Offset added to every annotation id.

    Returns:
        dict: The converted annotation dict (modified in place).
    """
    img_worker = partial(
        modify_image_info, path_prefix=path_prefix, start_img_id=start_img_id)
    ann_worker = partial(
        modify_annotation,
        num_sample=num_sample,
        start_img_id=start_img_id,
        start_ann_id=start_ann_id)
    if nproc > 1:
        data['annotations'] = mmcv.track_parallel_progress(
            ann_worker, data['annotations'], nproc=nproc)
        data['images'] = mmcv.track_parallel_progress(
            img_worker, data['images'], nproc=nproc)
    else:
        data['annotations'] = mmcv.track_progress(ann_worker,
                                                  data['annotations'])
        data['images'] = mmcv.track_progress(img_worker, data['images'])
    # Single text category, as expected by MMOCR's COCO-style loaders.
    data['categories'] = [{'id': 1, 'name': 'text'}]
    return data
|
||||
|
||||
|
||||
def main():
    """Merge both CurvedSynText150k splits into one COCO-format json file."""
    args = parse_args()
    root_path = args.root_path
    out_dir = args.out_dir if args.out_dir else root_path
    mmcv.mkdir_or_exist(out_dir)

    raw1 = mmcv.load(osp.join(root_path, 'train1.json'))
    data1 = convert_annotations(raw1, 'syntext_word_eng', args.num_sample,
                                args.nproc)

    # Offset the ids of the second split past the maximum ids of the first
    # so the merged file has no collisions.
    start_img_id = max(data1['images'], key=lambda x: x['id'])['id'] + 1
    start_ann_id = max(data1['annotations'], key=lambda x: x['id'])['id'] + 1
    raw2 = mmcv.load(osp.join(root_path, 'train2.json'))
    data2 = convert_annotations(
        raw2,
        'emcs_imgs',
        args.num_sample,
        args.nproc,
        start_img_id=start_img_id,
        start_ann_id=start_ann_id)

    data1['images'].extend(data2['images'])
    data1['annotations'].extend(data2['annotations'])
    mmcv.dump(data1, osp.join(out_dir, 'instances_training.json'))


if __name__ == '__main__':
    main()
|
Loading…
x
Reference in New Issue
Block a user