# mmocr/tools/dataset_converters/textrecog/lmdb_converter.py

# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import json
import os
import os.path as osp

import cv2
import lmdb
import numpy as np

from mmocr.utils import list_from_file


def parse_line(line, format):
    """Parse one annotation line into an image name and its text.

    Args:
        line (str): A single line of the label file.
        format (str): ``'txt'`` for ``"<img_name> <text>"`` lines; any other
            value is parsed as a JSON object with ``filename`` and ``text``
            keys.

    Returns:
        tuple: ``(img_name, text)``.

    Raises:
        ValueError: If a txt line contains no space separator.
    """
    if format == 'txt':
        # Split on the first space only, so that labels which themselves
        # contain spaces (e.g. "img.jpg HELLO WORLD") are kept intact.
        img_name, text = line.split(' ', 1)
    else:
        record = json.loads(line)
        img_name = record['filename']
        text = record['text']
    return img_name, text


def check_image_is_valid(imageBin):
    """Check whether raw image bytes decode to a non-empty image.

    Args:
        imageBin (bytes | None): Encoded image content (e.g. the bytes read
            from a jpg/png file).

    Returns:
        bool: True if the buffer decodes to an image with non-zero height
        and width, False otherwise.
    """
    # Nothing to decode: treat missing or empty buffers as invalid.
    if imageBin is None or len(imageBin) == 0:
        return False
    imageBuf = np.frombuffer(imageBin, dtype=np.uint8)
    img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
    # cv2.imdecode returns None when the data cannot be decoded; without
    # this guard the ``img.shape`` access below raises AttributeError.
    if img is None:
        return False
    imgH, imgW = img.shape[0], img.shape[1]
    return imgH * imgW != 0


def write_cache(env, cache):
    """Write a batch of key/value pairs to the LMDB environment.

    Args:
        env: An opened LMDB environment.
        cache (list[tuple[bytes, bytes]]): ``(key, value)`` pairs to store.
            Existing keys are overwritten.
    """
    # All pairs are written inside a single write transaction.
    with env.begin(write=True) as write_txn:
        write_txn.cursor().putmulti(cache, overwrite=True, dupdata=False)


def recog2lmdb(img_root,
               label_path,
               output,
               label_format='txt',
               label_only=False,
               batch_size=1000,
               encoding='utf-8',
               lmdb_map_size=1099511627776,
               verify=True):
    """Convert a text recognition dataset to LMDB format.

    Samples are stored under 1-based keys ``label-%09d`` (and ``image-%09d``
    unless ``label_only`` is set); the number of written samples is stored
    under ``num-samples``. In ``label_only`` mode each label value is a JSON
    object with ``filename`` and ``text``; otherwise it is the encoded text.
    Samples whose image is missing or fails verification are skipped and do
    not consume an index.

    Args:
        img_root (str): Path to images.
        label_path (str): Path to label file.
        output (str): LMDB output path.
        label_format (str): Format of the label file, either txt or jsonl.
        label_only (bool): Only convert label to lmdb format.
        batch_size (int): Number of samples accumulated in the cache before
            each write to the database.
        encoding (str): Label encoding method.
        lmdb_map_size (int): Maximum size database may grow to.
        verify (bool): If true, check the validity of
            every image. Defaults to True.

    Raises:
        AssertionError: If the extension of ``label_path`` does not match
            ``label_format``.

    E.g.
    This function supports MMOCR's recognition data format and the label file
    can be txt or jsonl, as follows:

        ├──img_root
        |      |—— img1.jpg
        |      |—— img2.jpg
        |      |—— ...
        |——label.txt (or label.jsonl)

        label.txt: img1.jpg HELLO
                   img2.jpg WORLD
                   ...

        label.jsonl: {"filename": "img1.jpg", "text": "HELLO"}
                     {"filename": "img2.jpg", "text": "WORLD"}
                     ...
    """
    # check label format
    assert osp.basename(label_path).split('.')[-1] == label_format, (
        'The extension of %s does not match label_format "%s"' %
        (label_path, label_format))
    # create lmdb env
    os.makedirs(output, exist_ok=True)
    env = lmdb.open(output, map_size=lmdb_map_size)
    try:
        # load label file
        anno_list = list_from_file(label_path, encoding=encoding)
        cache = []
        # index start from 1
        cnt = 1
        n_samples = len(anno_list)
        for anno in anno_list:
            label_key = 'label-%09d'.encode(encoding) % cnt
            img_name, text = parse_line(anno, label_format)
            if label_only:
                # convert only labels to lmdb
                line = json.dumps(
                    dict(filename=img_name, text=text), ensure_ascii=False)
                cache.append((label_key, line.encode(encoding)))
            else:
                # convert both images and labels to lmdb
                img_path = osp.join(img_root, img_name)
                if not osp.exists(img_path):
                    print('%s does not exist' % img_path)
                    continue
                with open(img_path, 'rb') as f:
                    image_bin = f.read()
                if verify:
                    try:
                        is_valid = check_image_is_valid(image_bin)
                    except Exception:
                        # An image that cannot even be checked must not be
                        # written to the database: skip it like any other
                        # invalid image instead of falling through.
                        print('error occurred at ', img_name)
                        continue
                    if not is_valid:
                        print('%s is not a valid image' % img_path)
                        continue
                image_key = 'image-%09d'.encode(encoding) % cnt
                cache.append((image_key, image_bin))
                cache.append((label_key, text.encode(encoding)))

            # ``cnt`` only advances for samples that were actually cached,
            # so skipped samples leave no gap in the key sequence.
            if cnt % batch_size == 0:
                write_cache(env, cache)
                cache = []
                print('Written %d / %d' % (cnt, n_samples))
            cnt += 1
        n_samples = cnt - 1
        cache.append(
            ('num-samples'.encode(encoding), str(n_samples).encode(encoding)))
        write_cache(env, cache)
    finally:
        # Always release the environment, even if the conversion fails.
        env.close()
    print('Created lmdb dataset with %d samples' % n_samples)


def main():
    """Parse command-line arguments and run the LMDB conversion.

    Exits with a usage error if neither ``--img-root`` nor ``--label-only``
    is given, since images cannot be located without an image root.
    """
    parser = argparse.ArgumentParser(
        description='Convert a text recognition dataset to LMDB format')
    parser.add_argument('label_path', type=str, help='Path to label file')
    parser.add_argument('output', type=str, help='Output lmdb path')
    parser.add_argument(
        '--img-root', '-i', type=str, help='Path to images')
    parser.add_argument(
        '--label-only',
        action='store_true',
        help='Only convert labels to lmdb')
    parser.add_argument(
        '--label-format',
        '-f',
        default='txt',
        choices=['txt', 'jsonl'],
        help='The format of the label file, either txt or jsonl')
    parser.add_argument(
        '--batch-size',
        '-b',
        type=int,
        default=1000,
        help='Processing batch size, defaults to 1000')
    parser.add_argument(
        '--encoding',
        '-e',
        type=str,
        default='utf8',
        help='Bytes coding scheme, defaults to utf8')
    parser.add_argument(
        '--lmdb-map-size',
        '-m',
        type=int,
        default=1099511627776,
        help='Maximum size database may grow to, '
        'defaults to 1099511627776 bytes (1TB)')
    opt = parser.parse_args()

    # Validate explicitly instead of with ``assert``: asserts are stripped
    # under ``python -O`` and give the user a raw traceback.
    if not (opt.img_root or opt.label_only):
        parser.error('--img-root is required unless --label-only is set')
    recog2lmdb(opt.img_root, opt.label_path, opt.output, opt.label_format,
               opt.label_only, opt.batch_size, opt.encoding, opt.lmdb_map_size)


# Run the converter only when this file is executed as a script.
if __name__ == '__main__':
    main()
[Feature] Add recog2lmdb and new toy dataset files (#979) * loss * fix * add img2lmdb and test files * update * add reference * fix lint * fix typo * use total_numer instead to fit mmocr's lmdbloader * reorganize and update * fix lint * update test file * refactor and update * fix test * update doc in tools * fix lint * update old lmdb test file * update * mask the unittest for recog2lmdb and use json format for label_only * remove if __name__ * fix case, doc, typo, formats * fix typos * fix docs and variable names * Apply suggestions from code review Co-authored-by: Xinyu Wang <45810070+xinke-wang@users.noreply.github.com> * update test_loader.py and fix a bug Co-authored-by: gaotongxiao <gaotongxiao@gmail.com> Co-authored-by: Xinyu Wang <45810070+xinke-wang@users.noreply.github.com> 2022-04-29 22:30:36 +08:00			`# Copyright (c) OpenMMLab. All rights reserved.`
			`import argparse`
update utils 2022-07-12 19:04:11 +08:00			`import json`
			`import os`
			`import os.path as osp`
[Feature] Add recog2lmdb and new toy dataset files (#979) * loss * fix * add img2lmdb and test files * update * add reference * fix lint * fix typo * use total_numer instead to fit mmocr's lmdbloader * reorganize and update * fix lint * update test file * refactor and update * fix test * update doc in tools * fix lint * update old lmdb test file * update * mask the unittest for recog2lmdb and use json format for label_only * remove if __name__ * fix case, doc, typo, formats * fix typos * fix docs and variable names * Apply suggestions from code review Co-authored-by: Xinyu Wang <45810070+xinke-wang@users.noreply.github.com> * update test_loader.py and fix a bug Co-authored-by: gaotongxiao <gaotongxiao@gmail.com> Co-authored-by: Xinyu Wang <45810070+xinke-wang@users.noreply.github.com> 2022-04-29 22:30:36 +08:00
update utils 2022-07-12 19:04:11 +08:00			`import cv2`
			`import lmdb`
			`import numpy as np`

			`from mmocr.utils import list_from_file`


			`def parse_line(line, format):`
			`if format == 'txt':`
			`img_name, text = line.split(' ')`
			`else:`
			`line = json.loads(line)`
			`img_name = line['filename']`
			`text = line['text']`
			`return img_name, text`


			`def check_image_is_valid(imageBin):`
			`if imageBin is None:`
			`return False`
			`imageBuf = np.frombuffer(imageBin, dtype=np.uint8)`
			`img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)`
			`imgH, imgW = img.shape[0], img.shape[1]`
			`if imgH * imgW == 0:`
			`return False`
			`return True`


			`def write_cache(env, cache):`
			`with env.begin(write=True) as txn:`
			`cursor = txn.cursor()`
			`cursor.putmulti(cache, dupdata=False, overwrite=True)`


			`def recog2lmdb(img_root,`
			`label_path,`
			`output,`
			`label_format='txt',`
			`label_only=False,`
			`batch_size=1000,`
			`encoding='utf-8',`
			`lmdb_map_size=1099511627776,`
			`verify=True):`
			`"""Create text recognition dataset to LMDB format.`

			`Args:`
			`img_root (str): Path to images.`
			`label_path (str): Path to label file.`
			`output (str): LMDB output path.`
			`label_format (str): Format of the label file, either txt or jsonl.`
			`label_only (bool): Only convert label to lmdb format.`
			`batch_size (int): Number of files written to the cache each time.`
			`encoding (str): Label encoding method.`
			`lmdb_map_size (int): Maximum size database may grow to.`
			`verify (bool): If true, check the validity of`
			`every image.Defaults to True.`

			`E.g.`
			`This function supports MMOCR's recognition data format and the label file`
			`can be txt or jsonl, as follows:`

			`├──img_root`
			`\| \|—— img1.jpg`
			`\| \|—— img2.jpg`
			`\| \|—— ...`
			`\|——label.txt (or label.jsonl)`

			`label.txt: img1.jpg HELLO`
			`img2.jpg WORLD`
			`...`

			`label.jsonl: {'filename':'img1.jpg', 'text':'HELLO'}`
			`{'filename':'img2.jpg', 'text':'WORLD'}`
			`...`
			`"""`
			`# check label format`
			`assert osp.basename(label_path).split('.')[-1] == label_format`
			`# create lmdb env`
			`os.makedirs(output, exist_ok=True)`
			`env = lmdb.open(output, map_size=lmdb_map_size)`
			`# load label file`
			`anno_list = list_from_file(label_path, encoding=encoding)`
			`cache = []`
			`# index start from 1`
			`cnt = 1`
			`n_samples = len(anno_list)`
			`for anno in anno_list:`
			`label_key = 'label-%09d'.encode(encoding) % cnt`
			`img_name, text = parse_line(anno, label_format)`
			`if label_only:`
			`# convert only labels to lmdb`
			`line = json.dumps(`
			`dict(filename=img_name, text=text), ensure_ascii=False)`
			`cache.append((label_key, line.encode(encoding)))`
			`else:`
			`# convert both images and labels to lmdb`
			`img_path = osp.join(img_root, img_name)`
			`if not osp.exists(img_path):`
			`print('%s does not exist' % img_path)`
			`continue`
			`with open(img_path, 'rb') as f:`
			`image_bin = f.read()`
			`if verify:`
			`try:`
			`if not check_image_is_valid(image_bin):`
			`print('%s is not a valid image' % img_path)`
			`continue`
			`except Exception:`
			`print('error occurred at ', img_name)`
			`image_key = 'image-%09d'.encode(encoding) % cnt`
			`cache.append((image_key, image_bin))`
			`cache.append((label_key, text.encode(encoding)))`

			`if cnt % batch_size == 0:`
			`write_cache(env, cache)`
			`cache = []`
			`print('Written %d / %d' % (cnt, n_samples))`
			`cnt += 1`
			`n_samples = cnt - 1`
			`cache.append(`
			`('num-samples'.encode(encoding), str(n_samples).encode(encoding)))`
			`write_cache(env, cache)`
			`print('Created lmdb dataset with %d samples' % n_samples)`
[Feature] Add recog2lmdb and new toy dataset files (#979) * loss * fix * add img2lmdb and test files * update * add reference * fix lint * fix typo * use total_numer instead to fit mmocr's lmdbloader * reorganize and update * fix lint * update test file * refactor and update * fix test * update doc in tools * fix lint * update old lmdb test file * update * mask the unittest for recog2lmdb and use json format for label_only * remove if __name__ * fix case, doc, typo, formats * fix typos * fix docs and variable names * Apply suggestions from code review Co-authored-by: Xinyu Wang <45810070+xinke-wang@users.noreply.github.com> * update test_loader.py and fix a bug Co-authored-by: gaotongxiao <gaotongxiao@gmail.com> Co-authored-by: Xinyu Wang <45810070+xinke-wang@users.noreply.github.com> 2022-04-29 22:30:36 +08:00

			`def main():`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument('label_path', type=str, help='Path to label file')`
			`parser.add_argument('output', type=str, help='Output lmdb path')`
			`parser.add_argument(`
			`'--img-root', '-i', type=str, help='Input imglist path')`
			`parser.add_argument(`
			`'--label-only',`
			`action='store_true',`
			`help='Only converter label to lmdb')`
			`parser.add_argument(`
			`'--label-format',`
			`'-f',`
			`default='txt',`
			`choices=['txt', 'jsonl'],`
			`help='The format of the label file, either txt or jsonl')`
			`parser.add_argument(`
			`'--batch-size',`
			`'-b',`
			`type=int,`
			`default=1000,`
			`help='Processing batch size, defaults to 1000')`
			`parser.add_argument(`
			`'--encoding',`
			`'-e',`
			`type=str,`
			`default='utf8',`
			`help='Bytes coding scheme, defaults to utf8')`
			`parser.add_argument(`
			`'--lmdb-map-size',`
			`'-m',`
			`type=int,`
[Fix] fix typo of --lmdb-map-size default value (#1147) * fix typo of --lmdb-map-size default value * fix Co-authored-by: gaotongxiao <gaotongxiao@gmail.com> 2022-07-16 21:32:15 +08:00			`default=1099511627776,`
[Feature] Add recog2lmdb and new toy dataset files (#979) * loss * fix * add img2lmdb and test files * update * add reference * fix lint * fix typo * use total_numer instead to fit mmocr's lmdbloader * reorganize and update * fix lint * update test file * refactor and update * fix test * update doc in tools * fix lint * update old lmdb test file * update * mask the unittest for recog2lmdb and use json format for label_only * remove if __name__ * fix case, doc, typo, formats * fix typos * fix docs and variable names * Apply suggestions from code review Co-authored-by: Xinyu Wang <45810070+xinke-wang@users.noreply.github.com> * update test_loader.py and fix a bug Co-authored-by: gaotongxiao <gaotongxiao@gmail.com> Co-authored-by: Xinyu Wang <45810070+xinke-wang@users.noreply.github.com> 2022-04-29 22:30:36 +08:00			`help='Maximum size database may grow to, '`
[Fix] fix typo of --lmdb-map-size default value (#1147) * fix typo of --lmdb-map-size default value * fix Co-authored-by: gaotongxiao <gaotongxiao@gmail.com> 2022-07-16 21:32:15 +08:00			`'defaults to 1099511627776 bytes (1TB)')`
[Feature] Add recog2lmdb and new toy dataset files (#979) * loss * fix * add img2lmdb and test files * update * add reference * fix lint * fix typo * use total_numer instead to fit mmocr's lmdbloader * reorganize and update * fix lint * update test file * refactor and update * fix test * update doc in tools * fix lint * update old lmdb test file * update * mask the unittest for recog2lmdb and use json format for label_only * remove if __name__ * fix case, doc, typo, formats * fix typos * fix docs and variable names * Apply suggestions from code review Co-authored-by: Xinyu Wang <45810070+xinke-wang@users.noreply.github.com> * update test_loader.py and fix a bug Co-authored-by: gaotongxiao <gaotongxiao@gmail.com> Co-authored-by: Xinyu Wang <45810070+xinke-wang@users.noreply.github.com> 2022-04-29 22:30:36 +08:00			`opt = parser.parse_args()`

			`assert opt.img_root or opt.label_only`
			`recog2lmdb(opt.img_root, opt.label_path, opt.output, opt.label_format,`
			`opt.label_only, opt.batch_size, opt.encoding, opt.lmdb_map_size)`


			`if __name__ == '__main__':`
			`main()`