Add RecogDatasets

2022-07-04 04:06:21 +00:00 · 2022-07-04 04:06:21 +00:00 · 83ec5726d6
parent b955df9904
commit 83ec5726d6
16 changed files with 506 additions and 4 deletions
--- a/.dev_scripts/covignore.cfg
+++ b/.dev_scripts/covignore.cfg
@ -27,5 +27,8 @@ mmocr/models/textdet/postprocessors/utils.py
 # It will be removed after all models have been refactored
 mmocr/utils/ocr.py

+# It will be deleted
+mmocr/datasets/utils/backend.py
+mmocr/datasets/utils/loader.py
 # It will be removed after TTA refactor
 mmocr/datasets/pipelines/test_time_aug.py
--- a/mmocr/datasets/__init__.py
+++ b/mmocr/datasets/__init__.py
@ -4,6 +4,11 @@ from .icdar_dataset import IcdarDataset
 from .ocr_dataset import OCRDataset
 from .ocr_seg_dataset import OCRSegDataset
 from .pipelines import *  # NOQA
+from .recog_lmdb_dataset import RecogLMDBDataset
+from .recog_text_dataset import RecogTextDataset
 from .utils import *  # NOQA

-__all__ = ['IcdarDataset', 'OCRDataset', 'OCRSegDataset', 'PARSERS', 'LOADERS']
+__all__ = [
+    'IcdarDataset', 'OCRDataset', 'OCRSegDataset', 'PARSERS', 'LOADERS',
+    'RecogLMDBDataset', 'RecogTextDataset'
+]
--- a/mmocr/datasets/recog_lmdb_dataset.py
+++ b/mmocr/datasets/recog_lmdb_dataset.py
@ -0,0 +1,190 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os.path as osp
+import warnings
+from typing import Callable, List, Optional, Sequence, Union
+
+from mmengine.dataset import BaseDataset
+
+from mmocr.registry import DATASETS, TASK_UTILS
+
+
+@DATASETS.register_module()
+class RecogLMDBDataset(BaseDataset):
+    r"""RecogLMDBDataset for text recognition.
+
+    The annotation format should be in lmdb format. We support two lmdb
+    formats, one is the lmdb file with only labels generated by txt2lmdb
+    (deprecated), and another one is the lmdb file generated by recog2lmdb.
+
+    The former format stores string in `filename text` format directly in lmdb,
+    while the latter uses `image_key` as well as `label_key` for querying.
+
+    Args:
+        ann_file (str): Annotation file path. Defaults to ''.
+        parser_cfg (dict, optional): Config of parser for parsing annotations.
+            Only ``LineJsonParser`` is supported for lmdb annotations; any
+            other parser type raises ``ValueError``. The first key in
+            ``keys`` is used to look up the image path and the second key to
+            look up the text. Defaults to
+            ``dict(type='LineJsonParser', keys=['filename', 'text'])``.
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
+        data_root (str): The root directory for ``data_prefix`` and
+            ``ann_file``. Defaults to ''.
+        data_prefix (dict): Prefix for training data. Defaults to
+            ``dict(img_path='')``.
+        filter_cfg (dict, optional): Config for filter data. Defaults to None.
+        indices (int or Sequence[int], optional): Support using first few
+            data in annotation file to facilitate training/testing on a smaller
+            dataset. Defaults to None which means using all ``data_infos``.
+        serialize_data (bool, optional): Whether to hold memory using
+            serialized objects, when enabled, data loader workers can use
+            shared RAM from master process instead of making a copy. Defaults
+            to True.
+        pipeline (list, optional): Processing pipeline. Defaults to [].
+        test_mode (bool, optional): ``test_mode=True`` means in test phase.
+            Defaults to False.
+        lazy_init (bool, optional): Whether to load annotation during
+            instantiation. In some cases, such as visualization, only the meta
+            information of the dataset is needed, which is not necessary to
+            load annotation file. ``RecogLMDBDataset`` can skip load
+            annotations to save time by set ``lazy_init=True``. Note that the
+            lmdb file is still opened once during instantiation to probe its
+            format. Defaults to False.
+        max_refetch (int, optional): If ``RecogLMDBDataset.prepare_data`` get a
+            None img. The maximum extra number of cycles to get a valid
+            image. Defaults to 1000.
+
+    Raises:
+        ValueError: If ``parser_cfg['type']`` is not ``'LineJsonParser'``.
+        ImportError: If the ``lmdb`` package is not installed.
+    """
+
+    def __init__(self,
+                 ann_file: str = '',
+                 parser_cfg: Optional[dict] = dict(
+                     type='LineJsonParser', keys=['filename', 'text']),
+                 metainfo: Optional[dict] = None,
+                 data_root: Optional[str] = '',
+                 data_prefix: dict = dict(img_path=''),
+                 filter_cfg: Optional[dict] = None,
+                 indices: Optional[Union[int, Sequence[int]]] = None,
+                 serialize_data: bool = True,
+                 pipeline: List[Union[dict, Callable]] = [],
+                 test_mode: bool = False,
+                 lazy_init: bool = False,
+                 max_refetch: int = 1000) -> None:
+        if parser_cfg['type'] != 'LineJsonParser':
+            raise ValueError('We only support using LineJsonParser '
+                             'to parse lmdb file. Please use LineJsonParser '
+                             'in the dataset config')
+        self.parser = TASK_UTILS.build(parser_cfg)
+        self.ann_file = ann_file
+        self.deprecated_format = False
+
+        deprecation_msg = (
+            'DeprecationWarning: The lmdb dataset generated with '
+            'txt2lmdb will be deprecate, please use the latest '
+            'tools/data/utils/recog2lmdb to generate lmdb dataset. '
+            'See https://mmocr.readthedocs.io/en/latest/tools.html#'
+            'convert-text-recognition-dataset-to-lmdb-format for '
+            'details.')
+
+        # NOTE(review): the probe below opens ``ann_file`` exactly as passed
+        # in, i.e. before the base class is initialized. Presumably
+        # ``BaseDataset`` resolves ``ann_file`` against ``data_root`` later;
+        # verify that a non-empty ``data_root`` works as intended.
+        env = self._get_env()
+        try:
+            with env.begin(write=False) as txn:
+                raw_total = txn.get(b'num-samples')
+                if raw_total is None:
+                    # ``num-samples`` is missing, so fall back to the
+                    # ``total_number`` key used by the deprecated layout.
+                    warnings.warn(deprecation_msg, UserWarning)
+                    raw_total = txn.get(b'total_number')
+                    self.deprecated_format = True
+                self.total_number = int(raw_total.decode('utf-8'))
+                # The lmdb file may contain only the label, or it may contain
+                # both the label and the image, so we use image_key here for
+                # probing.
+                image_key = f'image-{1:09d}'
+                self.label_only = txn.get(image_key.encode('utf-8')) is None
+        finally:
+            # Release the probing handle deterministically so that it is not
+            # still open when ``load_data_list`` opens the same lmdb again.
+            env.close()
+
+        super().__init__(
+            ann_file=ann_file,
+            metainfo=metainfo,
+            data_root=data_root,
+            data_prefix=data_prefix,
+            filter_cfg=filter_cfg,
+            indices=indices,
+            serialize_data=serialize_data,
+            pipeline=pipeline,
+            test_mode=test_mode,
+            lazy_init=lazy_init,
+            max_refetch=max_refetch)
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotations from an annotation file named as ``self.ann_file``
+
+        The lmdb environment is opened on first use and kept on ``self.env``.
+
+        Returns:
+            List[dict]: A list of annotation.
+        """
+        if not hasattr(self, 'env'):
+            self.env = self._get_env()
+
+        data_list = []
+        with self.env.begin(write=False) as txn:
+            for i in range(self.total_number):
+                if self.deprecated_format:
+                    # Deprecated layout: the key is the 0-based sample index
+                    # and the value is a ``filename text`` line.
+                    line = txn.get(str(i).encode('utf-8')).decode('utf-8')
+                    # Only drop line breaks. Stripping '/n' would remove the
+                    # characters '/' and 'n' from both ends of the record.
+                    filename, text = line.strip('\n').split(' ')
+                    line = json.dumps(
+                        dict(filename=filename, text=text), ensure_ascii=False)
+                else:
+                    # Current layout: keys are 1-based and zero-padded to
+                    # nine digits.
+                    idx = i + 1
+                    label_key = f'label-{idx:09d}'
+                    label = txn.get(label_key.encode('utf-8')).decode('utf-8')
+                    if self.label_only:
+                        # The stored label is handed to the parser as is;
+                        # presumably it is already a json line -- TODO
+                        # confirm against the lmdb generation tool.
+                        line = label
+                    else:
+                        # The image lives in the same lmdb, so its key is
+                        # used in place of a file name.
+                        img_key = f'image-{idx:09d}'
+                        line = json.dumps(
+                            dict(filename=img_key, text=label),
+                            ensure_ascii=False)
+                data_list.append(self.parse_data_info(line))
+        return data_list
+
+    def parse_data_info(self, raw_anno_info: str) -> Union[dict, List[dict]]:
+        """Parse raw annotation to target format.
+
+        Args:
+            raw_anno_info (str): One raw data information loaded
+                from ``ann_file``.
+
+        Returns:
+            (dict): Parsed annotation with the keys ``img_path`` and
+            ``instances``.
+        """
+        data_info = {}
+        parsed_anno = self.parser(raw_anno_info)
+        # The first parser key addresses the image, the second one the text.
+        img_path = osp.join(self.data_prefix['img_path'],
+                            parsed_anno[self.parser.keys[0]])
+
+        data_info['img_path'] = img_path
+        data_info['instances'] = [dict(text=parsed_anno[self.parser.keys[1]])]
+        return data_info
+
+    def _get_env(self):
+        """Get lmdb environment from self.ann_file.
+
+        Returns:
+            Lmdb environment.
+
+        Raises:
+            ImportError: If ``lmdb`` cannot be imported.
+        """
+        # Imported lazily so that lmdb is only required when this dataset is
+        # actually used.
+        try:
+            import lmdb
+        except ImportError:
+            raise ImportError(
+                'Please install lmdb to enable RecogLMDBDataset.')
+        return lmdb.open(
+            self.ann_file,
+            max_readers=1,
+            readonly=True,
+            lock=False,
+            readahead=False,
+            meminit=False,
+        )
--- a/mmocr/datasets/recog_text_dataset.py
+++ b/mmocr/datasets/recog_text_dataset.py
@ -0,0 +1,137 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import Callable, List, Optional, Sequence, Union
+
+from mmengine.dataset import BaseDataset
+from mmengine.fileio import list_from_file
+
+from mmocr.registry import DATASETS, TASK_UTILS
+
+# TODO: replace all list_from_file from mmengine
+
+
+@DATASETS.register_module()
+class RecogTextDataset(BaseDataset):
+    r"""RecogTextDataset for text recognition.
+
+    The annotation format can be both in jsonl and txt. If the annotation file
+    is in jsonl format, it should be a list of dicts. If the annotation file
+    is in txt format, it should be a list of lines.
+
+    The annotation formats are shown as follows.
+
+    - txt format
+
+    .. code-block:: none
+
+        ``test_img1.jpg OpenMMLab``
+        ``test_img2.jpg MMOCR``
+
+    - jsonl format
+
+    .. code-block:: none
+
+        ``{"filename": "test_img1.jpg", "text": "OpenMMLab"}``
+        ``{"filename": "test_img2.jpg", "text": "MMOCR"}``
+
+    Args:
+        ann_file (str): Annotation file path. Defaults to ''.
+        file_client_args (dict, optional): Arguments to instantiate a
+            FileClient. See :class:`mmengine.fileio.FileClient` for details.
+            Default: None.
+        parser_cfg (dict, optional): Config of parser for parsing annotations.
+            Use ``LineJsonParser`` when the annotation file is in jsonl format
+            with keys of ``filename`` and ``text``. The keys in parser_cfg
+            should be consistent with the keys in jsonl annotations. The first
+            key in parser_cfg should be the key of the path in jsonl
+            annotations. The second key in parser_cfg should be the key of the
+            text in jsonl annotations. Use ``LineStrParser`` when the
+            annotation file is in txt format. Defaults to
+            ``dict(type='LineJsonParser', keys=['filename', 'text'])``.
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
+        data_root (str): The root directory for ``data_prefix`` and
+            ``ann_file``. Defaults to ''.
+        data_prefix (dict): Prefix for training data. Defaults to
+            ``dict(img_path='')``.
+        filter_cfg (dict, optional): Config for filter data. Defaults to None.
+        indices (int or Sequence[int], optional): Support using first few
+            data in annotation file to facilitate training/testing on a smaller
+            dataset. Defaults to None which means using all ``data_infos``.
+        serialize_data (bool, optional): Whether to hold memory using
+            serialized objects, when enabled, data loader workers can use
+            shared RAM from master process instead of making a copy. Defaults
+            to True.
+        pipeline (list, optional): Processing pipeline. Defaults to [].
+        test_mode (bool, optional): ``test_mode=True`` means in test phase.
+            Defaults to False.
+        lazy_init (bool, optional): Whether to load annotation during
+            instantiation. In some cases, such as visualization, only the meta
+            information of the dataset is needed, which is not necessary to
+            load annotation file. ``RecogTextDataset`` can skip load
+            annotations to save time by set ``lazy_init=True``. Defaults to
+            False.
+        max_refetch (int, optional): If ``RecogTextDataset.prepare_data`` get a
+            None img. The maximum extra number of cycles to get a valid
+            image. Defaults to 1000.
+    """
+
+    def __init__(self,
+                 ann_file: str = '',
+                 file_client_args: Optional[dict] = None,
+                 parser_cfg: Optional[dict] = dict(
+                     type='LineJsonParser', keys=['filename', 'text']),
+                 metainfo: Optional[dict] = None,
+                 data_root: Optional[str] = '',
+                 data_prefix: dict = dict(img_path=''),
+                 filter_cfg: Optional[dict] = None,
+                 indices: Optional[Union[int, Sequence[int]]] = None,
+                 serialize_data: bool = True,
+                 pipeline: List[Union[dict, Callable]] = [],
+                 test_mode: bool = False,
+                 lazy_init: bool = False,
+                 max_refetch: int = 1000) -> None:
+
+        # Both attributes are set before the base class is initialized
+        # because ``load_data_list`` reads them.
+        self.parser = TASK_UTILS.build(parser_cfg)
+        self.file_client_args = file_client_args
+        super().__init__(
+            ann_file=ann_file,
+            metainfo=metainfo,
+            data_root=data_root,
+            data_prefix=data_prefix,
+            filter_cfg=filter_cfg,
+            indices=indices,
+            serialize_data=serialize_data,
+            pipeline=pipeline,
+            test_mode=test_mode,
+            lazy_init=lazy_init,
+            max_refetch=max_refetch)
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotations from an annotation file named as ``self.ann_file``
+
+        Returns:
+            List[dict]: A list of annotation.
+        """
+        raw_anno_infos = list_from_file(
+            self.ann_file, file_client_args=self.file_client_args)
+        # Each raw line is turned into one annotation dict.
+        return [
+            self.parse_data_info(raw_anno_info)
+            for raw_anno_info in raw_anno_infos
+        ]
+
+    def parse_data_info(self, raw_anno_info: str) -> dict:
+        """Parse raw annotation to target format.
+
+        Args:
+            raw_anno_info (str): One raw data information loaded
+                from ``ann_file``.
+
+        Returns:
+            (dict): Parsed annotation with the keys ``img_path`` and
+            ``instances``.
+        """
+        data_info = {}
+        parsed_anno = self.parser(raw_anno_info)
+        # The first parser key addresses the image, the second one the text.
+        img_path = osp.join(self.data_prefix['img_path'],
+                            parsed_anno[self.parser.keys[0]])
+
+        data_info['img_path'] = img_path
+        data_info['instances'] = [dict(text=parsed_anno[self.parser.keys[1]])]
+        return data_info
--- a/mmocr/datasets/utils/backend.py
+++ b/mmocr/datasets/utils/backend.py
@ -11,6 +11,7 @@ from mmocr import digit_version
 from mmocr.utils import list_from_file


+# TODO: remove
 class LmdbAnnFileBackend:
    """Lmdb storage backend for annotation file.

--- a/mmocr/datasets/utils/loader.py
+++ b/mmocr/datasets/utils/loader.py
@ -6,6 +6,7 @@ from .backend import (HardDiskAnnFileBackend, HTTPAnnFileBackend,
                      PetrelAnnFileBackend)


+# TODO: remove
@LOADERS.register_module()
 class AnnFileLoader:
    """Annotation file loader to load annotations from ann_file, and parse raw
--- a/mmocr/datasets/utils/parser.py
+++ b/mmocr/datasets/utils/parser.py
@ -3,11 +3,11 @@ import json
 import warnings
 from typing import Dict, Tuple

-from mmocr.datasets.builder import PARSERS
+from mmocr.registry import TASK_UTILS
 from mmocr.utils import StringStrip


-@PARSERS.register_module()
+@TASK_UTILS.register_module()
 class LineStrParser:
    """Parse string of one line in annotation file to dict format.

@ -57,7 +57,7 @@ class LineStrParser:
        return line_info


-@PARSERS.register_module()
+@TASK_UTILS.register_module()
 class LineJsonParser:
    """Parse json-string of one line in annotation file to dict format.

--- a/tests/data/recog_toy_dataset/imgs.lmdb/data.mdb
+++ b/tests/data/recog_toy_dataset/imgs.lmdb/data.mdb
--- a/tests/data/recog_toy_dataset/imgs.lmdb/lock.mdb
+++ b/tests/data/recog_toy_dataset/imgs.lmdb/lock.mdb
--- a/tests/data/recog_toy_dataset/label.lmdb/data.mdb
+++ b/tests/data/recog_toy_dataset/label.lmdb/data.mdb
--- a/tests/data/recog_toy_dataset/label.lmdb/lock.mdb
+++ b/tests/data/recog_toy_dataset/label.lmdb/lock.mdb
--- a/tests/data/recog_toy_dataset/old_label.jsonl
+++ b/tests/data/recog_toy_dataset/old_label.jsonl
@ -0,0 +1,10 @@
+{"filename": "1223731.jpg", "text": "GRAND"}
+{"filename": "1223733.jpg", "text": "HOTEL"}
+{"filename": "1223732.jpg", "text": "HOTEL"}
+{"filename": "1223729.jpg", "text": "PACIFIC"}
+{"filename": "1036169.jpg", "text": "03/09/2009"}
+{"filename": "1190237.jpg", "text": "ANING"}
+{"filename": "1058891.jpg", "text": "Virgin"}
+{"filename": "1058892.jpg", "text": "america"}
+{"filename": "1240078.jpg", "text": "ATTACK"}
+{"filename": "1210236.jpg", "text": "DAVIDSON"}
--- a/tests/data/recog_toy_dataset/old_label.txt
+++ b/tests/data/recog_toy_dataset/old_label.txt
@ -0,0 +1,10 @@
+1223731.jpg GRAND
+1223733.jpg HOTEL
+1223732.jpg HOTEL
+1223729.jpg PACIFIC
+1036169.jpg 03/09/2009
+1190237.jpg ANING
+1058891.jpg Virgin
+1058892.jpg america
+1240078.jpg ATTACK
+1210236.jpg DAVIDSON
--- a/tests/test_datasets/test_parsers.py
+++ b/tests/test_datasets/test_parsers.py
@ -0,0 +1,22 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+from unittest import TestCase
+
+from mmocr.datasets import LineJsonParser, LineStrParser
+
+
+class TestParser(TestCase):
+    """Checks that both line parsers return the expected fields."""
+
+    def test_line_json_parser(self):
+        # A json-encoded line is decoded back into its two fields.
+        raw_line = json.dumps(dict(filename='test.jpg', text='mmocr'))
+        line_parser = LineJsonParser()
+        parsed = line_parser(raw_line)
+        self.assertEqual(parsed['filename'], 'test.jpg')
+        self.assertEqual(parsed['text'], 'mmocr')
+
+    def test_line_str_parser(self):
+        # A space separated line is split into file name and text.
+        raw_line = 'test.jpg mmocr'
+        line_parser = LineStrParser()
+        parsed = line_parser(raw_line)
+        self.assertEqual(parsed['filename'], 'test.jpg')
+        self.assertEqual(parsed['text'], 'mmocr')
--- a/tests/test_datasets/test_pipelines/test_recog_lmdb_dataset.py
+++ b/tests/test_datasets/test_pipelines/test_recog_lmdb_dataset.py
@ -0,0 +1,77 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import tempfile
+from unittest import TestCase
+
+import lmdb
+
+from mmocr.datasets import RecogLMDBDataset
+
+
+class TestRecogLMDBDataset(TestCase):
+    """Tests for ``RecogLMDBDataset`` on all supported lmdb layouts."""
+
+    def create_deprecated_format_lmdb(self, temp_dir):
+        """Write a one-sample lmdb in the deprecated layout.
+
+        The sample is stored under the key ``0`` as ``test test`` and the
+        sample count under the key ``total_number``.
+
+        Args:
+            temp_dir (str): Directory in which the lmdb is created.
+        """
+        os.makedirs(temp_dir, exist_ok=True)
+        env = lmdb.open(temp_dir, map_size=102400)
+        cache = [
+            (str(0).encode('utf-8'), b'test test'),
+            (b'total_number', str(1).encode('utf-8')),
+        ]
+        with env.begin(write=True) as txn:
+            cursor = txn.cursor()
+            cursor.putmulti(cache, dupdata=False, overwrite=True)
+        # Close the writer so that the dataset under test opens the lmdb
+        # without another handle being alive in this process.
+        env.close()
+
+    def test_label_only_dataset(self):
+
+        # test initialization
+        dataset = RecogLMDBDataset(
+            ann_file='tests/data/recog_toy_dataset/label.lmdb',
+            data_prefix=dict(img_path='imgs'),
+            pipeline=[])
+        dataset.full_init()
+        self.assertEqual(len(dataset), 10)
+        self.assertEqual(len(dataset.load_data_list()), 10)
+
+        # test load_data_list
+        anno = dataset.load_data_list()[0]
+        self.assertEqual(anno['img_path'], 'imgs/1223731.jpg')
+        self.assertEqual(anno['instances'][0]['text'], 'GRAND')
+
+    def test_label_and_image_dataset(self):
+
+        # test initialization
+        dataset = RecogLMDBDataset(
+            ann_file='tests/data/recog_toy_dataset/imgs.lmdb',
+            data_prefix=dict(img_path='imgs'),
+            pipeline=[])
+        dataset.full_init()
+        self.assertEqual(len(dataset), 10)
+        self.assertEqual(len(dataset.load_data_list()), 10)
+
+        # test load_data_list
+        anno = dataset.load_data_list()[0]
+        self.assertEqual(anno['img_path'], f'imgs/image-{1:09d}')
+        self.assertEqual(anno['instances'][0]['text'], 'GRAND')
+
+    def test_deprecated_format(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            self.create_deprecated_format_lmdb(
+                os.path.join(tmpdirname, 'data'))
+
+            warm_msg = 'DeprecationWarning: The lmdb dataset generated with '
+            warm_msg += 'txt2lmdb will be deprecate, please use the latest '
+            warm_msg += 'tools/data/utils/recog2lmdb to generate lmdb dataset.'
+            warm_msg += ' See https://mmocr.readthedocs.io/en/'
+            warm_msg += 'latest/tools.html#'
+            warm_msg += 'convert-text-recognition-dataset-to-lmdb-format for '
+            warm_msg += 'details.'
+
+            # The warning is emitted while the dataset is constructed, so the
+            # constructor has to run inside the context manager. Calling
+            # ``assertWarnsRegex`` without a callable and outside a ``with``
+            # statement checks nothing.
+            with self.assertWarnsRegex(UserWarning, warm_msg):
+                dataset = RecogLMDBDataset(
+                    ann_file=os.path.join(tmpdirname, 'data'),
+                    data_prefix=dict(img_path='imgs'),
+                    pipeline=[])
+
+            dataset.full_init()
+            self.assertEqual(len(dataset), 1)
+
+            # The single ``test test`` record is split into name and text.
+            anno = dataset.load_data_list()[0]
+            self.assertEqual(anno['img_path'], 'imgs/test')
+            self.assertEqual(anno['instances'][0]['text'], 'test')
--- a/tests/test_datasets/test_pipelines/test_recog_text_dataset.py
+++ b/tests/test_datasets/test_pipelines/test_recog_text_dataset.py
@ -0,0 +1,46 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+from mmocr.datasets import RecogTextDataset
+
+
+class TestRecogTextDataset(TestCase):
+    """Tests for ``RecogTextDataset`` with txt and jsonl annotations."""
+
+    def _check_toy_dataset(self, dataset):
+        """Run the assertions shared by both annotation formats."""
+        dataset.full_init()
+        self.assertEqual(len(dataset), 10)
+        self.assertEqual(len(dataset.load_data_list()), 10)
+
+        # test load_data_list
+        annos = dataset.load_data_list()
+        first, second = annos[0], annos[1]
+        self.assertEqual(first['img_path'], 'imgs/1223731.jpg')
+        self.assertEqual(first['instances'][0]['text'], 'GRAND')
+        self.assertEqual(second['img_path'], 'imgs/1223733.jpg')
+        self.assertEqual(second['instances'][0]['text'], 'HOTEL')
+
+    def test_txt_dataset(self):
+        # Plain text annotations need the string parser with column indices.
+        str_parser_cfg = dict(
+            type='LineStrParser', keys=['filename', 'text'], keys_idx=[0, 1])
+        txt_dataset = RecogTextDataset(
+            ann_file='tests/data/recog_toy_dataset/old_label.txt',
+            data_prefix=dict(img_path='imgs'),
+            parser_cfg=str_parser_cfg,
+            pipeline=[])
+        self._check_toy_dataset(txt_dataset)
+
+    def test_jsonl_dataset(self):
+        # Json line annotations are read with the json parser.
+        json_parser_cfg = dict(
+            type='LineJsonParser', keys=['filename', 'text'])
+        jsonl_dataset = RecogTextDataset(
+            ann_file='tests/data/recog_toy_dataset/old_label.jsonl',
+            data_prefix=dict(img_path='imgs'),
+            parser_cfg=json_parser_cfg,
+            pipeline=[])
+        self._check_toy_dataset(jsonl_dataset)