Mirror of https://github.com/open-mmlab/mmclassification.git (synced 2025-06-03 21:53:55 +08:00)
[Refactor] Update datasets (#1375)
* add ut
* add places205
* support ann_file without labels
* temp test
* update custom
* update
* update ut
* Update CustomDataset.
* Update Places205.

---------

Co-authored-by: mzr1996 <mzr1996@163.com>
This commit is contained in:
parent 89000c10eb
commit 75c79311f4
mmpretrain/datasets/__init__.py

@@ -10,6 +10,7 @@ from .inshop import InShop
 from .mnist import MNIST, FashionMNIST
 from .multi_label import MultiLabelDataset
 from .multi_task import MultiTaskDataset
+from .places205 import Places205
 from .samplers import *  # noqa: F401,F403
 from .transforms import *  # noqa: F401,F403
 from .voc import VOC
@@ -17,5 +18,6 @@ from .voc import VOC
 __all__ = [
     'BaseDataset', 'ImageNet', 'CIFAR10', 'CIFAR100', 'MNIST', 'FashionMNIST',
     'VOC', 'build_dataset', 'ImageNet21k', 'KFoldDataset', 'CUB',
-    'CustomDataset', 'MultiLabelDataset', 'MultiTaskDataset', 'InShop'
+    'CustomDataset', 'MultiLabelDataset', 'MultiTaskDataset', 'InShop',
+    'Places205'
 ]
mmpretrain/datasets/base_dataset.py

@@ -198,8 +198,6 @@ class BaseDataset(_BaseDataset):
 
         if self.CLASSES is not None:
             body.append(f'Number of categories: \t{len(self.CLASSES)}')
-        else:
-            body.append('The `CLASSES` meta info is not set.')
 
         body.extend(self.extra_repr())
 
mmpretrain/datasets/categories.py

@@ -1096,3 +1096,48 @@ MNIST_CATEGORITES = ('0 - zero', '1 - one', '2 - two', '3 - three', '4 - four',
 FASHIONMNIST_CATEGORITES = ('T-shirt/top', 'Trouser', 'Pullover', 'Dress',
                             'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag',
                             'Ankle boot')
+
+PLACES205_CATEGORIES = (
+    'abbey', 'airport_terminal', 'alley', 'amphitheater', 'amusement_park',
+    'aquarium', 'aqueduct', 'arch', 'art_gallery', 'art_studio',
+    'assembly_line', 'attic', 'auditorium', 'apartment_building/outdoor',
+    'badlands', 'ballroom', 'bamboo_forest', 'banquet_hall', 'bar',
+    'baseball_field', 'basement', 'basilica', 'bayou', 'beauty_salon',
+    'bedroom', 'boardwalk', 'boat_deck', 'bookstore', 'botanical_garden',
+    'bowling_alley', 'boxing_ring', 'bridge', 'building_facade',
+    'bus_interior', 'butchers_shop', 'butte', 'bakery/shop', 'cafeteria',
+    'campsite', 'candy_store', 'canyon', 'castle', 'cemetery', 'chalet',
+    'classroom', 'closet', 'clothing_store', 'coast', 'cockpit', 'coffee_shop',
+    'conference_center', 'conference_room', 'construction_site', 'corn_field',
+    'corridor', 'cottage_garden', 'courthouse', 'courtyard', 'creek',
+    'crevasse', 'crosswalk', 'cathedral/outdoor', 'church/outdoor', 'dam',
+    'dining_room', 'dock', 'dorm_room', 'driveway', 'desert/sand',
+    'desert/vegetation', 'dinette/home', 'doorway/outdoor', 'engine_room',
+    'excavation', 'fairway', 'fire_escape', 'fire_station', 'food_court',
+    'forest_path', 'forest_road', 'formal_garden', 'fountain',
+    'field/cultivated', 'field/wild', 'galley', 'game_room', 'garbage_dump',
+    'gas_station', 'gift_shop', 'golf_course', 'harbor', 'herb_garden',
+    'highway', 'home_office', 'hospital', 'hospital_room', 'hot_spring',
+    'hotel_room', 'hotel/outdoor', 'ice_cream_parlor', 'iceberg', 'igloo',
+    'islet', 'ice_skating_rink/outdoor', 'inn/outdoor', 'jail_cell', 'kasbah',
+    'kindergarden_classroom', 'kitchen', 'kitchenette', 'laundromat',
+    'lighthouse', 'living_room', 'lobby', 'locker_room', 'mansion', 'marsh',
+    'martial_arts_gym', 'mausoleum', 'medina', 'motel', 'mountain',
+    'mountain_snowy', 'music_studio', 'market/outdoor', 'monastery/outdoor',
+    'museum/indoor', 'nursery', 'ocean', 'office', 'office_building',
+    'orchard', 'pagoda', 'palace', 'pantry', 'parking_lot', 'parlor',
+    'pasture', 'patio', 'pavilion', 'phone_booth', 'picnic_area', 'playground',
+    'plaza', 'pond', 'pulpit', 'racecourse', 'raft', 'railroad_track',
+    'rainforest', 'reception', 'residential_neighborhood', 'restaurant',
+    'restaurant_kitchen', 'restaurant_patio', 'rice_paddy', 'river',
+    'rock_arch', 'rope_bridge', 'ruin', 'runway', 'sandbar', 'schoolhouse',
+    'sea_cliff', 'shed', 'shoe_shop', 'shopfront', 'shower', 'ski_resort',
+    'ski_slope', 'sky', 'skyscraper', 'slum', 'snowfield', 'staircase',
+    'supermarket', 'swamp', 'stadium/baseball', 'stadium/football',
+    'stage/indoor', 'subway_station/platform', 'swimming_pool/outdoor',
+    'television_studio', 'topiary_garden', 'tower', 'train_railway',
+    'tree_farm', 'trench', 'temple/east_asia', 'temple/south_asia',
+    'track/outdoor', 'train_station/platform', 'underwater/coral_reef',
+    'valley', 'vegetable_garden', 'veranda', 'viaduct', 'volcano',
+    'waiting_room', 'water_tower', 'watering_hole', 'wheat_field', 'wind_farm',
+    'windmill', 'yard')
mmpretrain/datasets/custom.py

@@ -67,59 +67,78 @@ def get_samples(
     # Pre-build file backend to prevent verbose file backend inference.
     backend = backend or get_file_backend(root, enable_singleton=True)
 
-    for folder_name in sorted(list(folder_to_idx.keys())):
-        _dir = backend.join_path(root, folder_name)
-        files = backend.list_dir_or_file(
-            _dir,
-            list_dir=False,
-            list_file=True,
-            recursive=True,
-        )
-        for file in sorted(list(files)):
-            if is_valid_file(file):
-                path = backend.join_path(folder_name, file)
-                item = (path, folder_to_idx[folder_name])
-                samples.append(item)
-                available_classes.add(folder_name)
-
-    empty_folders = set(folder_to_idx.keys()) - available_classes
+    if folder_to_idx is not None:
+        for folder_name in sorted(list(folder_to_idx.keys())):
+            _dir = backend.join_path(root, folder_name)
+            files = backend.list_dir_or_file(
+                _dir,
+                list_dir=False,
+                list_file=True,
+                recursive=True,
+            )
+            for file in sorted(list(files)):
+                if is_valid_file(file):
+                    path = backend.join_path(folder_name, file)
+                    item = (path, folder_to_idx[folder_name])
+                    samples.append(item)
+                    available_classes.add(folder_name)
+        empty_folders = set(folder_to_idx.keys()) - available_classes
+    else:
+        files = backend.list_dir_or_file(
+            root,
+            list_dir=False,
+            list_file=True,
+            recursive=True,
+        )
+        samples = [file for file in sorted(list(files)) if is_valid_file(file)]
+        empty_folders = None
 
     return samples, empty_folders
 
 
 @DATASETS.register_module()
 class CustomDataset(BaseDataset):
-    """Custom dataset for classification.
+    """A generic dataset for multiple tasks.
 
-    The dataset supports two kinds of annotation format.
+    The dataset supports two kinds of style.
 
-    1. An annotation file is provided, and each line indicates a sample:
+    1. Use an annotation file to specify all samples, and each line indicates a
+       sample:
 
-       The sample files: ::
+       The annotation file (for ``with_label=True``, supervised tasks.): ::
 
-           data_prefix/
-           ├── folder_1
-           │   ├── xxx.png
-           │   ├── xxy.png
-           │   └── ...
-           └── folder_2
-               ├── 123.png
-               ├── nsdf3.png
-               └── ...
+           folder_1/xxx.png 0
+           folder_1/xxy.png 1
+           123.png 4
+           nsdf3.png 3
+           ...
 
-       The annotation file (the first column is the image path and the second
-       column is the index of category): ::
+       The annotation file (for ``with_label=False``, unsupervised tasks.): ::
 
-           folder_1/xxx.png 0
-           folder_1/xxy.png 1
-           folder_2/123.png 5
-           folder_2/nsdf3.png 3
-           ...
+           folder_1/xxx.png
+           folder_1/xxy.png
+           123.png
+           nsdf3.png
+           ...
 
-       Please specify the name of categories by the argument ``classes``
-       or ``metainfo``.
+       Sample files: ::
 
-    2. The samples are arranged in the specific way: ::
+           data_prefix/
+           ├── folder_1
+           │   ├── xxx.png
+           │   ├── xxy.png
+           │   └── ...
+           ├── 123.png
+           ├── nsdf3.png
+           └── ...
+
+       Please use the argument ``metainfo`` to specify extra information for
+       the task, like ``{'classes': ('bird', 'cat', 'deer', 'dog', 'frog')}``.
+
+    2. Place all samples in one folder as below:
+
+       Sample files (for ``with_label=True``, supervised tasks, we use the name
+       of sub-folders as the categories names): ::
 
            data_prefix/
           ├── class_x
@@ -133,18 +152,33 @@ class CustomDataset(BaseDataset):
                ├── ...
                └── asd932_.png
 
+       Sample files (for ``with_label=False``, unsupervised tasks, we use all
+       sample files under the specified folder): ::
+
+           data_prefix/
+           ├── folder_1
+           │   ├── xxx.png
+           │   ├── xxy.png
+           │   └── ...
+           ├── 123.png
+           ├── nsdf3.png
+           └── ...
+
     If the ``ann_file`` is specified, the dataset will be generated by the
     first way, otherwise, try the second way.
 
     Args:
-        ann_file (str): Annotation file path. Defaults to ''.
-        metainfo (dict, optional): Meta information for dataset, such as class
-            information. Defaults to None.
         data_root (str): The root directory for ``data_prefix`` and
            ``ann_file``. Defaults to ''.
         data_prefix (str | dict): Prefix for the data. Defaults to ''.
+        ann_file (str): Annotation file path. Defaults to ''.
+        with_label (bool): Whether the annotation file includes ground truth
+            labels, or use sub-folders to specify categories.
+            Defaults to True.
         extensions (Sequence[str]): A sequence of allowed extensions. Defaults
            to ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif').
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
         lazy_init (bool): Whether to load annotation during instantiation.
            In some cases, such as visualization, only the meta information of
            the dataset is needed, which is not necessary to load annotation
@@ -154,12 +188,13 @@ class CustomDataset(BaseDataset):
     """
 
     def __init__(self,
-                 ann_file: str = '',
-                 metainfo: Optional[dict] = None,
                  data_root: str = '',
                  data_prefix: Union[str, dict] = '',
+                 ann_file: str = '',
+                 with_label=True,
                  extensions: Sequence[str] = ('.jpg', '.jpeg', '.png', '.ppm',
                                               '.bmp', '.pgm', '.tif'),
+                 metainfo: Optional[dict] = None,
                  lazy_init: bool = False,
                  **kwargs):
         assert (ann_file or data_prefix or data_root), \
@@ -167,6 +202,7 @@ class CustomDataset(BaseDataset):
             'be specified.'
 
         self.extensions = tuple(set([i.lower() for i in extensions]))
+        self.with_label = with_label
 
         super().__init__(
             # The base class requires string ann_file but this class doesn't
@@ -184,26 +220,35 @@ class CustomDataset(BaseDataset):
 
     def _find_samples(self):
         """find samples from ``data_prefix``."""
-        classes, folder_to_idx = find_folders(self.img_prefix)
-        samples, empty_classes = get_samples(
-            self.img_prefix,
-            folder_to_idx,
-            is_valid_file=self.is_valid_file,
-        )
+        if self.with_label:
+            classes, folder_to_idx = find_folders(self.img_prefix)
+            samples, empty_classes = get_samples(
+                self.img_prefix,
+                folder_to_idx,
+                is_valid_file=self.is_valid_file,
+            )
+
+            self.folder_to_idx = folder_to_idx
+
+            if self.CLASSES is not None:
+                assert len(self.CLASSES) == len(classes), \
+                    f"The number of subfolders ({len(classes)}) doesn't " \
+                    f'match the number of specified classes ' \
+                    f'({len(self.CLASSES)}). Please check the data folder.'
+            else:
+                self._metainfo['classes'] = tuple(classes)
+        else:
+            samples, empty_classes = get_samples(
+                self.img_prefix,
+                None,
+                is_valid_file=self.is_valid_file,
+            )
+
         if len(samples) == 0:
             raise RuntimeError(
                 f'Found 0 files in subfolders of: {self.data_prefix}. '
                 f'Supported extensions are: {",".join(self.extensions)}')
 
-        if self.CLASSES is not None:
-            assert len(self.CLASSES) == len(classes), \
-                f"The number of subfolders ({len(classes)}) doesn't match " \
-                f'the number of specified classes ({len(self.CLASSES)}). ' \
-                'Please check the data folder.'
-        else:
-            self._metainfo['classes'] = tuple(classes)
-
         if empty_classes:
             logger = MMLogger.get_current_instance()
             logger.warning(
@@ -211,24 +256,29 @@ class CustomDataset(BaseDataset):
                 f'{", ".join(empty_classes)}. '
                 f"Supported extensions are: {', '.join(self.extensions)}")
 
-        self.folder_to_idx = folder_to_idx
-
         return samples
 
     def load_data_list(self):
         """Load image paths and gt_labels."""
         if not self.ann_file:
             samples = self._find_samples()
-        else:
+        elif self.with_label:
             lines = list_from_file(self.ann_file)
             samples = [x.strip().rsplit(' ', 1) for x in lines]
+        else:
+            samples = list_from_file(self.ann_file)
 
         # Pre-build file backend to prevent verbose file backend inference.
         backend = get_file_backend(self.img_prefix, enable_singleton=True)
         data_list = []
-        for filename, gt_label in samples:
-            img_path = backend.join_path(self.img_prefix, filename)
-            info = {'img_path': img_path, 'gt_label': int(gt_label)}
+        for sample in samples:
+            if self.with_label:
+                filename, gt_label = sample
+                img_path = backend.join_path(self.img_prefix, filename)
+                info = {'img_path': img_path, 'gt_label': int(gt_label)}
+            else:
+                img_path = backend.join_path(self.img_prefix, sample)
+                info = {'img_path': img_path}
             data_list.append(info)
         return data_list
 
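After this change, CustomDataset can index unlabeled data either from an annotation file that lists only image paths or directly from a folder. A minimal usage sketch of the new with_label=False mode (illustration only, not part of the diff; the data/custom paths are placeholders):

    from mmpretrain.datasets import CustomDataset

    # Annotation file whose lines contain only image paths (no labels).
    unlabeled_from_ann = CustomDataset(
        data_root='data/custom',        # placeholder path
        ann_file='meta/unlabeled.txt',  # one relative image path per line
        with_label=False,
    )

    # No annotation file: recursively collect every valid image under the prefix.
    unlabeled_from_folder = CustomDataset(
        data_prefix='data/custom/images',  # placeholder path
        with_label=False,
    )

    # Each item only carries 'img_path'; no 'gt_label' is attached.
    print(len(unlabeled_from_folder), unlabeled_from_folder.get_data_info(0))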
mmpretrain/datasets/imagenet.py

@@ -16,12 +16,12 @@ class ImageNet(CustomDataset):
     found in :class:`CustomDataset`.
 
     Args:
-        ann_file (str): Annotation file path. Defaults to ''.
-        metainfo (dict, optional): Meta information for dataset, such as class
-            information. Defaults to None.
         data_root (str): The root directory for ``data_prefix`` and
            ``ann_file``. Defaults to ''.
         data_prefix (str | dict): Prefix for training data. Defaults to ''.
+        ann_file (str): Annotation file path. Defaults to ''.
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
         **kwargs: Other keyword arguments in :class:`CustomDataset` and
            :class:`BaseDataset`.
     """  # noqa: E501
@@ -30,17 +30,17 @@ class ImageNet(CustomDataset):
     METAINFO = {'classes': IMAGENET_CATEGORIES}
 
     def __init__(self,
-                 ann_file: str = '',
-                 metainfo: Optional[dict] = None,
                  data_root: str = '',
                  data_prefix: Union[str, dict] = '',
+                 ann_file: str = '',
+                 metainfo: Optional[dict] = None,
                  **kwargs):
         kwargs = {'extensions': self.IMG_EXTENSIONS, **kwargs}
         super().__init__(
-            ann_file=ann_file,
-            metainfo=metainfo,
             data_root=data_root,
             data_prefix=data_prefix,
+            ann_file=ann_file,
+            metainfo=metainfo,
             **kwargs)
 
 
@@ -53,12 +53,12 @@ class ImageNet21k(CustomDataset):
     specify it from the ``classes`` argument.
 
     Args:
-        ann_file (str): Annotation file path. Defaults to ''.
-        metainfo (dict, optional): Meta information for dataset, such as class
-            information. Defaults to None.
         data_root (str): The root directory for ``data_prefix`` and
            ``ann_file``. Defaults to ''.
         data_prefix (str | dict): Prefix for training data. Defaults to ''.
+        ann_file (str): Annotation file path. Defaults to ''.
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
         multi_label (bool): Not implement by now. Use multi label or not.
            Defaults to False.
         **kwargs: Other keyword arguments in :class:`CustomDataset` and
@@ -68,10 +68,10 @@ class ImageNet21k(CustomDataset):
     IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif')
 
     def __init__(self,
-                 ann_file: str = '',
-                 metainfo: Optional[dict] = None,
                  data_root: str = '',
                  data_prefix: Union[str, dict] = '',
+                 ann_file: str = '',
+                 metainfo: Optional[dict] = None,
                  multi_label: bool = False,
                  **kwargs):
         if multi_label:
@@ -89,10 +89,10 @@ class ImageNet21k(CustomDataset):
 
         kwargs = {'extensions': self.IMG_EXTENSIONS, **kwargs}
         super().__init__(
-            ann_file=ann_file,
-            metainfo=metainfo,
             data_root=data_root,
             data_prefix=data_prefix,
+            ann_file=ann_file,
+            metainfo=metainfo,
             **kwargs)
 
         if self.CLASSES is None:
mmpretrain/datasets/places205.py (new file, 40 lines)

@@ -0,0 +1,40 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Union
+
+from mmpretrain.registry import DATASETS
+from .categories import PLACES205_CATEGORIES
+from .custom import CustomDataset
+
+
+@DATASETS.register_module()
+class Places205(CustomDataset):
+    """`Places205 <http://places.csail.mit.edu/downloadData.html>`_ Dataset.
+
+    Args:
+        data_root (str): The root directory for ``data_prefix`` and
+            ``ann_file``. Defaults to ''.
+        data_prefix (str | dict): Prefix for training data. Defaults
+            to ''.
+        ann_file (str): Annotation file path. Defaults to ''.
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
+        **kwargs: Other keyword arguments in :class:`CustomDataset` and
+            :class:`BaseDataset`.
+    """
+
+    IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif')
+    METAINFO = {'classes': PLACES205_CATEGORIES}
+
+    def __init__(self,
+                 data_root: str = '',
+                 data_prefix: Union[str, dict] = '',
+                 ann_file: str = '',
+                 metainfo: Optional[dict] = None,
+                 **kwargs):
+        kwargs = {'extensions': self.IMG_EXTENSIONS, **kwargs}
+        super().__init__(
+            data_root=data_root,
+            data_prefix=data_prefix,
+            ann_file=ann_file,
+            metainfo=metainfo,
+            **kwargs)
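Places205 is a thin wrapper over CustomDataset that ships its 205 category names as METAINFO, so it is constructed like any other folder/annotation dataset. A minimal sketch (illustration only, not part of the diff; the paths and annotation file name are placeholders):

    from mmpretrain.datasets import Places205

    train_set = Places205(
        data_root='data/places205',   # placeholder root directory
        data_prefix='images',         # images live under data_root/images
        ann_file='meta/train.txt',    # placeholder list of "<path> <label>" lines
    )
    print(len(train_set.CLASSES))  # 205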
tests/data/dataset/3.jpeg (new file, 0 lines)

tests/data/dataset/ann_without_labels.txt (new file, 3 lines)

@@ -0,0 +1,3 @@
+a/1.JPG
+b/2.jpeg
+b/subb/3.jpg
@@ -73,8 +73,6 @@ class TestBaseDataset(TestCase):
             num_classes = len(dataset.CLASSES)
             self.assertIn(f'Number of categories: \t{num_classes}',
                           repr(dataset))
-        else:
-            self.assertIn('The `CLASSES` meta info is not set.', repr(dataset))
 
         self.assertIn('Haven\'t been initialized', repr(dataset))
         dataset.full_init()
@@ -148,6 +146,30 @@ class TestCustomDataset(TestBaseDataset):
                 'gt_label': 1
             }.items())
 
+        # test load without ann_file and without labels
+        # (no specific folder structures)
+        cfg = {
+            **self.DEFAULT_ARGS,
+            'data_prefix': ASSETS_ROOT,
+            'ann_file': '',
+            'with_label': False,
+        }
+        dataset = dataset_class(**cfg)
+        self.assertEqual(len(dataset), 4)
+        self.assertIsNone(dataset.CLASSES, None)
+        self.assertGreaterEqual(
+            dataset.get_data_info(0).items(), {
+                'img_path': osp.join(ASSETS_ROOT, '3.jpeg'),
+            }.items())
+        self.assertGreaterEqual(
+            dataset.get_data_info(1).items(), {
+                'img_path': osp.join(ASSETS_ROOT, 'a', '1.JPG'),
+            }.items())
+        self.assertGreaterEqual(
+            dataset.get_data_info(3).items(), {
+                'img_path': osp.join(ASSETS_ROOT, 'b', 'subb', '3.jpg'),
+            }.items())
+
         # test ann_file assertion
         cfg = {
             **self.DEFAULT_ARGS,
@@ -201,6 +223,27 @@ class TestCustomDataset(TestBaseDataset):
                 'gt_label': 1
             }.items())
 
+        # test load with absolute ann_file and without label
+        cfg = {
+            **self.DEFAULT_ARGS,
+            'data_root': '',
+            'data_prefix': '',
+            'ann_file': osp.join(ASSETS_ROOT, 'ann_without_labels.txt'),
+            'with_label': False,
+        }
+        dataset = dataset_class(**cfg)
+        self.assertEqual(len(dataset), 3)
+        # custom dataset won't infer CLASSES from ann_file
+        self.assertIsNone(dataset.CLASSES, None)
+        self.assertGreaterEqual(
+            dataset.get_data_info(0).items(), {
+                'img_path': 'a/1.JPG',
+            }.items())
+        self.assertGreaterEqual(
+            dataset.get_data_info(2).items(), {
+                'img_path': 'b/subb/3.jpg',
+            }.items())
+
         # test extensions filter
         cfg = {
             **self.DEFAULT_ARGS, 'data_prefix': dict(img_path=ASSETS_ROOT),
@@ -302,6 +345,36 @@ class TestImageNet21k(TestCustomDataset):
             self.assertIn('specify the `classes`', log.output[0])
 
 
+class TestPlaces205(TestCustomDataset):
+    DATASET_TYPE = 'Places205'
+
+    DEFAULT_ARGS = dict(data_root=ASSETS_ROOT, ann_file='ann.txt')
+
+    def test_load_data_list(self):
+        dataset_class = DATASETS.get(self.DATASET_TYPE)
+
+        # test classes number
+        cfg = {
+            **self.DEFAULT_ARGS,
+            'data_prefix': ASSETS_ROOT,
+            'ann_file': '',
+        }
+        with self.assertRaisesRegex(AssertionError,
+                                    r"\(2\) doesn't match .* classes \(205\)"):
+            dataset_class(**cfg)
+
+        # test override classes
+        cfg = {
+            **self.DEFAULT_ARGS,
+            'data_prefix': ASSETS_ROOT,
+            'classes': ['cat', 'dog'],
+            'ann_file': '',
+        }
+        dataset = dataset_class(**cfg)
+        self.assertEqual(len(dataset), 3)
+        self.assertEqual(dataset.CLASSES, ('cat', 'dog'))
+
+
 class TestCIFAR10(TestBaseDataset):
     DATASET_TYPE = 'CIFAR10'
 