[Refactor] Update datasets (#1375)

* add ut * add places205 * support ann_file without labels * temp test * update custom * update * update ut * Update CustomDataset. * Update Places205. --------- Co-authored-by: mzr1996 <mzr1996@163.com>
2025-06-03 21:53:55 +08:00 · 2023-02-27 15:42:22 +08:00 · 2023-02-27 15:42:22 +08:00 · 75c79311f4
commit 75c79311f4
parent 89000c10eb
9 changed files with 285 additions and 74 deletions
--- a/mmpretrain/datasets/__init__.py
+++ b/mmpretrain/datasets/__init__.py
@@ -10,6 +10,7 @@ from .inshop import InShop
 from .mnist import MNIST, FashionMNIST
 from .multi_label import MultiLabelDataset
 from .multi_task import MultiTaskDataset
+from .places205 import Places205
 from .samplers import *  # noqa: F401,F403
 from .transforms import *  # noqa: F401,F403
 from .voc import VOC
@@ -17,5 +18,6 @@ from .voc import VOC
 __all__ = [
    'BaseDataset', 'ImageNet', 'CIFAR10', 'CIFAR100', 'MNIST', 'FashionMNIST',
    'VOC', 'build_dataset', 'ImageNet21k', 'KFoldDataset', 'CUB',
-    'CustomDataset', 'MultiLabelDataset', 'MultiTaskDataset', 'InShop'
+    'CustomDataset', 'MultiLabelDataset', 'MultiTaskDataset', 'InShop',
+    'Places205'
 ]
--- a/mmpretrain/datasets/base_dataset.py
+++ b/mmpretrain/datasets/base_dataset.py
@@ -198,8 +198,6 @@ class BaseDataset(_BaseDataset):

        if self.CLASSES is not None:
            body.append(f'Number of categories: \t{len(self.CLASSES)}')
-        else:
-            body.append('The `CLASSES` meta info is not set.')

        body.extend(self.extra_repr())

--- a/mmpretrain/datasets/categories.py
+++ b/mmpretrain/datasets/categories.py
@@ -1096,3 +1096,48 @@ MNIST_CATEGORITES = ('0 - zero', '1 - one', '2 - two', '3 - three', '4 - four',
 FASHIONMNIST_CATEGORITES = ('T-shirt/top', 'Trouser', 'Pullover', 'Dress',
                            'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag',
                            'Ankle boot')
+
+PLACES205_CATEGORIES = (  # class names referenced by Places205.METAINFO
+    'abbey', 'airport_terminal', 'alley', 'amphitheater', 'amusement_park',
+    'aquarium', 'aqueduct', 'arch', 'art_gallery', 'art_studio',
+    'assembly_line', 'attic', 'auditorium', 'apartment_building/outdoor',
+    'badlands', 'ballroom', 'bamboo_forest', 'banquet_hall', 'bar',
+    'baseball_field', 'basement', 'basilica', 'bayou', 'beauty_salon',
+    'bedroom', 'boardwalk', 'boat_deck', 'bookstore', 'botanical_garden',
+    'bowling_alley', 'boxing_ring', 'bridge', 'building_facade',
+    'bus_interior', 'butchers_shop', 'butte', 'bakery/shop', 'cafeteria',
+    'campsite', 'candy_store', 'canyon', 'castle', 'cemetery', 'chalet',
+    'classroom', 'closet', 'clothing_store', 'coast', 'cockpit', 'coffee_shop',
+    'conference_center', 'conference_room', 'construction_site', 'corn_field',
+    'corridor', 'cottage_garden', 'courthouse', 'courtyard', 'creek',
+    'crevasse', 'crosswalk', 'cathedral/outdoor', 'church/outdoor', 'dam',
+    'dining_room', 'dock', 'dorm_room', 'driveway', 'desert/sand',
+    'desert/vegetation', 'dinette/home', 'doorway/outdoor', 'engine_room',
+    'excavation', 'fairway', 'fire_escape', 'fire_station', 'food_court',
+    'forest_path', 'forest_road', 'formal_garden', 'fountain',
+    'field/cultivated', 'field/wild', 'galley', 'game_room', 'garbage_dump',
+    'gas_station', 'gift_shop', 'golf_course', 'harbor', 'herb_garden',
+    'highway', 'home_office', 'hospital', 'hospital_room', 'hot_spring',
+    'hotel_room', 'hotel/outdoor', 'ice_cream_parlor', 'iceberg', 'igloo',
+    'islet', 'ice_skating_rink/outdoor', 'inn/outdoor', 'jail_cell', 'kasbah',
+    'kindergarden_classroom', 'kitchen', 'kitchenette', 'laundromat',
+    'lighthouse', 'living_room', 'lobby', 'locker_room', 'mansion', 'marsh',
+    'martial_arts_gym', 'mausoleum', 'medina', 'motel', 'mountain',
+    'mountain_snowy', 'music_studio', 'market/outdoor', 'monastery/outdoor',
+    'museum/indoor', 'nursery', 'ocean', 'office', 'office_building',
+    'orchard', 'pagoda', 'palace', 'pantry', 'parking_lot', 'parlor',
+    'pasture', 'patio', 'pavilion', 'phone_booth', 'picnic_area', 'playground',
+    'plaza', 'pond', 'pulpit', 'racecourse', 'raft', 'railroad_track',
+    'rainforest', 'reception', 'residential_neighborhood', 'restaurant',
+    'restaurant_kitchen', 'restaurant_patio', 'rice_paddy', 'river',
+    'rock_arch', 'rope_bridge', 'ruin', 'runway', 'sandbar', 'schoolhouse',
+    'sea_cliff', 'shed', 'shoe_shop', 'shopfront', 'shower', 'ski_resort',
+    'ski_slope', 'sky', 'skyscraper', 'slum', 'snowfield', 'staircase',
+    'supermarket', 'swamp', 'stadium/baseball', 'stadium/football',
+    'stage/indoor', 'subway_station/platform', 'swimming_pool/outdoor',
+    'television_studio', 'topiary_garden', 'tower', 'train_railway',
+    'tree_farm', 'trench', 'temple/east_asia', 'temple/south_asia',
+    'track/outdoor', 'train_station/platform', 'underwater/coral_reef',
+    'valley', 'vegetable_garden', 'veranda', 'viaduct', 'volcano',
+    'waiting_room', 'water_tower', 'watering_hole', 'wheat_field', 'wind_farm',
+    'windmill', 'yard')
--- a/mmpretrain/datasets/custom.py
+++ b/mmpretrain/datasets/custom.py
@@ -67,59 +67,78 @@ def get_samples(
    # Pre-build file backend to prevent verbose file backend inference.
    backend = backend or get_file_backend(root, enable_singleton=True)

-    for folder_name in sorted(list(folder_to_idx.keys())):
-        _dir = backend.join_path(root, folder_name)
+    if folder_to_idx is not None:
+        for folder_name in sorted(list(folder_to_idx.keys())):
+            _dir = backend.join_path(root, folder_name)
+            files = backend.list_dir_or_file(
+                _dir,
+                list_dir=False,
+                list_file=True,
+                recursive=True,
+            )
+            for file in sorted(list(files)):
+                if is_valid_file(file):
+                    path = backend.join_path(folder_name, file)
+                    item = (path, folder_to_idx[folder_name])
+                    samples.append(item)
+                    available_classes.add(folder_name)
+        empty_folders = set(folder_to_idx.keys()) - available_classes
+    else:
        files = backend.list_dir_or_file(
-            _dir,
+            root,
            list_dir=False,
            list_file=True,
            recursive=True,
        )
-        for file in sorted(list(files)):
-            if is_valid_file(file):
-                path = backend.join_path(folder_name, file)
-                item = (path, folder_to_idx[folder_name])
-                samples.append(item)
-                available_classes.add(folder_name)
-
-    empty_folders = set(folder_to_idx.keys()) - available_classes
+        samples = [file for file in sorted(list(files)) if is_valid_file(file)]
+        empty_folders = None

    return samples, empty_folders


@DATASETS.register_module()
 class CustomDataset(BaseDataset):
-    """Custom dataset for classification.
+    """A generic dataset for multiple tasks.

-    The dataset supports two kinds of annotation format.
+    The dataset supports two kinds of style.

-    1. An annotation file is provided, and each line indicates a sample:
+    1. Use an annotation file to specify all samples, and each line indicates a
+       sample:

-       The sample files: ::
+       The annotation file (for ``with_label=True``, supervised tasks.): ::
+
+           folder_1/xxx.png 0
+           folder_1/xxy.png 1
+           123.png 4
+           nsdf3.png 3
+           ...
+
+       The annotation file (for ``with_label=False``, unsupervised tasks.): ::
+
+           folder_1/xxx.png
+           folder_1/xxy.png
+           123.png
+           nsdf3.png
+           ...
+
+       Sample files: ::

           data_prefix/
           ├── folder_1
           │   ├── xxx.png
           │   ├── xxy.png
           │   └── ...
-           └── folder_2
-               ├── 123.png
-               ├── nsdf3.png
-               └── ...
+           ├── 123.png
+           ├── nsdf3.png
+           └── ...

-       The annotation file (the first column is the image path and the second
-       column is the index of category): ::
+       Please use the argument ``metainfo`` to specify extra information for
+       the task, like ``{'classes': ('bird', 'cat', 'deer', 'dog', 'frog')}``.

-            folder_1/xxx.png 0
-            folder_1/xxy.png 1
-            folder_2/123.png 5
-            folder_2/nsdf3.png 3
-            ...
+    2. Place all samples in one folder as below:

-       Please specify the name of categories by the argument ``classes``
-       or ``metainfo``.
-
-    2. The samples are arranged in the specific way: ::
+       Sample files (for ``with_label=True``, supervised tasks, we use the name
+       of sub-folders as the categories names): ::

           data_prefix/
           ├── class_x
@@ -133,18 +152,33 @@ class CustomDataset(BaseDataset):
               ├── ...
               └── asd932_.png

+       Sample files (for ``with_label=False``, unsupervised tasks, we use all
+       sample files under the specified folder): ::
+
+           data_prefix/
+           ├── folder_1
+           │   ├── xxx.png
+           │   ├── xxy.png
+           │   └── ...
+           ├── 123.png
+           ├── nsdf3.png
+           └── ...
+
    If the ``ann_file`` is specified, the dataset will be generated by the
    first way, otherwise, try the second way.

    Args:
-        ann_file (str): Annotation file path. Defaults to ''.
-        metainfo (dict, optional): Meta information for dataset, such as class
-            information. Defaults to None.
        data_root (str): The root directory for ``data_prefix`` and
            ``ann_file``. Defaults to ''.
        data_prefix (str | dict): Prefix for the data. Defaults to ''.
+        ann_file (str): Annotation file path. Defaults to ''.
+        with_label (bool): Whether the annotation file includes ground truth
+            labels, or use sub-folders to specify categories.
+            Defaults to True.
        extensions (Sequence[str]): A sequence of allowed extensions. Defaults
            to ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif').
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
        lazy_init (bool): Whether to load annotation during instantiation.
            In some cases, such as visualization, only the meta information of
            the dataset is needed, which is not necessary to load annotation
@@ -154,12 +188,13 @@ class CustomDataset(BaseDataset):
    """

    def __init__(self,
-                 ann_file: str = '',
-                 metainfo: Optional[dict] = None,
                 data_root: str = '',
                 data_prefix: Union[str, dict] = '',
+                 ann_file: str = '',
+                 with_label=True,
                 extensions: Sequence[str] = ('.jpg', '.jpeg', '.png', '.ppm',
                                              '.bmp', '.pgm', '.tif'),
+                 metainfo: Optional[dict] = None,
                 lazy_init: bool = False,
                 **kwargs):
        assert (ann_file or data_prefix or data_root), \
@@ -167,6 +202,7 @@ class CustomDataset(BaseDataset):
            'be specified.'

        self.extensions = tuple(set([i.lower() for i in extensions]))
+        self.with_label = with_label

        super().__init__(
            # The base class requires string ann_file but this class doesn't
@@ -184,26 +220,35 @@ class CustomDataset(BaseDataset):

    def _find_samples(self):
        """find samples from ``data_prefix``."""
-        classes, folder_to_idx = find_folders(self.img_prefix)
-        samples, empty_classes = get_samples(
-            self.img_prefix,
-            folder_to_idx,
-            is_valid_file=self.is_valid_file,
-        )
+        if self.with_label:
+            classes, folder_to_idx = find_folders(self.img_prefix)
+            samples, empty_classes = get_samples(
+                self.img_prefix,
+                folder_to_idx,
+                is_valid_file=self.is_valid_file,
+            )
+
+            self.folder_to_idx = folder_to_idx
+
+            if self.CLASSES is not None:
+                assert len(self.CLASSES) == len(classes), \
+                    f"The number of subfolders ({len(classes)}) doesn't " \
+                    f'match the number of specified classes ' \
+                    f'({len(self.CLASSES)}). Please check the data folder.'
+            else:
+                self._metainfo['classes'] = tuple(classes)
+        else:
+            samples, empty_classes = get_samples(
+                self.img_prefix,
+                None,
+                is_valid_file=self.is_valid_file,
+            )

        if len(samples) == 0:
            raise RuntimeError(
                f'Found 0 files in subfolders of: {self.data_prefix}. '
                f'Supported extensions are: {",".join(self.extensions)}')

-        if self.CLASSES is not None:
-            assert len(self.CLASSES) == len(classes), \
-                f"The number of subfolders ({len(classes)}) doesn't match " \
-                f'the number of specified classes ({len(self.CLASSES)}). ' \
-                'Please check the data folder.'
-        else:
-            self._metainfo['classes'] = tuple(classes)
-
        if empty_classes:
            logger = MMLogger.get_current_instance()
            logger.warning(
@@ -211,24 +256,29 @@ class CustomDataset(BaseDataset):
                f'{", ".join(empty_classes)}. '
                f"Supported extensions are: {', '.join(self.extensions)}")

-        self.folder_to_idx = folder_to_idx
-
        return samples

    def load_data_list(self):
        """Load image paths and gt_labels."""
        if not self.ann_file:
            samples = self._find_samples()
-        else:
+        elif self.with_label:  # each line: "<path> <label>"
            lines = list_from_file(self.ann_file)
            samples = [x.strip().rsplit(' ', 1) for x in lines]
+        else:
+            samples = list_from_file(self.ann_file)  # each line: "<path>"

        # Pre-build file backend to prevent verbose file backend inference.
        backend = get_file_backend(self.img_prefix, enable_singleton=True)
        data_list = []
-        for filename, gt_label in samples:
-            img_path = backend.join_path(self.img_prefix, filename)
-            info = {'img_path': img_path, 'gt_label': int(gt_label)}
+        for sample in samples:
+            if self.with_label:
+                filename, gt_label = sample
+                img_path = backend.join_path(self.img_prefix, filename)
+                info = {'img_path': img_path, 'gt_label': int(gt_label)}
+            else:
+                img_path = backend.join_path(self.img_prefix, sample)
+                info = {'img_path': img_path}  # no 'gt_label' key
            data_list.append(info)
        return data_list

--- a/mmpretrain/datasets/imagenet.py
+++ b/mmpretrain/datasets/imagenet.py
@@ -16,12 +16,12 @@ class ImageNet(CustomDataset):
    found in :class:`CustomDataset`.

    Args:
-        ann_file (str): Annotation file path. Defaults to ''.
-        metainfo (dict, optional): Meta information for dataset, such as class
-            information. Defaults to None.
        data_root (str): The root directory for ``data_prefix`` and
            ``ann_file``. Defaults to ''.
        data_prefix (str | dict): Prefix for training data. Defaults to ''.
+        ann_file (str): Annotation file path. Defaults to ''.
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
        **kwargs: Other keyword arguments in :class:`CustomDataset` and
            :class:`BaseDataset`.
    """  # noqa: E501
@@ -30,17 +30,17 @@ class ImageNet(CustomDataset):
    METAINFO = {'classes': IMAGENET_CATEGORIES}

    def __init__(self,
-                 ann_file: str = '',
-                 metainfo: Optional[dict] = None,
                 data_root: str = '',
                 data_prefix: Union[str, dict] = '',
+                 ann_file: str = '',
+                 metainfo: Optional[dict] = None,
                 **kwargs):
        kwargs = {'extensions': self.IMG_EXTENSIONS, **kwargs}
        super().__init__(
-            ann_file=ann_file,
-            metainfo=metainfo,
            data_root=data_root,
            data_prefix=data_prefix,
+            ann_file=ann_file,
+            metainfo=metainfo,
            **kwargs)


@@ -53,12 +53,12 @@ class ImageNet21k(CustomDataset):
    specify it from the ``classes`` argument.

    Args:
-        ann_file (str): Annotation file path. Defaults to ''.
-        metainfo (dict, optional): Meta information for dataset, such as class
-            information. Defaults to None.
        data_root (str): The root directory for ``data_prefix`` and
            ``ann_file``. Defaults to ''.
        data_prefix (str | dict): Prefix for training data. Defaults to ''.
+        ann_file (str): Annotation file path. Defaults to ''.
+        metainfo (dict, optional): Meta information for dataset, such as class
+            information. Defaults to None.
        multi_label (bool): Not implement by now. Use multi label or not.
            Defaults to False.
        **kwargs: Other keyword arguments in :class:`CustomDataset` and
@@ -68,10 +68,10 @@ class ImageNet21k(CustomDataset):
    IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif')

    def __init__(self,
-                 ann_file: str = '',
-                 metainfo: Optional[dict] = None,
                 data_root: str = '',
                 data_prefix: Union[str, dict] = '',
+                 ann_file: str = '',
+                 metainfo: Optional[dict] = None,
                 multi_label: bool = False,
                 **kwargs):
        if multi_label:
@@ -89,10 +89,10 @@ class ImageNet21k(CustomDataset):

        kwargs = {'extensions': self.IMG_EXTENSIONS, **kwargs}
        super().__init__(
-            ann_file=ann_file,
-            metainfo=metainfo,
            data_root=data_root,
            data_prefix=data_prefix,
+            ann_file=ann_file,
+            metainfo=metainfo,
            **kwargs)

        if self.CLASSES is None:
--- a/mmpretrain/datasets/places205.py
+++ b/mmpretrain/datasets/places205.py
@@ -0,0 +1,40 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Union
+
+from mmpretrain.registry import DATASETS
+from .categories import PLACES205_CATEGORIES
+from .custom import CustomDataset
+
+
+@DATASETS.register_module()
+class Places205(CustomDataset):
+    """`Places205 <http://places.csail.mit.edu/downloadData.html>`_ Dataset.
+
+    A :class:`CustomDataset` subclass that sets ``METAINFO['classes']`` to
+    ``PLACES205_CATEGORIES`` and falls back to ``IMG_EXTENSIONS`` when the
+    caller does not pass ``extensions``.
+
+    Args:
+        data_root (str): Root directory for ``data_prefix`` and ``ann_file``.
+            Defaults to ''.
+        data_prefix (str | dict): Prefix for the data. Defaults to ''.
+        ann_file (str): Annotation file path. Defaults to ''.
+        metainfo (dict, optional): Meta information for the dataset, such as
+            class information. Defaults to None.
+        **kwargs: Forwarded to :class:`CustomDataset` and
+            :class:`BaseDataset`.
+    """
+
+    IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif')
+    METAINFO = {'classes': PLACES205_CATEGORIES}
+
+    def __init__(self,
+                 data_root: str = '',
+                 data_prefix: Union[str, dict] = '',
+                 ann_file: str = '',
+                 metainfo: Optional[dict] = None,
+                 **kwargs):
+        # Only supply the default when the caller gave no ``extensions``.
+        kwargs.setdefault('extensions', self.IMG_EXTENSIONS)
+        super().__init__(data_root=data_root, data_prefix=data_prefix,
+                         ann_file=ann_file, metainfo=metainfo, **kwargs)
--- a/tests/data/dataset/3.jpeg
+++ b/tests/data/dataset/3.jpeg
--- a/tests/data/dataset/ann_without_labels.txt
+++ b/tests/data/dataset/ann_without_labels.txt
@@ -0,0 +1,3 @@
+a/1.JPG
+b/2.jpeg
+b/subb/3.jpg
--- a/tests/test_datasets/test_datasets.py
+++ b/tests/test_datasets/test_datasets.py
@@ -73,8 +73,6 @@ class TestBaseDataset(TestCase):
            num_classes = len(dataset.CLASSES)
            self.assertIn(f'Number of categories: \t{num_classes}',
                          repr(dataset))
-        else:
-            self.assertIn('The `CLASSES` meta info is not set.', repr(dataset))

        self.assertIn('Haven\'t been initialized', repr(dataset))
        dataset.full_init()
@@ -148,6 +146,30 @@ class TestCustomDataset(TestBaseDataset):
                'gt_label': 1
            }.items())

+        # test load without ann_file and without labels
+        # (no specific folder structures)
+        cfg = {
+            **self.DEFAULT_ARGS,
+            'data_prefix': ASSETS_ROOT,
+            'ann_file': '',
+            'with_label': False,
+        }
+        dataset = dataset_class(**cfg)
+        self.assertEqual(len(dataset), 4)
+        self.assertIsNone(dataset.CLASSES, None)
+        self.assertGreaterEqual(
+            dataset.get_data_info(0).items(), {
+                'img_path': osp.join(ASSETS_ROOT, '3.jpeg'),
+            }.items())
+        self.assertGreaterEqual(
+            dataset.get_data_info(1).items(), {
+                'img_path': osp.join(ASSETS_ROOT, 'a', '1.JPG'),
+            }.items())
+        self.assertGreaterEqual(
+            dataset.get_data_info(3).items(), {
+                'img_path': osp.join(ASSETS_ROOT, 'b', 'subb', '3.jpg'),
+            }.items())
+
        # test ann_file assertion
        cfg = {
            **self.DEFAULT_ARGS,
@@ -201,6 +223,27 @@ class TestCustomDataset(TestBaseDataset):
                'gt_label': 1
            }.items())

+        # test load with absolute ann_file and without label
+        cfg = {
+            **self.DEFAULT_ARGS,
+            'data_root': '',
+            'data_prefix': '',
+            'ann_file': osp.join(ASSETS_ROOT, 'ann_without_labels.txt'),
+            'with_label': False,
+        }
+        dataset = dataset_class(**cfg)
+        self.assertEqual(len(dataset), 3)
+        # custom dataset won't infer CLASSES from ann_file
+        self.assertIsNone(dataset.CLASSES, None)
+        self.assertGreaterEqual(
+            dataset.get_data_info(0).items(), {
+                'img_path': 'a/1.JPG',
+            }.items())
+        self.assertGreaterEqual(
+            dataset.get_data_info(2).items(), {
+                'img_path': 'b/subb/3.jpg',
+            }.items())
+
        # test extensions filter
        cfg = {
            **self.DEFAULT_ARGS, 'data_prefix': dict(img_path=ASSETS_ROOT),
@@ -302,6 +345,36 @@ class TestImageNet21k(TestCustomDataset):
        self.assertIn('specify the `classes`', log.output[0])


+class TestPlaces205(TestCustomDataset):
+    """Folder-style loading checks specific to ``Places205``."""
+
+    DATASET_TYPE = 'Places205'
+    DEFAULT_ARGS = dict(data_root=ASSETS_ROOT, ann_file='ann.txt')
+
+    def test_load_data_list(self):
+        # Covers the class-count assertion and the ``classes`` override.
+        places_cls = DATASETS.get(self.DATASET_TYPE)
+
+        # An empty ``ann_file`` makes the dataset scan ``data_prefix``.
+        folder_cfg = dict(
+            self.DEFAULT_ARGS,
+            data_prefix=ASSETS_ROOT,
+            ann_file='',
+        )
+
+        # The asset folder has 2 sub-folders while the built-in metainfo
+        # lists 205 classes, so folder discovery must fail the assertion.
+        mismatch = r"\(2\) doesn't match .* classes \(205\)"
+        with self.assertRaisesRegex(AssertionError, mismatch):
+            places_cls(**folder_cfg)
+
+        # Explicit ``classes`` replace the built-in category names.
+        override_cfg = dict(folder_cfg, classes=['cat', 'dog'])
+        dataset = places_cls(**override_cfg)
+        self.assertEqual(len(dataset), 3)
+        self.assertEqual(dataset.CLASSES, ('cat', 'dog'))
+
+
 class TestCIFAR10(TestBaseDataset):
    DATASET_TYPE = 'CIFAR10'