diff --git a/mmpretrain/datasets/__init__.py b/mmpretrain/datasets/__init__.py index 93059ae4e..f1cadc293 100644 --- a/mmpretrain/datasets/__init__.py +++ b/mmpretrain/datasets/__init__.py @@ -10,6 +10,7 @@ from .inshop import InShop from .mnist import MNIST, FashionMNIST from .multi_label import MultiLabelDataset from .multi_task import MultiTaskDataset +from .places205 import Places205 from .samplers import * # noqa: F401,F403 from .transforms import * # noqa: F401,F403 from .voc import VOC @@ -17,5 +18,6 @@ from .voc import VOC __all__ = [ 'BaseDataset', 'ImageNet', 'CIFAR10', 'CIFAR100', 'MNIST', 'FashionMNIST', 'VOC', 'build_dataset', 'ImageNet21k', 'KFoldDataset', 'CUB', - 'CustomDataset', 'MultiLabelDataset', 'MultiTaskDataset', 'InShop' + 'CustomDataset', 'MultiLabelDataset', 'MultiTaskDataset', 'InShop', + 'Places205' ] diff --git a/mmpretrain/datasets/base_dataset.py b/mmpretrain/datasets/base_dataset.py index bd8763503..ab9f8e5ac 100644 --- a/mmpretrain/datasets/base_dataset.py +++ b/mmpretrain/datasets/base_dataset.py @@ -198,8 +198,6 @@ class BaseDataset(_BaseDataset): if self.CLASSES is not None: body.append(f'Number of categories: \t{len(self.CLASSES)}') - else: - body.append('The `CLASSES` meta info is not set.') body.extend(self.extra_repr()) diff --git a/mmpretrain/datasets/categories.py b/mmpretrain/datasets/categories.py index 03ea58f43..2ef85346a 100644 --- a/mmpretrain/datasets/categories.py +++ b/mmpretrain/datasets/categories.py @@ -1096,3 +1096,48 @@ MNIST_CATEGORITES = ('0 - zero', '1 - one', '2 - two', '3 - three', '4 - four', FASHIONMNIST_CATEGORITES = ('T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot') + +PLACES205_CATEGORIES = ( + 'abbey', 'airport_terminal', 'alley', 'amphitheater', 'amusement_park', + 'aquarium', 'aqueduct', 'arch', 'art_gallery', 'art_studio', + 'assembly_line', 'attic', 'auditorium', 'apartment_building/outdoor', + 'badlands', 'ballroom', 'bamboo_forest', 
'banquet_hall', 'bar', + 'baseball_field', 'basement', 'basilica', 'bayou', 'beauty_salon', + 'bedroom', 'boardwalk', 'boat_deck', 'bookstore', 'botanical_garden', + 'bowling_alley', 'boxing_ring', 'bridge', 'building_facade', + 'bus_interior', 'butchers_shop', 'butte', 'bakery/shop', 'cafeteria', + 'campsite', 'candy_store', 'canyon', 'castle', 'cemetery', 'chalet', + 'classroom', 'closet', 'clothing_store', 'coast', 'cockpit', 'coffee_shop', + 'conference_center', 'conference_room', 'construction_site', 'corn_field', + 'corridor', 'cottage_garden', 'courthouse', 'courtyard', 'creek', + 'crevasse', 'crosswalk', 'cathedral/outdoor', 'church/outdoor', 'dam', + 'dining_room', 'dock', 'dorm_room', 'driveway', 'desert/sand', + 'desert/vegetation', 'dinette/home', 'doorway/outdoor', 'engine_room', + 'excavation', 'fairway', 'fire_escape', 'fire_station', 'food_court', + 'forest_path', 'forest_road', 'formal_garden', 'fountain', + 'field/cultivated', 'field/wild', 'galley', 'game_room', 'garbage_dump', + 'gas_station', 'gift_shop', 'golf_course', 'harbor', 'herb_garden', + 'highway', 'home_office', 'hospital', 'hospital_room', 'hot_spring', + 'hotel_room', 'hotel/outdoor', 'ice_cream_parlor', 'iceberg', 'igloo', + 'islet', 'ice_skating_rink/outdoor', 'inn/outdoor', 'jail_cell', 'kasbah', + 'kindergarden_classroom', 'kitchen', 'kitchenette', 'laundromat', + 'lighthouse', 'living_room', 'lobby', 'locker_room', 'mansion', 'marsh', + 'martial_arts_gym', 'mausoleum', 'medina', 'motel', 'mountain', + 'mountain_snowy', 'music_studio', 'market/outdoor', 'monastery/outdoor', + 'museum/indoor', 'nursery', 'ocean', 'office', 'office_building', + 'orchard', 'pagoda', 'palace', 'pantry', 'parking_lot', 'parlor', + 'pasture', 'patio', 'pavilion', 'phone_booth', 'picnic_area', 'playground', + 'plaza', 'pond', 'pulpit', 'racecourse', 'raft', 'railroad_track', + 'rainforest', 'reception', 'residential_neighborhood', 'restaurant', + 'restaurant_kitchen', 'restaurant_patio', 'rice_paddy', 
'river', + 'rock_arch', 'rope_bridge', 'ruin', 'runway', 'sandbar', 'schoolhouse', + 'sea_cliff', 'shed', 'shoe_shop', 'shopfront', 'shower', 'ski_resort', + 'ski_slope', 'sky', 'skyscraper', 'slum', 'snowfield', 'staircase', + 'supermarket', 'swamp', 'stadium/baseball', 'stadium/football', + 'stage/indoor', 'subway_station/platform', 'swimming_pool/outdoor', + 'television_studio', 'topiary_garden', 'tower', 'train_railway', + 'tree_farm', 'trench', 'temple/east_asia', 'temple/south_asia', + 'track/outdoor', 'train_station/platform', 'underwater/coral_reef', + 'valley', 'vegetable_garden', 'veranda', 'viaduct', 'volcano', + 'waiting_room', 'water_tower', 'watering_hole', 'wheat_field', 'wind_farm', + 'windmill', 'yard') diff --git a/mmpretrain/datasets/custom.py b/mmpretrain/datasets/custom.py index 3ed40b3d3..bb491ff0c 100644 --- a/mmpretrain/datasets/custom.py +++ b/mmpretrain/datasets/custom.py @@ -67,59 +67,78 @@ def get_samples( # Pre-build file backend to prevent verbose file backend inference. 
backend = backend or get_file_backend(root, enable_singleton=True) - for folder_name in sorted(list(folder_to_idx.keys())): - _dir = backend.join_path(root, folder_name) + if folder_to_idx is not None: + for folder_name in sorted(list(folder_to_idx.keys())): + _dir = backend.join_path(root, folder_name) + files = backend.list_dir_or_file( + _dir, + list_dir=False, + list_file=True, + recursive=True, + ) + for file in sorted(list(files)): + if is_valid_file(file): + path = backend.join_path(folder_name, file) + item = (path, folder_to_idx[folder_name]) + samples.append(item) + available_classes.add(folder_name) + empty_folders = set(folder_to_idx.keys()) - available_classes + else: files = backend.list_dir_or_file( - _dir, + root, list_dir=False, list_file=True, recursive=True, ) - for file in sorted(list(files)): - if is_valid_file(file): - path = backend.join_path(folder_name, file) - item = (path, folder_to_idx[folder_name]) - samples.append(item) - available_classes.add(folder_name) - - empty_folders = set(folder_to_idx.keys()) - available_classes + samples = [file for file in sorted(list(files)) if is_valid_file(file)] + empty_folders = None return samples, empty_folders @DATASETS.register_module() class CustomDataset(BaseDataset): - """Custom dataset for classification. + """A generic dataset for multiple tasks. - The dataset supports two kinds of annotation format. + The dataset supports two kinds of style. - 1. An annotation file is provided, and each line indicates a sample: + 1. Use an annotation file to specify all samples, and each line indicates a + sample: - The sample files: :: + The annotation file (for ``with_label=True``, supervised tasks.): :: + + folder_1/xxx.png 0 + folder_1/xxy.png 1 + 123.png 4 + nsdf3.png 3 + ... + + The annotation file (for ``with_label=False``, unsupervised tasks.): :: + + folder_1/xxx.png + folder_1/xxy.png + 123.png + nsdf3.png + ... + + Sample files: :: data_prefix/ ├── folder_1 │ ├── xxx.png │ ├── xxy.png │ └── ... 
- └── folder_2 - ├── 123.png - ├── nsdf3.png - └── ... + ├── 123.png + ├── nsdf3.png + └── ... - The annotation file (the first column is the image path and the second - column is the index of category): :: + Please use the argument ``metainfo`` to specify extra information for + the task, like ``{'classes': ('bird', 'cat', 'deer', 'dog', 'frog')}``. - folder_1/xxx.png 0 - folder_1/xxy.png 1 - folder_2/123.png 5 - folder_2/nsdf3.png 3 - ... + 2. Place all samples in one folder as below: - Please specify the name of categories by the argument ``classes`` - or ``metainfo``. - - 2. The samples are arranged in the specific way: :: + Sample files (for ``with_label=True``, supervised tasks, we use the name + of sub-folders as the categories names): :: data_prefix/ ├── class_x @@ -133,18 +152,33 @@ class CustomDataset(BaseDataset): ├── ... └── asd932_.png + Sample files (for ``with_label=False``, unsupervised tasks, we use all + sample files under the specified folder): :: + + data_prefix/ + ├── folder_1 + │ ├── xxx.png + │ ├── xxy.png + │ └── ... + ├── 123.png + ├── nsdf3.png + └── ... + If the ``ann_file`` is specified, the dataset will be generated by the first way, otherwise, try the second way. Args: - ann_file (str): Annotation file path. Defaults to ''. - metainfo (dict, optional): Meta information for dataset, such as class - information. Defaults to None. data_root (str): The root directory for ``data_prefix`` and ``ann_file``. Defaults to ''. data_prefix (str | dict): Prefix for the data. Defaults to ''. + ann_file (str): Annotation file path. Defaults to ''. + with_label (bool): Whether the annotation file includes ground truth + labels, or use sub-folders to specify categories. + Defaults to True. extensions (Sequence[str]): A sequence of allowed extensions. Defaults to ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif'). + metainfo (dict, optional): Meta information for dataset, such as class + information. Defaults to None. 
lazy_init (bool): Whether to load annotation during instantiation. In some cases, such as visualization, only the meta information of the dataset is needed, which is not necessary to load annotation @@ -154,12 +188,13 @@ class CustomDataset(BaseDataset): """ def __init__(self, - ann_file: str = '', - metainfo: Optional[dict] = None, data_root: str = '', data_prefix: Union[str, dict] = '', + ann_file: str = '', + with_label=True, extensions: Sequence[str] = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif'), + metainfo: Optional[dict] = None, lazy_init: bool = False, **kwargs): assert (ann_file or data_prefix or data_root), \ @@ -167,6 +202,7 @@ class CustomDataset(BaseDataset): 'be specified.' self.extensions = tuple(set([i.lower() for i in extensions])) + self.with_label = with_label super().__init__( # The base class requires string ann_file but this class doesn't @@ -184,26 +220,35 @@ class CustomDataset(BaseDataset): def _find_samples(self): """find samples from ``data_prefix``.""" - classes, folder_to_idx = find_folders(self.img_prefix) - samples, empty_classes = get_samples( - self.img_prefix, - folder_to_idx, - is_valid_file=self.is_valid_file, - ) + if self.with_label: + classes, folder_to_idx = find_folders(self.img_prefix) + samples, empty_classes = get_samples( + self.img_prefix, + folder_to_idx, + is_valid_file=self.is_valid_file, + ) + + self.folder_to_idx = folder_to_idx + + if self.CLASSES is not None: + assert len(self.CLASSES) == len(classes), \ + f"The number of subfolders ({len(classes)}) doesn't " \ + f'match the number of specified classes ' \ + f'({len(self.CLASSES)}). Please check the data folder.' + else: + self._metainfo['classes'] = tuple(classes) + else: + samples, empty_classes = get_samples( + self.img_prefix, + None, + is_valid_file=self.is_valid_file, + ) if len(samples) == 0: raise RuntimeError( f'Found 0 files in subfolders of: {self.data_prefix}. 
' f'Supported extensions are: {",".join(self.extensions)}') - if self.CLASSES is not None: - assert len(self.CLASSES) == len(classes), \ - f"The number of subfolders ({len(classes)}) doesn't match " \ - f'the number of specified classes ({len(self.CLASSES)}). ' \ - 'Please check the data folder.' - else: - self._metainfo['classes'] = tuple(classes) - if empty_classes: logger = MMLogger.get_current_instance() logger.warning( @@ -211,24 +256,29 @@ class CustomDataset(BaseDataset): f'{", ".join(empty_classes)}. ' f"Supported extensions are: {', '.join(self.extensions)}") - self.folder_to_idx = folder_to_idx - return samples def load_data_list(self): """Load image paths and gt_labels.""" if not self.ann_file: samples = self._find_samples() - else: + elif self.with_label: lines = list_from_file(self.ann_file) samples = [x.strip().rsplit(' ', 1) for x in lines] + else: + samples = list_from_file(self.ann_file) # Pre-build file backend to prevent verbose file backend inference. backend = get_file_backend(self.img_prefix, enable_singleton=True) data_list = [] - for filename, gt_label in samples: - img_path = backend.join_path(self.img_prefix, filename) - info = {'img_path': img_path, 'gt_label': int(gt_label)} + for sample in samples: + if self.with_label: + filename, gt_label = sample + img_path = backend.join_path(self.img_prefix, filename) + info = {'img_path': img_path, 'gt_label': int(gt_label)} + else: + img_path = backend.join_path(self.img_prefix, sample) + info = {'img_path': img_path} data_list.append(info) return data_list diff --git a/mmpretrain/datasets/imagenet.py b/mmpretrain/datasets/imagenet.py index e1a8619ef..e309d3af7 100644 --- a/mmpretrain/datasets/imagenet.py +++ b/mmpretrain/datasets/imagenet.py @@ -16,12 +16,12 @@ class ImageNet(CustomDataset): found in :class:`CustomDataset`. Args: - ann_file (str): Annotation file path. Defaults to ''. - metainfo (dict, optional): Meta information for dataset, such as class - information. Defaults to None. 
data_root (str): The root directory for ``data_prefix`` and ``ann_file``. Defaults to ''. data_prefix (str | dict): Prefix for training data. Defaults to ''. + ann_file (str): Annotation file path. Defaults to ''. + metainfo (dict, optional): Meta information for dataset, such as class + information. Defaults to None. **kwargs: Other keyword arguments in :class:`CustomDataset` and :class:`BaseDataset`. """ # noqa: E501 @@ -30,17 +30,17 @@ class ImageNet(CustomDataset): METAINFO = {'classes': IMAGENET_CATEGORIES} def __init__(self, - ann_file: str = '', - metainfo: Optional[dict] = None, data_root: str = '', data_prefix: Union[str, dict] = '', + ann_file: str = '', + metainfo: Optional[dict] = None, **kwargs): kwargs = {'extensions': self.IMG_EXTENSIONS, **kwargs} super().__init__( - ann_file=ann_file, - metainfo=metainfo, data_root=data_root, data_prefix=data_prefix, + ann_file=ann_file, + metainfo=metainfo, **kwargs) @@ -53,12 +53,12 @@ class ImageNet21k(CustomDataset): specify it from the ``classes`` argument. Args: - ann_file (str): Annotation file path. Defaults to ''. - metainfo (dict, optional): Meta information for dataset, such as class - information. Defaults to None. data_root (str): The root directory for ``data_prefix`` and ``ann_file``. Defaults to ''. data_prefix (str | dict): Prefix for training data. Defaults to ''. + ann_file (str): Annotation file path. Defaults to ''. + metainfo (dict, optional): Meta information for dataset, such as class + information. Defaults to None. multi_label (bool): Not implement by now. Use multi label or not. Defaults to False. 
**kwargs: Other keyword arguments in :class:`CustomDataset` and @@ -68,10 +68,10 @@ class ImageNet21k(CustomDataset): IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif') def __init__(self, - ann_file: str = '', - metainfo: Optional[dict] = None, data_root: str = '', data_prefix: Union[str, dict] = '', + ann_file: str = '', + metainfo: Optional[dict] = None, multi_label: bool = False, **kwargs): if multi_label: @@ -89,10 +89,10 @@ class ImageNet21k(CustomDataset): kwargs = {'extensions': self.IMG_EXTENSIONS, **kwargs} super().__init__( - ann_file=ann_file, - metainfo=metainfo, data_root=data_root, data_prefix=data_prefix, + ann_file=ann_file, + metainfo=metainfo, **kwargs) if self.CLASSES is None: diff --git a/mmpretrain/datasets/places205.py b/mmpretrain/datasets/places205.py new file mode 100644 index 000000000..f3ba1ff63 --- /dev/null +++ b/mmpretrain/datasets/places205.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Union + +from mmpretrain.registry import DATASETS +from .categories import PLACES205_CATEGORIES +from .custom import CustomDataset + + +@DATASETS.register_module() +class Places205(CustomDataset): + """`Places205 <http://places.csail.mit.edu/downloadData.html>`_ Dataset. + + Args: + data_root (str): The root directory for ``data_prefix`` and + ``ann_file``. Defaults to ''. + data_prefix (str | dict): Prefix for training data. Defaults + to ''. + ann_file (str): Annotation file path. Defaults to ''. + metainfo (dict, optional): Meta information for dataset, such as class + information. Defaults to None. + **kwargs: Other keyword arguments in :class:`CustomDataset` and + :class:`BaseDataset`. 
+ """ + + IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif') + METAINFO = {'classes': PLACES205_CATEGORIES} + + def __init__(self, + data_root: str = '', + data_prefix: Union[str, dict] = '', + ann_file: str = '', + metainfo: Optional[dict] = None, + **kwargs): + kwargs = {'extensions': self.IMG_EXTENSIONS, **kwargs} + super().__init__( + data_root=data_root, + data_prefix=data_prefix, + ann_file=ann_file, + metainfo=metainfo, + **kwargs) diff --git a/tests/data/dataset/3.jpeg b/tests/data/dataset/3.jpeg new file mode 100644 index 000000000..e69de29bb diff --git a/tests/data/dataset/ann_without_labels.txt b/tests/data/dataset/ann_without_labels.txt new file mode 100644 index 000000000..ea467ca52 --- /dev/null +++ b/tests/data/dataset/ann_without_labels.txt @@ -0,0 +1,3 @@ +a/1.JPG +b/2.jpeg +b/subb/3.jpg diff --git a/tests/test_datasets/test_datasets.py b/tests/test_datasets/test_datasets.py index 488750eff..a21976a1b 100644 --- a/tests/test_datasets/test_datasets.py +++ b/tests/test_datasets/test_datasets.py @@ -73,8 +73,6 @@ class TestBaseDataset(TestCase): num_classes = len(dataset.CLASSES) self.assertIn(f'Number of categories: \t{num_classes}', repr(dataset)) - else: - self.assertIn('The `CLASSES` meta info is not set.', repr(dataset)) self.assertIn('Haven\'t been initialized', repr(dataset)) dataset.full_init() @@ -148,6 +146,30 @@ class TestCustomDataset(TestBaseDataset): 'gt_label': 1 }.items()) + # test load without ann_file and without labels + # (no specific folder structures) + cfg = { + **self.DEFAULT_ARGS, + 'data_prefix': ASSETS_ROOT, + 'ann_file': '', + 'with_label': False, + } + dataset = dataset_class(**cfg) + self.assertEqual(len(dataset), 4) + self.assertIsNone(dataset.CLASSES, None) + self.assertGreaterEqual( + dataset.get_data_info(0).items(), { + 'img_path': osp.join(ASSETS_ROOT, '3.jpeg'), + }.items()) + self.assertGreaterEqual( + dataset.get_data_info(1).items(), { + 'img_path': osp.join(ASSETS_ROOT, 'a', '1.JPG'), + 
}.items()) + self.assertGreaterEqual( + dataset.get_data_info(3).items(), { + 'img_path': osp.join(ASSETS_ROOT, 'b', 'subb', '3.jpg'), + }.items()) + # test ann_file assertion cfg = { **self.DEFAULT_ARGS, @@ -201,6 +223,27 @@ class TestCustomDataset(TestBaseDataset): 'gt_label': 1 }.items()) + # test load with absolute ann_file and without label + cfg = { + **self.DEFAULT_ARGS, + 'data_root': '', + 'data_prefix': '', + 'ann_file': osp.join(ASSETS_ROOT, 'ann_without_labels.txt'), + 'with_label': False, + } + dataset = dataset_class(**cfg) + self.assertEqual(len(dataset), 3) + # custom dataset won't infer CLASSES from ann_file + self.assertIsNone(dataset.CLASSES, None) + self.assertGreaterEqual( + dataset.get_data_info(0).items(), { + 'img_path': 'a/1.JPG', + }.items()) + self.assertGreaterEqual( + dataset.get_data_info(2).items(), { + 'img_path': 'b/subb/3.jpg', + }.items()) + # test extensions filter cfg = { **self.DEFAULT_ARGS, 'data_prefix': dict(img_path=ASSETS_ROOT), @@ -302,6 +345,36 @@ class TestImageNet21k(TestCustomDataset): self.assertIn('specify the `classes`', log.output[0]) +class TestPlaces205(TestCustomDataset): + DATASET_TYPE = 'Places205' + + DEFAULT_ARGS = dict(data_root=ASSETS_ROOT, ann_file='ann.txt') + + def test_load_data_list(self): + dataset_class = DATASETS.get(self.DATASET_TYPE) + + # test classes number + cfg = { + **self.DEFAULT_ARGS, + 'data_prefix': ASSETS_ROOT, + 'ann_file': '', + } + with self.assertRaisesRegex(AssertionError, + r"\(2\) doesn't match .* classes \(205\)"): + dataset_class(**cfg) + + # test override classes + cfg = { + **self.DEFAULT_ARGS, + 'data_prefix': ASSETS_ROOT, + 'classes': ['cat', 'dog'], + 'ann_file': '', + } + dataset = dataset_class(**cfg) + self.assertEqual(len(dataset), 3) + self.assertEqual(dataset.CLASSES, ('cat', 'dog')) + + class TestCIFAR10(TestBaseDataset): DATASET_TYPE = 'CIFAR10'