[Refactor] Update datasets (#1375)

* add ut

* add places205

* support ann_file without labels

* temp test

* update custom

* update

* update ut

* Update CustomDataset.

* Update Places205.

---------

Co-authored-by: mzr1996 <mzr1996@163.com>
This commit is contained in:
Yixiao Fang 2023-02-27 15:42:22 +08:00 committed by GitHub
parent 89000c10eb
commit 75c79311f4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 285 additions and 74 deletions

View File

@ -10,6 +10,7 @@ from .inshop import InShop
from .mnist import MNIST, FashionMNIST
from .multi_label import MultiLabelDataset
from .multi_task import MultiTaskDataset
from .places205 import Places205
from .samplers import * # noqa: F401,F403
from .transforms import * # noqa: F401,F403
from .voc import VOC
@ -17,5 +18,6 @@ from .voc import VOC
__all__ = [
'BaseDataset', 'ImageNet', 'CIFAR10', 'CIFAR100', 'MNIST', 'FashionMNIST',
'VOC', 'build_dataset', 'ImageNet21k', 'KFoldDataset', 'CUB',
'CustomDataset', 'MultiLabelDataset', 'MultiTaskDataset', 'InShop'
'CustomDataset', 'MultiLabelDataset', 'MultiTaskDataset', 'InShop',
'Places205'
]

View File

@ -198,8 +198,6 @@ class BaseDataset(_BaseDataset):
if self.CLASSES is not None:
body.append(f'Number of categories: \t{len(self.CLASSES)}')
else:
body.append('The `CLASSES` meta info is not set.')
body.extend(self.extra_repr())

View File

@ -1096,3 +1096,48 @@ MNIST_CATEGORITES = ('0 - zero', '1 - one', '2 - two', '3 - three', '4 - four',
FASHIONMNIST_CATEGORITES = ('T-shirt/top', 'Trouser', 'Pullover', 'Dress',
'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag',
'Ankle boot')
# Category names of the Places205 scene dataset (205 entries in total).
# ``Places205.METAINFO`` exposes this tuple as the dataset's ``classes`` meta
# info. Within each initial letter the plain names come first, followed by the
# slash-qualified ones (e.g. ``'auditorium'`` before
# ``'apartment_building/outdoor'``), so the tuple is NOT in strict
# alphabetical order.
# NOTE(review): presumably the position of each name is the integer label used
# by the annotation files, so entries must not be re-sorted or renamed --
# confirm against the official Places205 category list.
PLACES205_CATEGORIES = (
'abbey', 'airport_terminal', 'alley', 'amphitheater', 'amusement_park',
'aquarium', 'aqueduct', 'arch', 'art_gallery', 'art_studio',
'assembly_line', 'attic', 'auditorium', 'apartment_building/outdoor',
'badlands', 'ballroom', 'bamboo_forest', 'banquet_hall', 'bar',
'baseball_field', 'basement', 'basilica', 'bayou', 'beauty_salon',
'bedroom', 'boardwalk', 'boat_deck', 'bookstore', 'botanical_garden',
'bowling_alley', 'boxing_ring', 'bridge', 'building_facade',
'bus_interior', 'butchers_shop', 'butte', 'bakery/shop', 'cafeteria',
'campsite', 'candy_store', 'canyon', 'castle', 'cemetery', 'chalet',
'classroom', 'closet', 'clothing_store', 'coast', 'cockpit', 'coffee_shop',
'conference_center', 'conference_room', 'construction_site', 'corn_field',
'corridor', 'cottage_garden', 'courthouse', 'courtyard', 'creek',
'crevasse', 'crosswalk', 'cathedral/outdoor', 'church/outdoor', 'dam',
'dining_room', 'dock', 'dorm_room', 'driveway', 'desert/sand',
'desert/vegetation', 'dinette/home', 'doorway/outdoor', 'engine_room',
'excavation', 'fairway', 'fire_escape', 'fire_station', 'food_court',
'forest_path', 'forest_road', 'formal_garden', 'fountain',
'field/cultivated', 'field/wild', 'galley', 'game_room', 'garbage_dump',
'gas_station', 'gift_shop', 'golf_course', 'harbor', 'herb_garden',
'highway', 'home_office', 'hospital', 'hospital_room', 'hot_spring',
'hotel_room', 'hotel/outdoor', 'ice_cream_parlor', 'iceberg', 'igloo',
'islet', 'ice_skating_rink/outdoor', 'inn/outdoor', 'jail_cell', 'kasbah',
'kindergarden_classroom', 'kitchen', 'kitchenette', 'laundromat',
'lighthouse', 'living_room', 'lobby', 'locker_room', 'mansion', 'marsh',
'martial_arts_gym', 'mausoleum', 'medina', 'motel', 'mountain',
'mountain_snowy', 'music_studio', 'market/outdoor', 'monastery/outdoor',
'museum/indoor', 'nursery', 'ocean', 'office', 'office_building',
'orchard', 'pagoda', 'palace', 'pantry', 'parking_lot', 'parlor',
'pasture', 'patio', 'pavilion', 'phone_booth', 'picnic_area', 'playground',
'plaza', 'pond', 'pulpit', 'racecourse', 'raft', 'railroad_track',
'rainforest', 'reception', 'residential_neighborhood', 'restaurant',
'restaurant_kitchen', 'restaurant_patio', 'rice_paddy', 'river',
'rock_arch', 'rope_bridge', 'ruin', 'runway', 'sandbar', 'schoolhouse',
'sea_cliff', 'shed', 'shoe_shop', 'shopfront', 'shower', 'ski_resort',
'ski_slope', 'sky', 'skyscraper', 'slum', 'snowfield', 'staircase',
'supermarket', 'swamp', 'stadium/baseball', 'stadium/football',
'stage/indoor', 'subway_station/platform', 'swimming_pool/outdoor',
'television_studio', 'topiary_garden', 'tower', 'train_railway',
'tree_farm', 'trench', 'temple/east_asia', 'temple/south_asia',
'track/outdoor', 'train_station/platform', 'underwater/coral_reef',
'valley', 'vegetable_garden', 'veranda', 'viaduct', 'volcano',
'waiting_room', 'water_tower', 'watering_hole', 'wheat_field', 'wind_farm',
'windmill', 'yard')

View File

@ -67,59 +67,78 @@ def get_samples(
# Pre-build file backend to prevent verbose file backend inference.
backend = backend or get_file_backend(root, enable_singleton=True)
for folder_name in sorted(list(folder_to_idx.keys())):
_dir = backend.join_path(root, folder_name)
if folder_to_idx is not None:
for folder_name in sorted(list(folder_to_idx.keys())):
_dir = backend.join_path(root, folder_name)
files = backend.list_dir_or_file(
_dir,
list_dir=False,
list_file=True,
recursive=True,
)
for file in sorted(list(files)):
if is_valid_file(file):
path = backend.join_path(folder_name, file)
item = (path, folder_to_idx[folder_name])
samples.append(item)
available_classes.add(folder_name)
empty_folders = set(folder_to_idx.keys()) - available_classes
else:
files = backend.list_dir_or_file(
_dir,
root,
list_dir=False,
list_file=True,
recursive=True,
)
for file in sorted(list(files)):
if is_valid_file(file):
path = backend.join_path(folder_name, file)
item = (path, folder_to_idx[folder_name])
samples.append(item)
available_classes.add(folder_name)
empty_folders = set(folder_to_idx.keys()) - available_classes
samples = [file for file in sorted(list(files)) if is_valid_file(file)]
empty_folders = None
return samples, empty_folders
@DATASETS.register_module()
class CustomDataset(BaseDataset):
"""Custom dataset for classification.
"""A generic dataset for multiple tasks.
The dataset supports two kinds of annotation format.
The dataset supports two kinds of style.
1. An annotation file is provided, and each line indicates a sample:
1. Use an annotation file to specify all samples, and each line indicates a
sample:
The sample files: ::
The annotation file (for ``with_label=True``, supervised tasks.): ::
folder_1/xxx.png 0
folder_1/xxy.png 1
123.png 4
nsdf3.png 3
...
The annotation file (for ``with_label=False``, unsupervised tasks.): ::
folder_1/xxx.png
folder_1/xxy.png
123.png
nsdf3.png
...
Sample files: ::
data_prefix/
folder_1
xxx.png
xxy.png
...
folder_2
123.png
nsdf3.png
...
123.png
nsdf3.png
...
The annotation file (the first column is the image path and the second
column is the index of category): ::
Please use the argument ``metainfo`` to specify extra information for
the task, like ``{'classes': ('bird', 'cat', 'deer', 'dog', 'frog')}``.
folder_1/xxx.png 0
folder_1/xxy.png 1
folder_2/123.png 5
folder_2/nsdf3.png 3
...
2. Place all samples in one folder as below:
Please specify the name of categories by the argument ``classes``
or ``metainfo``.
2. The samples are arranged in the specific way: ::
Sample files (for ``with_label=True``, supervised tasks, we use the name
of sub-folders as the categories names): ::
data_prefix/
class_x
@ -133,18 +152,33 @@ class CustomDataset(BaseDataset):
...
asd932_.png
Sample files (for ``with_label=False``, unsupervised tasks, we use all
sample files under the specified folder): ::
data_prefix/
folder_1
xxx.png
xxy.png
...
123.png
nsdf3.png
...
If the ``ann_file`` is specified, the dataset will be generated by the
first way, otherwise, try the second way.
Args:
ann_file (str): Annotation file path. Defaults to ''.
metainfo (dict, optional): Meta information for dataset, such as class
information. Defaults to None.
data_root (str): The root directory for ``data_prefix`` and
``ann_file``. Defaults to ''.
data_prefix (str | dict): Prefix for the data. Defaults to ''.
ann_file (str): Annotation file path. Defaults to ''.
with_label (bool): Whether the annotation file includes ground truth
labels, or use sub-folders to specify categories.
Defaults to True.
extensions (Sequence[str]): A sequence of allowed extensions. Defaults
to ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif').
metainfo (dict, optional): Meta information for dataset, such as class
information. Defaults to None.
lazy_init (bool): Whether to load annotation during instantiation.
In some cases, such as visualization, only the meta information of
the dataset is needed, which is not necessary to load annotation
@ -154,12 +188,13 @@ class CustomDataset(BaseDataset):
"""
def __init__(self,
ann_file: str = '',
metainfo: Optional[dict] = None,
data_root: str = '',
data_prefix: Union[str, dict] = '',
ann_file: str = '',
with_label=True,
extensions: Sequence[str] = ('.jpg', '.jpeg', '.png', '.ppm',
'.bmp', '.pgm', '.tif'),
metainfo: Optional[dict] = None,
lazy_init: bool = False,
**kwargs):
assert (ann_file or data_prefix or data_root), \
@ -167,6 +202,7 @@ class CustomDataset(BaseDataset):
'be specified.'
self.extensions = tuple(set([i.lower() for i in extensions]))
self.with_label = with_label
super().__init__(
# The base class requires string ann_file but this class doesn't
@ -184,26 +220,35 @@ class CustomDataset(BaseDataset):
def _find_samples(self):
"""find samples from ``data_prefix``."""
classes, folder_to_idx = find_folders(self.img_prefix)
samples, empty_classes = get_samples(
self.img_prefix,
folder_to_idx,
is_valid_file=self.is_valid_file,
)
if self.with_label:
classes, folder_to_idx = find_folders(self.img_prefix)
samples, empty_classes = get_samples(
self.img_prefix,
folder_to_idx,
is_valid_file=self.is_valid_file,
)
self.folder_to_idx = folder_to_idx
if self.CLASSES is not None:
assert len(self.CLASSES) == len(classes), \
f"The number of subfolders ({len(classes)}) doesn't " \
f'match the number of specified classes ' \
f'({len(self.CLASSES)}). Please check the data folder.'
else:
self._metainfo['classes'] = tuple(classes)
else:
samples, empty_classes = get_samples(
self.img_prefix,
None,
is_valid_file=self.is_valid_file,
)
if len(samples) == 0:
raise RuntimeError(
f'Found 0 files in subfolders of: {self.data_prefix}. '
f'Supported extensions are: {",".join(self.extensions)}')
if self.CLASSES is not None:
assert len(self.CLASSES) == len(classes), \
f"The number of subfolders ({len(classes)}) doesn't match " \
f'the number of specified classes ({len(self.CLASSES)}). ' \
'Please check the data folder.'
else:
self._metainfo['classes'] = tuple(classes)
if empty_classes:
logger = MMLogger.get_current_instance()
logger.warning(
@ -211,24 +256,29 @@ class CustomDataset(BaseDataset):
f'{", ".join(empty_classes)}. '
f"Supported extensions are: {', '.join(self.extensions)}")
self.folder_to_idx = folder_to_idx
return samples
def load_data_list(self):
"""Load image paths and gt_labels."""
if not self.ann_file:
samples = self._find_samples()
else:
elif self.with_label:
lines = list_from_file(self.ann_file)
samples = [x.strip().rsplit(' ', 1) for x in lines]
else:
samples = list_from_file(self.ann_file)
# Pre-build file backend to prevent verbose file backend inference.
backend = get_file_backend(self.img_prefix, enable_singleton=True)
data_list = []
for filename, gt_label in samples:
img_path = backend.join_path(self.img_prefix, filename)
info = {'img_path': img_path, 'gt_label': int(gt_label)}
for sample in samples:
if self.with_label:
filename, gt_label = sample
img_path = backend.join_path(self.img_prefix, filename)
info = {'img_path': img_path, 'gt_label': int(gt_label)}
else:
img_path = backend.join_path(self.img_prefix, sample)
info = {'img_path': img_path}
data_list.append(info)
return data_list

View File

@ -16,12 +16,12 @@ class ImageNet(CustomDataset):
found in :class:`CustomDataset`.
Args:
ann_file (str): Annotation file path. Defaults to ''.
metainfo (dict, optional): Meta information for dataset, such as class
information. Defaults to None.
data_root (str): The root directory for ``data_prefix`` and
``ann_file``. Defaults to ''.
data_prefix (str | dict): Prefix for training data. Defaults to ''.
ann_file (str): Annotation file path. Defaults to ''.
metainfo (dict, optional): Meta information for dataset, such as class
information. Defaults to None.
**kwargs: Other keyword arguments in :class:`CustomDataset` and
:class:`BaseDataset`.
""" # noqa: E501
@ -30,17 +30,17 @@ class ImageNet(CustomDataset):
METAINFO = {'classes': IMAGENET_CATEGORIES}
def __init__(self,
ann_file: str = '',
metainfo: Optional[dict] = None,
data_root: str = '',
data_prefix: Union[str, dict] = '',
ann_file: str = '',
metainfo: Optional[dict] = None,
**kwargs):
kwargs = {'extensions': self.IMG_EXTENSIONS, **kwargs}
super().__init__(
ann_file=ann_file,
metainfo=metainfo,
data_root=data_root,
data_prefix=data_prefix,
ann_file=ann_file,
metainfo=metainfo,
**kwargs)
@ -53,12 +53,12 @@ class ImageNet21k(CustomDataset):
specify it from the ``classes`` argument.
Args:
ann_file (str): Annotation file path. Defaults to ''.
metainfo (dict, optional): Meta information for dataset, such as class
information. Defaults to None.
data_root (str): The root directory for ``data_prefix`` and
``ann_file``. Defaults to ''.
data_prefix (str | dict): Prefix for training data. Defaults to ''.
ann_file (str): Annotation file path. Defaults to ''.
metainfo (dict, optional): Meta information for dataset, such as class
information. Defaults to None.
multi_label (bool): Not implement by now. Use multi label or not.
Defaults to False.
**kwargs: Other keyword arguments in :class:`CustomDataset` and
@ -68,10 +68,10 @@ class ImageNet21k(CustomDataset):
IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif')
def __init__(self,
ann_file: str = '',
metainfo: Optional[dict] = None,
data_root: str = '',
data_prefix: Union[str, dict] = '',
ann_file: str = '',
metainfo: Optional[dict] = None,
multi_label: bool = False,
**kwargs):
if multi_label:
@ -89,10 +89,10 @@ class ImageNet21k(CustomDataset):
kwargs = {'extensions': self.IMG_EXTENSIONS, **kwargs}
super().__init__(
ann_file=ann_file,
metainfo=metainfo,
data_root=data_root,
data_prefix=data_prefix,
ann_file=ann_file,
metainfo=metainfo,
**kwargs)
if self.CLASSES is None:

View File

@ -0,0 +1,40 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional, Union
from mmpretrain.registry import DATASETS
from .categories import PLACES205_CATEGORIES
from .custom import CustomDataset
@DATASETS.register_module()
class Places205(CustomDataset):
    """`Places205 <http://places.csail.mit.edu/downloadData.html>`_ Dataset.

    A thin specialisation of :class:`CustomDataset` that presets the
    ``classes`` meta info to ``PLACES205_CATEGORIES`` and, unless the caller
    passes ``extensions`` explicitly, restricts the accepted image files to
    ``IMG_EXTENSIONS``. All loading logic is inherited unchanged.

    Args:
        data_root (str): The root directory for ``data_prefix`` and
            ``ann_file``. Defaults to ''.
        data_prefix (str | dict): Prefix for training data. Defaults
            to ''.
        ann_file (str): Annotation file path. Defaults to ''.
        metainfo (dict, optional): Meta information for dataset, such as class
            information. Defaults to None.
        **kwargs: Other keyword arguments in :class:`CustomDataset` and
            :class:`BaseDataset`.
    """

    # File suffixes accepted when ``extensions`` is not given by the caller.
    IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif')
    # Default meta info: the 205 scene category names.
    METAINFO = dict(classes=PLACES205_CATEGORIES)

    def __init__(self,
                 data_root: str = '',
                 data_prefix: Union[str, dict] = '',
                 ann_file: str = '',
                 metainfo: Optional[dict] = None,
                 **kwargs):
        # A caller-supplied ``extensions`` wins; otherwise fall back to the
        # class-level default. ``kwargs`` is this call's own dict, so filling
        # in the default here does not leak to the caller.
        kwargs.setdefault('extensions', self.IMG_EXTENSIONS)

        explicit_args = dict(
            data_root=data_root,
            data_prefix=data_prefix,
            ann_file=ann_file,
            metainfo=metainfo,
        )
        super().__init__(**explicit_args, **kwargs)

View File

View File

@ -0,0 +1,3 @@
a/1.JPG
b/2.jpeg
b/subb/3.jpg

View File

@ -73,8 +73,6 @@ class TestBaseDataset(TestCase):
num_classes = len(dataset.CLASSES)
self.assertIn(f'Number of categories: \t{num_classes}',
repr(dataset))
else:
self.assertIn('The `CLASSES` meta info is not set.', repr(dataset))
self.assertIn('Haven\'t been initialized', repr(dataset))
dataset.full_init()
@ -148,6 +146,30 @@ class TestCustomDataset(TestBaseDataset):
'gt_label': 1
}.items())
# test load without ann_file and without labels
# (no specific folder structures)
cfg = {
**self.DEFAULT_ARGS,
'data_prefix': ASSETS_ROOT,
'ann_file': '',
'with_label': False,
}
dataset = dataset_class(**cfg)
self.assertEqual(len(dataset), 4)
self.assertIsNone(dataset.CLASSES, None)
self.assertGreaterEqual(
dataset.get_data_info(0).items(), {
'img_path': osp.join(ASSETS_ROOT, '3.jpeg'),
}.items())
self.assertGreaterEqual(
dataset.get_data_info(1).items(), {
'img_path': osp.join(ASSETS_ROOT, 'a', '1.JPG'),
}.items())
self.assertGreaterEqual(
dataset.get_data_info(3).items(), {
'img_path': osp.join(ASSETS_ROOT, 'b', 'subb', '3.jpg'),
}.items())
# test ann_file assertion
cfg = {
**self.DEFAULT_ARGS,
@ -201,6 +223,27 @@ class TestCustomDataset(TestBaseDataset):
'gt_label': 1
}.items())
# test load with absolute ann_file and without label
cfg = {
**self.DEFAULT_ARGS,
'data_root': '',
'data_prefix': '',
'ann_file': osp.join(ASSETS_ROOT, 'ann_without_labels.txt'),
'with_label': False,
}
dataset = dataset_class(**cfg)
self.assertEqual(len(dataset), 3)
# custom dataset won't infer CLASSES from ann_file
self.assertIsNone(dataset.CLASSES, None)
self.assertGreaterEqual(
dataset.get_data_info(0).items(), {
'img_path': 'a/1.JPG',
}.items())
self.assertGreaterEqual(
dataset.get_data_info(2).items(), {
'img_path': 'b/subb/3.jpg',
}.items())
# test extensions filter
cfg = {
**self.DEFAULT_ARGS, 'data_prefix': dict(img_path=ASSETS_ROOT),
@ -302,6 +345,36 @@ class TestImageNet21k(TestCustomDataset):
self.assertIn('specify the `classes`', log.output[0])
class TestPlaces205(TestCustomDataset):
    """Checks specific to the ``Places205`` dataset class.

    Only ``test_load_data_list`` is overridden; every other test is inherited
    from ``TestCustomDataset`` and runs against ``DATASET_TYPE``.
    """

    DATASET_TYPE = 'Places205'
    DEFAULT_ARGS = {'data_root': ASSETS_ROOT, 'ann_file': 'ann.txt'}

    def test_load_data_list(self):
        """Folder-style loading must respect the 205 predefined classes."""
        places_cls = DATASETS.get(self.DATASET_TYPE)

        # Folder-style loading: no annotation file, samples are discovered
        # from the sub-folders of ASSETS_ROOT.
        folder_cfg = dict(
            self.DEFAULT_ARGS, data_prefix=ASSETS_ROOT, ann_file='')

        # test classes number: the sub-folder count found on disk (2) does
        # not agree with the 205 built-in categories, so construction fails.
        mismatch_pattern = r"\(2\) doesn't match .* classes \(205\)"
        with self.assertRaisesRegex(AssertionError, mismatch_pattern):
            places_cls(**folder_cfg)

        # test override classes: supplying two class names makes the same
        # folder layout acceptable.
        dataset = places_cls(**folder_cfg, classes=['cat', 'dog'])
        self.assertEqual(len(dataset), 3)
        self.assertEqual(dataset.CLASSES, ('cat', 'dog'))
class TestCIFAR10(TestBaseDataset):
DATASET_TYPE = 'CIFAR10'