diff --git a/ppcls/data/dataloader/__init__.py b/ppcls/data/dataloader/__init__.py index e69de29bb..5d6fe91fc 100644 --- a/ppcls/data/dataloader/__init__.py +++ b/ppcls/data/dataloader/__init__.py @@ -0,0 +1,6 @@ +from ppcls.data.dataloader.imagenet_dataset import ImageNetDataset +from ppcls.data.dataloader.multilabel_dataset import MultiLabelDataset +from ppcls.data.dataloader.common_dataset import create_operators +from ppcls.data.dataloader.vehicle_dataset import CompCars, VeriWild +from ppcls.data.dataloader.logo_dataset import LogoDataset +from ppcls.data.dataloader.icartoon_dataset import ICartoonDataset diff --git a/ppcls/data/dataloader/mix_dataset.py b/ppcls/data/dataloader/mix_dataset.py new file mode 100644 index 000000000..c928ca26d --- /dev/null +++ b/ppcls/data/dataloader/mix_dataset.py @@ -0,0 +1,49 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import os + +from paddle.io import Dataset +from .. 
import dataloader + + +class MixDataset(Dataset): + def __init__(self, datasets_config): + super(MixDataset, self).__init__() + self.dataset_list = [] + start_idx = 0 + end_idx = 0 + for config_i in datasets_config: + dataset_name = config_i.pop('name') + dataset = getattr(dataloader, dataset_name)(**config_i) + end_idx += len(dataset) + self.dataset_list.append([end_idx, start_idx, dataset]) + start_idx = end_idx + + self.length = end_idx + + def __getitem__(self, idx): + for dataset_i in self.dataset_list: + if dataset_i[0] > idx: + dataset_i_idx = idx - dataset_i[1] + return dataset_i[2][dataset_i_idx] + + def __len__(self): + return self.length + + def get_dataset_list(self): + return self.dataset_list diff --git a/ppcls/data/dataloader/mix_sampler.py b/ppcls/data/dataloader/mix_sampler.py new file mode 100644 index 000000000..eaac66de0 --- /dev/null +++ b/ppcls/data/dataloader/mix_sampler.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 

from __future__ import absolute_import
from __future__ import division

from paddle.io import DistributedBatchSampler, Sampler

from ppcls.utils import logger
from ppcls.data.dataloader.mix_dataset import MixDataset
from ppcls.data import dataloader


class MixSampler(DistributedBatchSampler):
    """Batch sampler drawing a fixed share of each MixDataset sub-dataset.

    ``sample_configs`` holds one dict per sub-dataset with a ``name`` key
    (sampler class, ``DistributedBatchSampler`` or one exported by
    ``ppcls.data.dataloader``), a ``ratio`` key (fraction of the batch drawn
    from that sub-dataset), and any extra kwargs for the sampler.  Each
    yielded batch contains ``batch_size`` global MixDataset indices.
    """

    def __init__(self, dataset, batch_size, sample_configs, iter_per_epoch):
        super(MixSampler, self).__init__(dataset, batch_size)
        assert isinstance(dataset,
                          MixDataset), "MixSampler only support MixDataset"
        self.sampler_list = []
        self.batch_size = batch_size
        self.start_list = []
        self.length = iter_per_epoch
        dataset_list = dataset.get_dataset_list()
        batch_size_left = self.batch_size
        self.iter_list = []
        for i, config_i in enumerate(sample_configs):
            # BUGFIX: record sub-dataset i's global start offset.  __iter__
            # reads self.start_list[i]; previously this list was never
            # populated, so the first batch raised IndexError.
            self.start_list.append(dataset_list[i][1])
            sample_method = config_i.pop("name")
            ratio_i = config_i.pop("ratio")
            if i < len(sample_configs) - 1:
                # BUGFIX: ratio_i is fractional, so truncate the product to an
                # int — a float is not a valid batch size.
                batch_size_i = int(self.batch_size * ratio_i)
                batch_size_left -= batch_size_i
            else:
                # The last sub-dataset absorbs the remainder so the per-batch
                # shares always sum exactly to batch_size.
                batch_size_i = batch_size_left
            assert batch_size_i <= len(dataset_list[i][2])
            config_i["batch_size"] = batch_size_i
            if sample_method == "DistributedBatchSampler":
                sampler_i = DistributedBatchSampler(dataset_list[i][2],
                                                    **config_i)
            else:
                sampler_i = getattr(dataloader, sample_method)(
                    dataset_list[i][2], **config_i)
            self.sampler_list.append(sampler_i)
            self.iter_list.append(iter(sampler_i))
            self.length += len(dataset_list[i][2]) * ratio_i
        self.iter_counter = 0

    def __iter__(self):
        while self.iter_counter < self.length:
            batch = []
            for i, iter_i in enumerate(self.iter_list):
                batch_i = next(iter_i, None)
                if batch_i is None:
                    # Sub-sampler exhausted: restart it so short datasets keep
                    # contributing until iter_per_epoch batches are produced.
                    iter_i = iter(self.sampler_list[i])
                    self.iter_list[i] = iter_i
                    batch_i = next(iter_i, None)
                    assert batch_i is not None, "dataset {} return None".format(
                        i)
                # Shift local indices into MixDataset's global index space.
                batch += [idx + self.start_list[i] for idx in batch_i]
            if len(batch) == self.batch_size:
                self.iter_counter += 1
                yield batch
            else:
                # Incomplete batch: skip it rather than yield a short batch.
                logger.info("Some dataset reaches end")
        # Reset after the epoch so the sampler can be iterated again.
        # (Placed after the loop — inside the `else` it would never let the
        # while condition terminate.)
        self.iter_counter = 0

    def __len__(self):
        return self.length