commit a99e6eb10d140ed09633df4b0f14684329841b3d Author: chenkai Date: Thu May 21 21:21:43 2020 +0800 init commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..7ecdf4cf --- /dev/null +++ b/.gitignore @@ -0,0 +1,118 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class +**/*.pyc + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# custom +mmcls/version.py +data +.vscode +.idea +*.pkl +*.pkl.json +*.log.json +work_dirs/ + +# Pytorch +*.pth diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 00000000..84154c6c --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,34 @@ +image: registry.sensetime.com/eig-research/pytorch:1.3.1-cuda10.1-cudnn7-devel + +stages: + - linting + - test + +before_script: + - echo $PATH + - gcc --version + - python --version + - pip --version + - nvcc --version + - nvidia-smi + - python -c "import torch; print(torch.__version__)" + +linting: + stage: linting + script: + - pip install flake8 yapf isort + - flake8 . + - isort -rc --check-only --diff pytorch-template/ tools/ tests/ + - yapf -r -d pytorch-template/ tools/ tests/ configs/ + +test: + stage: test + script: + - echo "Start building..." + - pip install pillow==6.2.2 + - pip install -e . + - python -c "import mmcls; print(mmcls.__version__)" + # - echo "Start testing..." 
+ # - pip install pytest coverage + # - coverage run --source mmcls -m pytest tests/ + # - coverage report -m diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..a16b75ae --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,27 @@ +repos: + - repo: https://gitlab.com/pycqa/flake8.git + rev: 3.8.0 + hooks: + - id: flake8 + - repo: https://github.com/asottile/seed-isort-config + rev: v2.1.0 + hooks: + - id: seed-isort-config + - repo: https://github.com/timothycrosley/isort + rev: 4.3.21 + hooks: + - id: isort + - repo: https://github.com/pre-commit/mirrors-yapf + rev: v0.29.0 + hooks: + - id: yapf + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.5.0 + hooks: + - id: trailing-whitespace + - id: check-yaml + - id: end-of-file-fixer + - id: requirements-txt-fixer + - id: double-quote-string-fixer + - id: fix-encoding-pragma + args: ["--remove"] diff --git a/README.md b/README.md new file mode 100644 index 00000000..3d3c18d6 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +# MMClassification diff --git a/configs/example.py b/configs/example.py new file mode 100644 index 00000000..912d495a --- /dev/null +++ b/configs/example.py @@ -0,0 +1,59 @@ +# model settings +model = dict( + type='xxx', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch')) +# dataset settings +dataset_type = 'XXXDataset' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [] +test_pipeline = [] +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file='', + data_prefix='', + pipeline=train_pipeline), + val=dict( + type=dataset_type, ann_file='', data_prefix='', + pipeline=test_pipeline), + test=dict( + type=dataset_type, ann_file='', data_prefix='', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +# checkpoint saving +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/xxx' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/mmcls/VERSION b/mmcls/VERSION new file mode 100644 index 00000000..6e8bf73a --- /dev/null +++ b/mmcls/VERSION @@ -0,0 +1 @@ +0.1.0 diff --git a/mmcls/__init__.py b/mmcls/__init__.py new file mode 100644 index 00000000..1c4f7e8f --- /dev/null +++ b/mmcls/__init__.py @@ -0,0 +1,3 @@ +from .version import __version__, short_version + +__all__ = ['__version__', 'short_version'] diff --git a/mmcls/apis/__init__.py b/mmcls/apis/__init__.py new file mode 100644 index 00000000..a7ba81c5 --- /dev/null +++ b/mmcls/apis/__init__.py @@ -0,0 +1,8 @@ +from .inference import inference_model, init_model +from .test import multi_gpu_test, single_gpu_test +from .train import set_random_seed, train_model + +__all__ = [ + 'set_random_seed', 'train_model', 'init_model', 'inference_model', + 'multi_gpu_test', 'single_gpu_test' +] diff --git a/mmcls/apis/inference.py b/mmcls/apis/inference.py new file mode 100644 index 00000000..dfc64e8d 
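For orientation, a minimal sketch (not part of the commit) of how a config in the configs/example.py format above is typically consumed; it assumes only that mmcv is installed, and the field names mirror the example config.

from mmcv import Config

# Attribute access mirrors the nested dict layout of configs/example.py.
cfg = Config.fromfile('configs/example.py')
print(cfg.model.backbone.type)   # 'ResNet'
print(cfg.optimizer.lr)          # 0.02
print(cfg.data.imgs_per_gpu)     # 2

# Values can be overridden in code before training, e.g. for a short debug run:
cfg.total_epochs = 1
cfg.work_dir = './work_dirs/debug'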
--- /dev/null +++ b/mmcls/apis/inference.py @@ -0,0 +1,6 @@ +def init_model(): + pass + + +def inference_model(): + pass diff --git a/mmcls/apis/test.py b/mmcls/apis/test.py new file mode 100644 index 00000000..b71e1b2b --- /dev/null +++ b/mmcls/apis/test.py @@ -0,0 +1,152 @@ +import os.path as osp +import pickle +import shutil +import tempfile + +import mmcv +import torch +import torch.distributed as dist +from mmcv.runner import get_dist_info + + +def single_gpu_test(model, + data_loader, + show=False, + out_dir=None, + show_score_thr=0.3): + model.eval() + results = [] + dataset = data_loader.dataset + prog_bar = mmcv.ProgressBar(len(dataset)) + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + results.append(result) + + if show or out_dir: + pass # TODO + + batch_size = data['img'][0].size(0) + for _ in range(batch_size): + prog_bar.update() + return results + + +def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): + """Test model with multiple gpus. + + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' + it encodes results to gpu tensors and use gpu communication for results + collection. On cpu mode it saves the results on different gpus to 'tmpdir' + and collects them by the rank 0 worker. + + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + + Returns: + list: The prediction results. + """ + model.eval() + results = [] + dataset = data_loader.dataset + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = mmcv.ProgressBar(len(dataset)) + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + results.append(result) + + if rank == 0: + batch_size = ( + len(data['img_meta']._data) + if 'img_meta' in data else data['img'][0].size(0)) + for _ in range(batch_size * world_size): + prog_bar.update() + + # collect results from all ranks + if gpu_collect: + results = collect_results_gpu(results, len(dataset)) + else: + results = collect_results_cpu(results, len(dataset), tmpdir) + return results + + +def collect_results_cpu(result_part, size, tmpdir=None): + rank, world_size = get_dist_info() + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), + 32, + dtype=torch.uint8, + device='cuda') + if rank == 0: + tmpdir = tempfile.mkdtemp() + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') + dir_tensor[:len(tmpdir)] = tmpdir + dist.broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + else: + mmcv.mkdir_or_exist(tmpdir) + # dump the part result to the dir + mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + dist.barrier() + # collect all parts + if rank != 0: + return None + else: + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_list.append(mmcv.load(part_file)) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = 
ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def collect_results_gpu(result_part, size): + rank, world_size = get_dist_info() + # dump result part to tensor with pickle + part_tensor = torch.tensor( + bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') + # gather all result part tensor shape + shape_tensor = torch.tensor(part_tensor.shape, device='cuda') + shape_list = [shape_tensor.clone() for _ in range(world_size)] + dist.all_gather(shape_list, shape_tensor) + # padding result part tensor to max length + shape_max = torch.tensor(shape_list).max() + part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') + part_send[:shape_tensor[0]] = part_tensor + part_recv_list = [ + part_tensor.new_zeros(shape_max) for _ in range(world_size) + ] + # gather all result part + dist.all_gather(part_recv_list, part_send) + + if rank == 0: + part_list = [] + for recv, shape in zip(part_recv_list, shape_list): + part_list.append( + pickle.loads(recv[:shape[0]].cpu().numpy().tobytes())) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + return ordered_results diff --git a/mmcls/apis/train.py b/mmcls/apis/train.py new file mode 100644 index 00000000..8e73a053 --- /dev/null +++ b/mmcls/apis/train.py @@ -0,0 +1,162 @@ +import random +from collections import OrderedDict + +import numpy as np +import torch +import torch.distributed as dist +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import DistSamplerSeedHook, Runner + +from mmcls.core import (DistEvalHook, DistOptimizerHook, EvalHook, + Fp16OptimizerHook, build_optimizer) +from mmcls.datasets import build_dataloader, build_dataset +from mmcls.utils import get_root_logger + + +def set_random_seed(seed, deterministic=False): + """Set random seed. + Args: + seed (int): Seed to be used. + deterministic (bool): Whether to set the deterministic option for + CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` + to True and `torch.backends.cudnn.benchmark` to False. + Default: False. + """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + if deterministic: + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + +def parse_losses(losses): + log_vars = OrderedDict() + for loss_name, loss_value in losses.items(): + if isinstance(loss_value, torch.Tensor): + log_vars[loss_name] = loss_value.mean() + elif isinstance(loss_value, list): + log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) + else: + raise TypeError(f'{loss_name} is not a tensor or list of tensors') + + loss = sum(_value for _key, _value in log_vars.items() if 'loss' in _key) + + log_vars['loss'] = loss + for loss_name, loss_value in log_vars.items(): + # reduce loss when distributed training + if dist.is_available() and dist.is_initialized(): + loss_value = loss_value.data.clone() + dist.all_reduce(loss_value.div_(dist.get_world_size())) + log_vars[loss_name] = loss_value.item() + + return loss, log_vars + + +def batch_processor(model, data, train_mode): + """Process a data batch. + This method is required as an argument of Runner, which defines how to + process a data batch and obtain proper outputs. The first 3 arguments of + batch_processor are fixed. + Args: + model (nn.Module): A PyTorch model. 
+ data (dict): The data batch in a dict. + train_mode (bool): Training mode or not. It may be useless for some + models. + Returns: + dict: A dict containing losses and log vars. + """ + losses = model(**data) + loss, log_vars = parse_losses(losses) + + outputs = dict( + loss=loss, log_vars=log_vars, num_samples=len(data['img'].data)) + + return outputs + + +def train_model(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + meta=None): + logger = get_root_logger(cfg.log_level) + + # prepare data loaders + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + + data_loaders = [ + build_dataloader( + ds, + cfg.data.samples_per_gpu, + cfg.data.workers_per_gpu, + # cfg.gpus will be ignored if distributed + len(cfg.gpu_ids), + dist=distributed, + seed=cfg.seed) for ds in dataset + ] + + # put model on gpus + if distributed: + find_unused_parameters = cfg.get('find_unused_parameters', False) + # Sets the `find_unused_parameters` parameter in + # torch.nn.parallel.DistributedDataParallel + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + else: + model = MMDataParallel( + model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + + # build runner + optimizer = build_optimizer(model, cfg.optimizer) + runner = Runner( + model, + batch_processor, + optimizer, + cfg.work_dir, + logger=logger, + meta=meta) + # an ugly walkaround to make the .log and .log.json filenames the same + runner.timestamp = timestamp + + # fp16 setting + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + optimizer_config = Fp16OptimizerHook( + **cfg.optimizer_config, **fp16_cfg, distributed=distributed) + elif distributed and 'type' not in cfg.optimizer_config: + optimizer_config = DistOptimizerHook(**cfg.optimizer_config) + else: + optimizer_config = cfg.optimizer_config + + # register hooks + runner.register_training_hooks(cfg.lr_config, optimizer_config, + cfg.checkpoint_config, cfg.log_config, + cfg.get('momentum_config', None)) + if distributed: + runner.register_hook(DistSamplerSeedHook()) + + # register eval hooks + if validate: + val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) + val_dataloader = build_dataloader( + val_dataset, + samples_per_gpu=1, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False) + eval_cfg = cfg.get('evaluation', {}) + eval_hook = DistEvalHook if distributed else EvalHook + runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow, cfg.total_epochs) diff --git a/mmcls/core/__init__.py b/mmcls/core/__init__.py new file mode 100644 index 00000000..ee0dac43 --- /dev/null +++ b/mmcls/core/__init__.py @@ -0,0 +1,3 @@ +from .evaluation import * # noqa: F401, F403 +from .fp16 import * # noqa: F401, F403 +from .utils import * # noqa: F401, F403 diff --git a/mmcls/core/evaluation/__init__.py b/mmcls/core/evaluation/__init__.py new file mode 100644 index 00000000..3dca68df --- /dev/null +++ b/mmcls/core/evaluation/__init__.py @@ -0,0 +1,3 @@ +from .eval_hooks import EvalHook + +__all__ = ['EvalHook'] diff --git a/mmcls/core/evaluation/eval_hooks.py b/mmcls/core/evaluation/eval_hooks.py new file mode 100644 index 00000000..c2e7aecd --- /dev/null +++ b/mmcls/core/evaluation/eval_hooks.py @@ -0,0 +1,74 @@ +import os.path as osp + 
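As a worked illustration of parse_losses above (loss names and values are invented), each entry of the model's raw loss dict is averaged and only the keys containing 'loss' are summed into the scalar used for backpropagation; the distributed all-reduce step is omitted here for the single-process case.

from collections import OrderedDict

import torch

losses = OrderedDict(
    loss_cls=torch.tensor([0.5, 1.5]),   # per-sample losses, mean 1.0
    accuracy=torch.tensor(0.75))         # logged only; key contains no 'loss'

log_vars = OrderedDict((name, value.mean()) for name, value in losses.items())
loss = sum(value for key, value in log_vars.items() if 'loss' in key)
log_vars['loss'] = loss
print({key: value.item() for key, value in log_vars.items()})
# {'loss_cls': 1.0, 'accuracy': 0.75, 'loss': 1.0}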
+from mmcv.runner import Hook +from torch.utils.data import DataLoader + + +class EvalHook(Hook): + """Evaluation hook. + + Args: + dataloader (DataLoader): A PyTorch dataloader. + interval (int): Evaluation interval (by epochs). Default: 1. + """ + + def __init__(self, dataloader, interval=1, **eval_kwargs): + if not isinstance(dataloader, DataLoader): + raise TypeError('dataloader must be a pytorch DataLoader, but got' + f' {type(dataloader)}') + self.dataloader = dataloader + self.interval = interval + self.eval_kwargs = eval_kwargs + + def after_train_epoch(self, runner): + if not self.every_n_epochs(runner, self.interval): + return + from mmcls.apis import single_gpu_test + results = single_gpu_test(runner.model, self.dataloader, show=False) + self.evaluate(runner, results) + + def evaluate(self, runner, results): + eval_res = self.dataloader.dataset.evaluate( + results, logger=runner.logger, **self.eval_kwargs) + for name, val in eval_res.items(): + runner.log_buffer.output[name] = val + runner.log_buffer.ready = True + + +class DistEvalHook(EvalHook): + """Distributed evaluation hook. + + Args: + dataloader (DataLoader): A PyTorch dataloader. + interval (int): Evaluation interval (by epochs). Default: 1. + tmpdir (str | None): Temporary directory to save the results of all + processes. Default: None. + gpu_collect (bool): Whether to use gpu or cpu to collect results. + Default: False. + """ + + def __init__(self, + dataloader, + interval=1, + gpu_collect=False, + **eval_kwargs): + if not isinstance(dataloader, DataLoader): + raise TypeError('dataloader must be a pytorch DataLoader, but got ' + f'{type(dataloader)}') + self.dataloader = dataloader + self.interval = interval + self.gpu_collect = gpu_collect + self.eval_kwargs = eval_kwargs + + def after_train_epoch(self, runner): + if not self.every_n_epochs(runner, self.interval): + return + from mmcls.apis import multi_gpu_test + results = multi_gpu_test( + runner.model, + self.dataloader, + tmpdir=osp.join(runner.work_dir, '.eval_hook'), + gpu_collect=self.gpu_collect) + if runner.rank == 0: + print('\n') + self.evaluate(runner, results) diff --git a/mmcls/core/fp16/__init__.py b/mmcls/core/fp16/__init__.py new file mode 100644 index 00000000..cc655b7c --- /dev/null +++ b/mmcls/core/fp16/__init__.py @@ -0,0 +1,4 @@ +from .decorators import auto_fp16, force_fp32 +from .hooks import Fp16OptimizerHook, wrap_fp16_model + +__all__ = ['auto_fp16', 'force_fp32', 'Fp16OptimizerHook', 'wrap_fp16_model'] diff --git a/mmcls/core/fp16/decorators.py b/mmcls/core/fp16/decorators.py new file mode 100644 index 00000000..10ffbf89 --- /dev/null +++ b/mmcls/core/fp16/decorators.py @@ -0,0 +1,160 @@ +import functools +from inspect import getfullargspec + +import torch + +from .utils import cast_tensor_type + + +def auto_fp16(apply_to=None, out_fp32=False): + """Decorator to enable fp16 training automatically. + + This decorator is useful when you write custom modules and want to support + mixed precision training. If inputs arguments are fp32 tensors, they will + be converted to fp16 automatically. Arguments other than fp32 tensors are + ignored. + + Args: + apply_to (Iterable, optional): The argument names to be converted. + `None` indicates all arguments. + out_fp32 (bool): Whether to convert the output back to fp32. 
+ + :Example: + + class MyModule1(nn.Module) + + # Convert x and y to fp16 + @auto_fp16() + def forward(self, x, y): + pass + + class MyModule2(nn.Module): + + # convert pred to fp16 + @auto_fp16(apply_to=('pred', )) + def do_something(self, pred, others): + pass + """ + + def auto_fp16_wrapper(old_func): + + @functools.wraps(old_func) + def new_func(*args, **kwargs): + # check if the module has set the attribute `fp16_enabled`, if not, + # just fallback to the original method. + if not isinstance(args[0], torch.nn.Module): + raise TypeError('@auto_fp16 can only be used to decorate the ' + 'method of nn.Module') + if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled): + return old_func(*args, **kwargs) + # get the arg spec of the decorated method + args_info = getfullargspec(old_func) + # get the argument names to be casted + args_to_cast = args_info.args if apply_to is None else apply_to + # convert the args that need to be processed + new_args = [] + # NOTE: default args are not taken into consideration + if args: + arg_names = args_info.args[:len(args)] + for i, arg_name in enumerate(arg_names): + if arg_name in args_to_cast: + new_args.append( + cast_tensor_type(args[i], torch.float, torch.half)) + else: + new_args.append(args[i]) + # convert the kwargs that need to be processed + new_kwargs = {} + if kwargs: + for arg_name, arg_value in kwargs.items(): + if arg_name in args_to_cast: + new_kwargs[arg_name] = cast_tensor_type( + arg_value, torch.float, torch.half) + else: + new_kwargs[arg_name] = arg_value + # apply converted arguments to the decorated method + output = old_func(*new_args, **new_kwargs) + # cast the results back to fp32 if necessary + if out_fp32: + output = cast_tensor_type(output, torch.half, torch.float) + return output + + return new_func + + return auto_fp16_wrapper + + +def force_fp32(apply_to=None, out_fp16=False): + """Decorator to convert input arguments to fp32 in force. + + This decorator is useful when you write custom modules and want to support + mixed precision training. If there are some inputs that must be processed + in fp32 mode, then this decorator can handle it. If inputs arguments are + fp16 tensors, they will be converted to fp32 automatically. Arguments other + than fp16 tensors are ignored. + + Args: + apply_to (Iterable, optional): The argument names to be converted. + `None` indicates all arguments. + out_fp16 (bool): Whether to convert the output back to fp16. + + :Example: + + class MyModule1(nn.Module) + + # Convert x and y to fp32 + @force_fp32() + def loss(self, x, y): + pass + + class MyModule2(nn.Module): + + # convert pred to fp32 + @force_fp32(apply_to=('pred', )) + def post_process(self, pred, others): + pass + """ + + def force_fp32_wrapper(old_func): + + @functools.wraps(old_func) + def new_func(*args, **kwargs): + # check if the module has set the attribute `fp16_enabled`, if not, + # just fallback to the original method. 
+ if not isinstance(args[0], torch.nn.Module): + raise TypeError('@force_fp32 can only be used to decorate the ' + 'method of nn.Module') + if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled): + return old_func(*args, **kwargs) + # get the arg spec of the decorated method + args_info = getfullargspec(old_func) + # get the argument names to be casted + args_to_cast = args_info.args if apply_to is None else apply_to + # convert the args that need to be processed + new_args = [] + if args: + arg_names = args_info.args[:len(args)] + for i, arg_name in enumerate(arg_names): + if arg_name in args_to_cast: + new_args.append( + cast_tensor_type(args[i], torch.half, torch.float)) + else: + new_args.append(args[i]) + # convert the kwargs that need to be processed + new_kwargs = dict() + if kwargs: + for arg_name, arg_value in kwargs.items(): + if arg_name in args_to_cast: + new_kwargs[arg_name] = cast_tensor_type( + arg_value, torch.half, torch.float) + else: + new_kwargs[arg_name] = arg_value + # apply converted arguments to the decorated method + output = old_func(*new_args, **new_kwargs) + # cast the results back to fp32 if necessary + if out_fp16: + output = cast_tensor_type(output, torch.float, torch.half) + return output + + return new_func + + return force_fp32_wrapper diff --git a/mmcls/core/fp16/hooks.py b/mmcls/core/fp16/hooks.py new file mode 100644 index 00000000..be2a921c --- /dev/null +++ b/mmcls/core/fp16/hooks.py @@ -0,0 +1,127 @@ +import copy + +import torch +import torch.nn as nn +from mmcv.runner import OptimizerHook + +from ..dist_utils import allreduce_grads +from .utils import cast_tensor_type + + +class Fp16OptimizerHook(OptimizerHook): + """FP16 optimizer hook. + + The steps of fp16 optimizer is as follows. + 1. Scale the loss value. + 2. BP in the fp16 model. + 2. Copy gradients from fp16 model to fp32 weights. + 3. Update fp32 weights. + 4. Copy updated parameters from fp32 weights to fp16 model. + + Refer to https://arxiv.org/abs/1710.03740 for more details. + + Args: + loss_scale (float): Scale factor multiplied with loss. 
+ """ + + def __init__(self, + grad_clip=None, + coalesce=True, + bucket_size_mb=-1, + loss_scale=512., + distributed=True): + self.grad_clip = grad_clip + self.coalesce = coalesce + self.bucket_size_mb = bucket_size_mb + self.loss_scale = loss_scale + self.distributed = distributed + + def before_run(self, runner): + # keep a copy of fp32 weights + runner.optimizer.param_groups = copy.deepcopy( + runner.optimizer.param_groups) + # convert model to fp16 + wrap_fp16_model(runner.model) + + def copy_grads_to_fp32(self, fp16_net, fp32_weights): + """Copy gradients from fp16 model to fp32 weight copy.""" + for fp32_param, fp16_param in zip(fp32_weights, fp16_net.parameters()): + if fp16_param.grad is not None: + if fp32_param.grad is None: + fp32_param.grad = fp32_param.data.new(fp32_param.size()) + fp32_param.grad.copy_(fp16_param.grad) + + def copy_params_to_fp16(self, fp16_net, fp32_weights): + """Copy updated params from fp32 weight copy to fp16 model.""" + for fp16_param, fp32_param in zip(fp16_net.parameters(), fp32_weights): + fp16_param.data.copy_(fp32_param.data) + + def after_train_iter(self, runner): + # clear grads of last iteration + runner.model.zero_grad() + runner.optimizer.zero_grad() + # scale the loss value + scaled_loss = runner.outputs['loss'] * self.loss_scale + scaled_loss.backward() + # copy fp16 grads in the model to fp32 params in the optimizer + fp32_weights = [] + for param_group in runner.optimizer.param_groups: + fp32_weights += param_group['params'] + self.copy_grads_to_fp32(runner.model, fp32_weights) + # allreduce grads + if self.distributed: + allreduce_grads(fp32_weights, self.coalesce, self.bucket_size_mb) + # scale the gradients back + for param in fp32_weights: + if param.grad is not None: + param.grad.div_(self.loss_scale) + if self.grad_clip is not None: + self.clip_grads(fp32_weights) + # update fp32 params + runner.optimizer.step() + # copy fp32 params to the fp16 model + self.copy_params_to_fp16(runner.model, fp32_weights) + + +def wrap_fp16_model(model): + # convert model to fp16 + model.half() + # patch the normalization layers to make it work in fp32 mode + patch_norm_fp32(model) + # set `fp16_enabled` flag + for m in model.modules(): + if hasattr(m, 'fp16_enabled'): + m.fp16_enabled = True + + +def patch_norm_fp32(module): + if isinstance(module, (nn.modules.batchnorm._BatchNorm, nn.GroupNorm)): + module.float() + module.forward = patch_forward_method(module.forward, torch.half, + torch.float) + for child in module.children(): + patch_norm_fp32(child) + return module + + +def patch_forward_method(func, src_type, dst_type, convert_output=True): + """Patch the forward method of a module. + + Args: + func (callable): The original forward method. + src_type (torch.dtype): Type of input arguments to be converted from. + dst_type (torch.dtype): Type of input arguments to be converted to. + convert_output (bool): Whether to convert the output back to src_type. + + Returns: + callable: The patched forward method. 
+ """ + + def new_forward(*args, **kwargs): + output = func(*cast_tensor_type(args, src_type, dst_type), + **cast_tensor_type(kwargs, src_type, dst_type)) + if convert_output: + output = cast_tensor_type(output, dst_type, src_type) + return output + + return new_forward diff --git a/mmcls/core/fp16/utils.py b/mmcls/core/fp16/utils.py new file mode 100644 index 00000000..ce691c79 --- /dev/null +++ b/mmcls/core/fp16/utils.py @@ -0,0 +1,23 @@ +from collections import abc + +import numpy as np +import torch + + +def cast_tensor_type(inputs, src_type, dst_type): + if isinstance(inputs, torch.Tensor): + return inputs.to(dst_type) + elif isinstance(inputs, str): + return inputs + elif isinstance(inputs, np.ndarray): + return inputs + elif isinstance(inputs, abc.Mapping): + return type(inputs)({ + k: cast_tensor_type(v, src_type, dst_type) + for k, v in inputs.items() + }) + elif isinstance(inputs, abc.Iterable): + return type(inputs)( + cast_tensor_type(item, src_type, dst_type) for item in inputs) + else: + return inputs diff --git a/mmcls/core/utils/__init__.py b/mmcls/core/utils/__init__.py new file mode 100644 index 00000000..537c268b --- /dev/null +++ b/mmcls/core/utils/__init__.py @@ -0,0 +1,3 @@ +from .dist_utils import DistOptimizerHook + +__all__ = ['DistOptimizerHook'] diff --git a/mmcls/core/utils/dist_utils.py b/mmcls/core/utils/dist_utils.py new file mode 100644 index 00000000..e8c1b7c9 --- /dev/null +++ b/mmcls/core/utils/dist_utils.py @@ -0,0 +1,16 @@ +from mmcv.runner import OptimizerHook + + +class DistOptimizerHook(OptimizerHook): + + def __init__(self, grad_clip=None, coalesce=True, bucket_size_mb=-1): + self.grad_clip = grad_clip + self.coalesce = coalesce + self.bucket_size_mb = bucket_size_mb + + def after_train_iter(self, runner): + runner.optimizer.zero_grad() + runner.outputs['loss'].backward() + if self.grad_clip is not None: + self.clip_grads(runner.model.parameters()) + runner.optimizer.step() diff --git a/mmcls/datasets/__init__.py b/mmcls/datasets/__init__.py new file mode 100644 index 00000000..fc29cc91 --- /dev/null +++ b/mmcls/datasets/__init__.py @@ -0,0 +1,9 @@ +from .base_dataset import BaseDataset +from .builder import build_dataloader, build_dataset +from .pipelines import Compose +from .samplers import DistributedSampler + +__all__ = [ + 'BaseDataset', 'build_dataloader', 'build_dataset', 'Compose', + 'DistributedSampler' +] diff --git a/mmcls/datasets/base_dataset.py b/mmcls/datasets/base_dataset.py new file mode 100644 index 00000000..826f32a7 --- /dev/null +++ b/mmcls/datasets/base_dataset.py @@ -0,0 +1,39 @@ +import copy +from abc import ABCMeta, abstractmethod + +from torch.utils.data import Dataset + +from .pipelines import Compose + + +class BaseDataset(Dataset, metaclass=ABCMeta): + + def __init__(self, ann_file, pipeline, data_prefix, test_mode): + super(BaseDataset, self).__init__() + + self.ann_file = ann_file + self.data_prefix = data_prefix + self.test_mode = test_mode + self.pipeline = Compose(pipeline) + self.data_infos = self.load_annotations() + + @abstractmethod + def load_annotations(self): + pass + + def prepare_train_data(self, idx): + results = copy.deepcopy(self.data_infos[idx]) + return self.pipeline(results) + + def prepare_test_data(self, idx): + results = copy.deepcopy(self.data_infos[idx]) + return self.pipeline(results) + + def __len__(self): + return len(self.data_infos) + + def __getitem__(self, idx): + if self.test_mode: + return self.prepare_train_data(idx) + else: + return self.prepare_test_data(idx) diff --git 
a/mmcls/datasets/builder.py b/mmcls/datasets/builder.py new file mode 100644 index 00000000..d0c7a76e --- /dev/null +++ b/mmcls/datasets/builder.py @@ -0,0 +1,100 @@ +import platform +import random +from functools import partial + +import numpy as np +from mmcv.parallel import collate +from mmcv.runner import get_dist_info +from mmcv.utils import Registry, build_from_cfg +from torch.utils.data import DataLoader + +from .samplers import DistributedSampler + +if platform.system() != 'Windows': + # https://github.com/pytorch/pytorch/issues/973 + import resource + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + hard_limit = rlimit[1] + soft_limit = min(4096, hard_limit) + resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit)) + +DATASETS = Registry('dataset') +PIPELINES = Registry('pipeline') + + +def build_dataset(cfg, default_args=None): + from .dataset_wrappers import ConcatDataset, RepeatDataset + if isinstance(cfg, (list, tuple)): + dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg]) + elif cfg['type'] == 'RepeatDataset': + dataset = RepeatDataset( + build_dataset(cfg['dataset'], default_args), cfg['times']) + else: + dataset = build_from_cfg(cfg, DATASETS, default_args) + + return dataset + + +def build_dataloader(dataset, + samples_per_gpu, + workers_per_gpu, + num_gpus=1, + dist=True, + shuffle=True, + seed=None, + **kwargs): + """Build PyTorch DataLoader. + + In distributed training, each GPU/process has a dataloader. + In non-distributed training, there is only one dataloader for all GPUs. + + Args: + dataset (Dataset): A PyTorch dataset. + samples_per_gpu (int): Number of training samples on each GPU, i.e., + batch size of each GPU. + workers_per_gpu (int): How many subprocesses to use for data loading + for each GPU. + num_gpus (int): Number of GPUs. Only used in non-distributed training. + dist (bool): Distributed training/test or not. Default: True. + shuffle (bool): Whether to shuffle the data at every epoch. + Default: True. + kwargs: any keyword argument to be used to initialize DataLoader + + Returns: + DataLoader: A PyTorch dataloader. 
+ """ + rank, world_size = get_dist_info() + if dist: + sampler = DistributedSampler( + dataset, world_size, rank, shuffle=shuffle) + shuffle = False + batch_size = samples_per_gpu + num_workers = workers_per_gpu + else: + sampler = None + batch_size = num_gpus * samples_per_gpu + num_workers = num_gpus * workers_per_gpu + + init_fn = partial( + worker_init_fn, num_workers=num_workers, rank=rank, + seed=seed) if seed is not None else None + + data_loader = DataLoader( + dataset, + batch_size=batch_size, + sampler=sampler, + num_workers=num_workers, + collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), + shuffle=shuffle, + worker_init_fn=init_fn, + **kwargs) + + return data_loader + + +def worker_init_fn(worker_id, num_workers, rank, seed): + # The seed of each worker equals to + # num_worker * rank + worker_id + user_seed + worker_seed = num_workers * rank + worker_id + seed + np.random.seed(worker_seed) + random.seed(worker_seed) diff --git a/mmcls/datasets/pipelines/__init__.py b/mmcls/datasets/pipelines/__init__.py new file mode 100644 index 00000000..b9b47ae9 --- /dev/null +++ b/mmcls/datasets/pipelines/__init__.py @@ -0,0 +1,3 @@ +from .compose import Compose + +__all__ = ['Compose'] diff --git a/mmcls/datasets/pipelines/compose.py b/mmcls/datasets/pipelines/compose.py new file mode 100644 index 00000000..ef35ce2a --- /dev/null +++ b/mmcls/datasets/pipelines/compose.py @@ -0,0 +1,42 @@ +from collections.abc import Sequence + +from mmcv.utils import build_from_cfg + +from ..registry import PIPELINES + + +@PIPELINES.register_module +class Compose(object): + """Compose a data pipeline with a sequence of transforms. + + Args: + transforms (list[dict | callable]): + Either config dicts of transforms or transform objects. + """ + + def __init__(self, transforms): + assert isinstance(transforms, Sequence) + self.transforms = [] + for transform in transforms: + if isinstance(transform, dict): + transform = build_from_cfg(transform, PIPELINES) + self.transforms.append(transform) + elif callable(transform): + self.transforms.append(transform) + else: + raise TypeError('transform must be callable or a dict, but got' + f' {type(transform)}') + + def __call__(self, data): + for t in self.transforms: + data = t(data) + if data is None: + return None + return data + + def __repr__(self): + format_string = self.__class__.__name__ + '(' + for t in self.transforms: + format_string += f'\n {t}' + format_string += '\n)' + return format_string diff --git a/mmcls/datasets/samplers/__init__.py b/mmcls/datasets/samplers/__init__.py new file mode 100644 index 00000000..cffe4dcb --- /dev/null +++ b/mmcls/datasets/samplers/__init__.py @@ -0,0 +1,3 @@ +from .distributed_sampler import DistributedSampler + +__all__ = ['DistributedSampler'] diff --git a/mmcls/datasets/samplers/distributed_sampler.py b/mmcls/datasets/samplers/distributed_sampler.py new file mode 100644 index 00000000..2a85619c --- /dev/null +++ b/mmcls/datasets/samplers/distributed_sampler.py @@ -0,0 +1,28 @@ +import torch +from torch.utils.data import DistributedSampler as _DistributedSampler + + +class DistributedSampler(_DistributedSampler): + + def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): + super().__init__(dataset, num_replicas=num_replicas, rank=rank) + self.shuffle = shuffle + + def __iter__(self): + # deterministically shuffle based on epoch + if self.shuffle: + g = torch.Generator() + g.manual_seed(self.epoch) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = 
torch.arange(len(self.dataset)).tolist() + + # add extra samples to make it evenly divisible + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank:self.total_size:self.num_replicas] + assert len(indices) == self.num_samples + + return iter(indices) diff --git a/mmcls/models/__init__.py b/mmcls/models/__init__.py new file mode 100644 index 00000000..eab103a0 --- /dev/null +++ b/mmcls/models/__init__.py @@ -0,0 +1,4 @@ +from .builder import build_model +from .registry import MODELS + +__all__ = ['build_model', 'MODELS'] diff --git a/mmcls/models/backbones/__init__.py b/mmcls/models/backbones/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mmcls/models/builder.py b/mmcls/models/builder.py new file mode 100644 index 00000000..34955208 --- /dev/null +++ b/mmcls/models/builder.py @@ -0,0 +1,18 @@ +import torch.nn as nn +from mmcv.utils import Registry, build_from_cfg + +MODELS = Registry('model') + + +def build(cfg, registry, default_args=None): + if isinstance(cfg, list): + modules = [ + build_from_cfg(cfg_, registry, default_args) for cfg_ in cfg + ] + return nn.Sequential(*modules) + else: + return build_from_cfg(cfg, registry, default_args) + + +def build_model(cfg, train_cfg=None, test_cfg=None): + return build(cfg, MODELS, dict(train_cfg=train_cfg, test_cfg=test_cfg)) diff --git a/mmcls/models/registry.py b/mmcls/models/registry.py new file mode 100644 index 00000000..fe015f6f --- /dev/null +++ b/mmcls/models/registry.py @@ -0,0 +1,3 @@ +from mmcv.utils import Registry + +MODELS = Registry('model') diff --git a/mmcls/utils/__init__.py b/mmcls/utils/__init__.py new file mode 100644 index 00000000..bda2ff83 --- /dev/null +++ b/mmcls/utils/__init__.py @@ -0,0 +1,4 @@ +from .collect_env import collect_env +from .logger import get_root_logger + +__all__ = ['collect_env', 'get_root_logger'] diff --git a/mmcls/utils/collect_env.py b/mmcls/utils/collect_env.py new file mode 100644 index 00000000..ca71a2e4 --- /dev/null +++ b/mmcls/utils/collect_env.py @@ -0,0 +1,61 @@ +import os.path as osp +import subprocess +import sys +from collections import defaultdict + +import cv2 +import mmcv +import torch +import torchvision + +import mmcls + + +def collect_env(): + env_info = {} + env_info['sys.platform'] = sys.platform + env_info['Python'] = sys.version.replace('\n', '') + + cuda_available = torch.cuda.is_available() + env_info['CUDA available'] = cuda_available + + if cuda_available: + from torch.utils.cpp_extension import CUDA_HOME + env_info['CUDA_HOME'] = CUDA_HOME + + if CUDA_HOME is not None and osp.isdir(CUDA_HOME): + try: + nvcc = osp.join(CUDA_HOME, 'bin/nvcc') + nvcc = subprocess.check_output( + f'"{nvcc}" -V | tail -n1', shell=True) + nvcc = nvcc.decode('utf-8').strip() + except subprocess.SubprocessError: + nvcc = 'Not Available' + env_info['NVCC'] = nvcc + + devices = defaultdict(list) + for k in range(torch.cuda.device_count()): + devices[torch.cuda.get_device_name(k)].append(str(k)) + for name, devids in devices.items(): + env_info['GPU ' + ','.join(devids)] = name + + gcc = subprocess.check_output('gcc --version | head -n1', shell=True) + gcc = gcc.decode('utf-8').strip() + env_info['GCC'] = gcc + + env_info['PyTorch'] = torch.__version__ + env_info['PyTorch compiling details'] = torch.__config__.show() + + env_info['TorchVision'] = torchvision.__version__ + + env_info['OpenCV'] = cv2.__version__ + + env_info['MMCV'] = mmcv.__version__ + env_info['mmcls'] = mmcls.__version__ + 
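A small numeric walk-through (sizes chosen arbitrarily) of the padding and rank-striding performed by the DistributedSampler above, whose num_samples and total_size come from torch's base DistributedSampler: with 10 samples and 4 replicas each rank gets ceil(10/4) = 3 indices, the index list is padded to 12 by repeating its head, and rank r takes every 4th entry starting at r. The zip(*part_list) loop in collect_results_cpu/collect_results_gpu undoes exactly this striding when predictions are reassembled in dataset order.

dataset_len, num_replicas = 10, 4
num_samples = (dataset_len + num_replicas - 1) // num_replicas   # ceil -> 3 per rank
total_size = num_samples * num_replicas                          # 12

indices = list(range(dataset_len))
indices += indices[:total_size - len(indices)]   # pad: [0, 1, ..., 9, 0, 1]
for rank in range(num_replicas):
    print(rank, indices[rank:total_size:num_replicas])
# 0 [0, 4, 8]
# 1 [1, 5, 9]
# 2 [2, 6, 0]
# 3 [3, 7, 1]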
+ return env_info + + +if __name__ == '__main__': + for name, val in collect_env().items(): + print(f'{name}: {val}') diff --git a/mmcls/utils/logger.py b/mmcls/utils/logger.py new file mode 100644 index 00000000..22f9cf50 --- /dev/null +++ b/mmcls/utils/logger.py @@ -0,0 +1,7 @@ +import logging + +from mmcv.utils import get_logger + + +def get_root_logger(log_file=None, log_level=logging.INFO): + return get_logger('mmcls', log_file, log_level) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..e4ff4f78 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +mmcv>=0.3.0 +numpy +torch>=1.1 +torchvision diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..73d5aca3 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,22 @@ +[bdist_wheel] +universal=1 + +[aliases] +test=pytest + +[tool:pytest] +addopts=tests/ + +[yapf] +based_on_style = pep8 +blank_line_before_nested_class_or_def = true +split_before_expression_after_opening_paren = true + +[isort] +line_length = 79 +multi_line_output = 0 +known_standard_library = pkg_resources,setuptools +known_first_party = mmcls +known_third_party = cv2,mmcv,numpy,torch,torchvision +no_lines_before = STDLIB,LOCALFOLDER +default_section = THIRDPARTY diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..b87c227c --- /dev/null +++ b/setup.py @@ -0,0 +1,111 @@ +import os +import subprocess +import time +from setuptools import find_packages, setup + + +def readme(): + with open('README.md', encoding='utf-8') as f: + content = f.read() + return content + + +version_file = 'mmcls/version.py' + + +def get_git_hash(): + + def _minimal_ext_cmd(cmd): + # construct minimal environment + env = {} + for k in ['SYSTEMROOT', 'PATH', 'HOME']: + v = os.environ.get(k) + if v is not None: + env[k] = v + # LANGUAGE is used on win32 + env['LANGUAGE'] = 'C' + env['LANG'] = 'C' + env['LC_ALL'] = 'C' + out = subprocess.Popen( + cmd, stdout=subprocess.PIPE, env=env).communicate()[0] + return out + + try: + out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD']) + sha = out.strip().decode('ascii') + except OSError: + sha = 'unknown' + + return sha + + +def get_hash(): + if os.path.exists('.git'): + sha = get_git_hash()[:7] + elif os.path.exists(version_file): + try: + from mmcls.version import __version__ + sha = __version__.split('+')[-1] + except ImportError: + raise ImportError('Unable to get git version') + else: + sha = 'unknown' + + return sha + + +def write_version_py(): + content = """# GENERATED VERSION FILE +# TIME: {} +__version__ = '{}' +short_version = '{}' +version_info = ({}) +""" + sha = get_hash() + with open('mmcls/VERSION', 'r') as f: + SHORT_VERSION = f.read().strip() + VERSION_INFO = ', '.join(SHORT_VERSION.split('.')) + VERSION = SHORT_VERSION + '+' + sha + + version_file_str = content.format(time.asctime(), VERSION, SHORT_VERSION, + VERSION_INFO) + with open(version_file, 'w') as f: + f.write(version_file_str) + + +def get_version(): + with open(version_file, 'r') as f: + exec(compile(f.read(), version_file, 'exec')) + return locals()['__version__'] + + +def get_requirements(filename='requirements.txt'): + here = os.path.dirname(os.path.realpath(__file__)) + with open(os.path.join(here, filename), 'r') as f: + requires = [line.replace('\n', '') for line in f.readlines()] + return requires + + +if __name__ == '__main__': + write_version_py() + setup( + name='mmcls', + version=get_version(), + description='A template for pytorch projects.', + long_description=readme(), + 
packages=find_packages(exclude=('configs', 'tools', 'demo')), + package_data={'mmcls.ops': ['*/*.so']}, + classifiers=[ + 'Development Status :: 4 - Beta', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + ], + license='Apache License 2.0', + setup_requires=['pytest-runner', 'cython', 'numpy'], + tests_require=['pytest', 'xdoctest'], + install_requires=get_requirements(), + zip_safe=False) diff --git a/tools/dist_test.sh b/tools/dist_test.sh new file mode 100644 index 00000000..3c74ec6e --- /dev/null +++ b/tools/dist_test.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +CONFIG=$1 +CHECKPOINT=$2 +GPUS=$3 +PORT=${PORT:-29500} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ + $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} diff --git a/tools/dist_train.sh b/tools/dist_train.sh new file mode 100644 index 00000000..5b43fffb --- /dev/null +++ b/tools/dist_train.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +CONFIG=$1 +GPUS=$2 +PORT=${PORT:-29500} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ + $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} diff --git a/tools/slurm_test.sh b/tools/slurm_test.sh new file mode 100644 index 00000000..6dd67e57 --- /dev/null +++ b/tools/slurm_test.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +set -x + +PARTITION=$1 +JOB_NAME=$2 +CONFIG=$3 +CHECKPOINT=$4 +GPUS=${GPUS:-8} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +CPUS_PER_TASK=${CPUS_PER_TASK:-5} +PY_ARGS=${@:5} +SRUN_ARGS=${SRUN_ARGS:-""} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +srun -p ${PARTITION} \ + --job-name=${JOB_NAME} \ + --gres=gpu:${GPUS_PER_NODE} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + ${SRUN_ARGS} \ + python -u tools/test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS} diff --git a/tools/slurm_train.sh b/tools/slurm_train.sh new file mode 100644 index 00000000..b3feb3d9 --- /dev/null +++ b/tools/slurm_train.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +set -x + +PARTITION=$1 +JOB_NAME=$2 +CONFIG=$3 +WORK_DIR=$4 +GPUS=${GPUS:-8} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +CPUS_PER_TASK=${CPUS_PER_TASK:-5} +SRUN_ARGS=${SRUN_ARGS:-""} +PY_ARGS=${@:5} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +srun -p ${PARTITION} \ + --job-name=${JOB_NAME} \ + --gres=gpu:${GPUS_PER_NODE} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + ${SRUN_ARGS} \ + python -u tools/train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS} diff --git a/tools/test.py b/tools/test.py new file mode 100644 index 00000000..74b3afe9 --- /dev/null +++ b/tools/test.py @@ -0,0 +1,94 @@ +import argparse +import os + +import mmcv +import torch +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import get_dist_info, init_dist, load_checkpoint + +from mmcls.core import multi_gpu_test, single_gpu_test, wrap_fp16_model +from mmcls.datasets import build_dataloader, build_dataset +from mmcls.models import build_model + + +def parse_args(): + parser = argparse.ArgumentParser(description='mmcls test model') + parser.add_argument('config', help='test 
config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument('--out', help='output result file') + parser.add_argument( + '--eval', + type=str, + nargs='+', + choices=['proposal', 'proposal_fast', 'bbox', 'segm', 'keypoints'], + help='eval types') + parser.add_argument( + '--gpu_collect', + action='store_true', + help='whether to use gpu to collect results') + parser.add_argument('--tmpdir', help='tmp dir for writing some results') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + return args + + +def main(): + args = parse_args() + + cfg = mmcv.Config.fromfile(args.config) + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + cfg.model.pretrained = None + cfg.data.test.test_mode = True + + # init distributed env first, since logger depends on the dist info. + if args.launcher == 'none': + distributed = False + else: + distributed = True + init_dist(args.launcher, **cfg.dist_params) + + # build the dataloader + # TODO: support multiple images per gpu (only minor changes are needed) + dataset = build_dataset(cfg.data.test) + data_loader = build_dataloader( + dataset, + imgs_per_gpu=1, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False) + + # build the model and load checkpoint + model = build_model(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(model) + _ = load_checkpoint(model, args.checkpoint, map_location='cpu') + + if not distributed: + model = MMDataParallel(model, device_ids=[0]) + outputs = single_gpu_test(model, data_loader) + else: + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False) + outputs = multi_gpu_test(model, data_loader, args.tmpdir, + args.gpu_collect) + + rank, _ = get_dist_info() + if args.out and rank == 0: + print(f'\nwriting results to {args.out}') + mmcv.dump(outputs, args.out) + + +if __name__ == '__main__': + main() diff --git a/tools/train.py b/tools/train.py new file mode 100644 index 00000000..9b477b43 --- /dev/null +++ b/tools/train.py @@ -0,0 +1,159 @@ +import argparse +import copy +import os +import os.path as osp +import time + +import mmcv +import torch +from mmcv import Config, DictAction +from mmcv.runner import init_dist + +from mmcls import __version__ +from mmcls.apis import set_random_seed, train_model +from mmcls.datasets import build_dataset +from mmcls.models import build_model +from mmcls.utils import collect_env, get_root_logger + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a model') + parser.add_argument('config', help='train config file path') + parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--resume-from', help='the checkpoint file to resume from') + parser.add_argument( + '--no-validate', + action='store_true', + help='whether not to evaluate the checkpoint during training') + group_gpus = parser.add_mutually_exclusive_group() + group_gpus.add_argument( + '--gpus', + type=int, + help='number of gpus to use ' + '(only applicable to non-distributed training)') + group_gpus.add_argument( + '--gpu-ids', + type=int, + nargs='+', + help='ids of 
gpus to use ' + '(only applicable to non-distributed training)') + parser.add_argument('--seed', type=int, default=None, help='random seed') + parser.add_argument( + '--deterministic', + action='store_true', + help='whether to set deterministic options for CUDNN backend.') + parser.add_argument( + '--options', nargs='+', action=DictAction, help='arguments in dict') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + parser.add_argument( + '--autoscale-lr', + action='store_true', + help='automatically scale lr with the number of gpus') + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + if args.options is not None: + cfg.merge_from_dict(args.options) + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + if args.resume_from is not None: + cfg.resume_from = args.resume_from + if args.gpu_ids is not None: + cfg.gpu_ids = args.gpu_ids + else: + cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) + + if args.autoscale_lr: + # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) + cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 + + # init distributed env first, since logger depends on the dist info. 
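A quick worked example of the --autoscale-lr branch above, using the optimizer lr of 0.02 from configs/example.py as the base value: the linear scaling rule cited in the comment multiplies the configured lr by the number of GPUs in use divided by the 8-GPU baseline.

base_lr = 0.02   # optimizer lr in configs/example.py
for num_gpus in (1, 4, 8, 16):
    print(num_gpus, base_lr * num_gpus / 8)
# 1 0.0025
# 4 0.01
# 8 0.02
# 16 0.04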
+ if args.launcher == 'none': + distributed = False + else: + distributed = True + init_dist(args.launcher, **cfg.dist_params) + + # create work_dir + mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) + # init the logger before other steps + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + log_file = osp.join(cfg.work_dir, f'{timestamp}.log') + logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) + + # init the meta dict to record some important information such as + # environment info and seed, which will be logged + meta = dict() + # log env info + env_info_dict = collect_env() + env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) + dash_line = '-' * 60 + '\n' + logger.info('Environment info:\n' + dash_line + env_info + '\n' + + dash_line) + meta['env_info'] = env_info + + # log some basic info + logger.info(f'Distributed training: {distributed}') + logger.info(f'Config:\n{cfg.pretty_text}') + + # set random seeds + if args.seed is not None: + logger.info(f'Set random seed to {args.seed}, ' + f'deterministic: {args.deterministic}') + set_random_seed(args.seed, deterministic=args.deterministic) + cfg.seed = args.seed + meta['seed'] = args.seed + + model = build_model( + cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg) + + datasets = [build_dataset(cfg.data.train)] + if len(cfg.workflow) == 2: + val_dataset = copy.deepcopy(cfg.data.val) + val_dataset.pipeline = cfg.data.train.pipeline + datasets.append(build_dataset(val_dataset)) + if cfg.checkpoint_config is not None: + # save mmcls version, config file content and class names in + # checkpoints as meta data + cfg.checkpoint_config.meta = dict( + mmcls_version=__version__, + config=cfg.pretty_text, + CLASSES=datasets[0].CLASSES) + # add an attribute for visualization convenience + model.CLASSES = datasets[0].CLASSES + train_model( + model, + datasets, + cfg, + distributed=distributed, + validate=(not args.no_validate), + timestamp=timestamp, + meta=meta) + + +if __name__ == '__main__': + main()
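Finally, a self-contained sketch of the registry pattern that MODELS, DATASETS and PIPELINES in this commit all rely on. The BACKBONES registry and the TinyNet class are hypothetical and exist only for illustration; the bare decorator form matches the @PIPELINES.register_module usage in mmcls/datasets/pipelines/compose.py, and only mmcv and torch are required.

import torch
import torch.nn as nn
from mmcv.utils import Registry, build_from_cfg

# Hypothetical registry, mirroring how MODELS and DATASETS are declared above.
BACKBONES = Registry('backbone')


@BACKBONES.register_module
class TinyNet(nn.Module):
    """Hypothetical backbone used only to demonstrate config-driven building."""

    def __init__(self, num_channels=16):
        super().__init__()
        self.conv = nn.Conv2d(3, num_channels, kernel_size=3, padding=1)

    def forward(self, x):
        return self.conv(x)


# A config dict selects the class via its 'type' key and the remaining keys
# become constructor kwargs, exactly as build_from_cfg is used in
# mmcls/models/builder.py and mmcls/datasets/builder.py.
cfg = dict(type='TinyNet', num_channels=32)
model = build_from_cfg(cfg, BACKBONES)
print(model(torch.randn(1, 3, 32, 32)).shape)   # torch.Size([1, 32, 32, 32])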