init commit
commit
a99e6eb10d
|
@ -0,0 +1,118 @@
|
|||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
**/*.pyc
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# pyenv
|
||||
.python-version
|
||||
|
||||
# celery beat schedule file
|
||||
celerybeat-schedule
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
|
||||
# custom
|
||||
mmcls/version.py
|
||||
data
|
||||
.vscode
|
||||
.idea
|
||||
*.pkl
|
||||
*.pkl.json
|
||||
*.log.json
|
||||
work_dirs/
|
||||
|
||||
# Pytorch
|
||||
*.pth
|
|
@ -0,0 +1,34 @@
|
|||
image: registry.sensetime.com/eig-research/pytorch:1.3.1-cuda10.1-cudnn7-devel
|
||||
|
||||
stages:
|
||||
- linting
|
||||
- test
|
||||
|
||||
before_script:
|
||||
- echo $PATH
|
||||
- gcc --version
|
||||
- python --version
|
||||
- pip --version
|
||||
- nvcc --version
|
||||
- nvidia-smi
|
||||
- python -c "import torch; print(torch.__version__)"
|
||||
|
||||
linting:
|
||||
stage: linting
|
||||
script:
|
||||
- pip install flake8 yapf isort
|
||||
- flake8 .
|
||||
- isort -rc --check-only --diff pytorch-template/ tools/ tests/
|
||||
- yapf -r -d pytorch-template/ tools/ tests/ configs/
|
||||
|
||||
test:
|
||||
stage: test
|
||||
script:
|
||||
- echo "Start building..."
|
||||
- pip install pillow==6.2.2
|
||||
- pip install -e .
|
||||
- python -c "import mmcls; print(mmcls.__version__)"
|
||||
# - echo "Start testing..."
|
||||
# - pip install pytest coverage
|
||||
# - coverage run --source mmcls -m pytest tests/
|
||||
# - coverage report -m
|
|
@ -0,0 +1,27 @@
|
|||
repos:
|
||||
- repo: https://gitlab.com/pycqa/flake8.git
|
||||
rev: 3.8.0
|
||||
hooks:
|
||||
- id: flake8
|
||||
- repo: https://github.com/asottile/seed-isort-config
|
||||
rev: v2.1.0
|
||||
hooks:
|
||||
- id: seed-isort-config
|
||||
- repo: https://github.com/timothycrosley/isort
|
||||
rev: 4.3.21
|
||||
hooks:
|
||||
- id: isort
|
||||
- repo: https://github.com/pre-commit/mirrors-yapf
|
||||
rev: v0.29.0
|
||||
hooks:
|
||||
- id: yapf
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v2.5.0
|
||||
hooks:
|
||||
- id: trailing-whitespace
|
||||
- id: check-yaml
|
||||
- id: end-of-file-fixer
|
||||
- id: requirements-txt-fixer
|
||||
- id: double-quote-string-fixer
|
||||
- id: fix-encoding-pragma
|
||||
args: ["--remove"]
|
|
@ -0,0 +1,59 @@
|
|||
# model settings
model = dict(
    type='xxx',  # placeholder model type; replace with a concrete model
    pretrained='torchvision://resnet50',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),  # feature maps returned from all 4 stages
        frozen_stages=1,
        style='pytorch'))
# dataset settings
dataset_type = 'XXXDataset'  # placeholder dataset class name
# presumably the standard ImageNet RGB mean/std -- confirm for custom data
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = []
test_pipeline = []
data = dict(
    imgs_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file='',
        data_prefix='',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type, ann_file='', data_prefix='',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type, ann_file='', data_prefix='',
        pipeline=test_pipeline))
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)  # no gradient clipping by default
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    step=[8, 11])  # decay LR at epochs 8 and 11
# checkpoint saving
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
# runtime settings
total_epochs = 12
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/xxx'
load_from = None
resume_from = None
workflow = [('train', 1)]
|
|
@ -0,0 +1 @@
|
|||
0.1.0
|
|
@ -0,0 +1,3 @@
|
|||
from .version import __version__, short_version
|
||||
|
||||
__all__ = ['__version__', 'short_version']
|
|
@ -0,0 +1,8 @@
|
|||
from .inference import inference_model, init_model
|
||||
from .test import multi_gpu_test, single_gpu_test
|
||||
from .train import set_random_seed, train_model
|
||||
|
||||
__all__ = [
|
||||
'set_random_seed', 'train_model', 'init_model', 'inference_model',
|
||||
'multi_gpu_test', 'single_gpu_test'
|
||||
]
|
|
@ -0,0 +1,6 @@
|
|||
def init_model():
    """Initialize a model (stub -- not implemented yet)."""
    pass
|
||||
|
||||
|
||||
def inference_model():
    """Run inference with a model (stub -- not implemented yet)."""
    pass
|
|
@ -0,0 +1,152 @@
|
|||
import os.path as osp
|
||||
import pickle
|
||||
import shutil
|
||||
import tempfile
|
||||
|
||||
import mmcv
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
from mmcv.runner import get_dist_info
|
||||
|
||||
|
||||
def single_gpu_test(model,
                    data_loader,
                    show=False,
                    out_dir=None,
                    show_score_thr=0.3):
    """Test a model on a single GPU and collect the results.

    Args:
        model (nn.Module): Model to be tested.
        data_loader (DataLoader): Pytorch data loader.
        show (bool): Whether to visualize results. Currently a no-op
            (see the TODO below).
        out_dir (str | None): Directory to save visualizations. Currently
            a no-op (see the TODO below).
        show_score_thr (float): Score threshold for visualization.
            Currently unused.

    Returns:
        list: Prediction results, one entry appended per batch.
    """
    model.eval()
    results = []
    dataset = data_loader.dataset
    prog_bar = mmcv.ProgressBar(len(dataset))
    for i, data in enumerate(data_loader):
        # inference only; gradients are not needed
        with torch.no_grad():
            result = model(return_loss=False, rescale=True, **data)
        results.append(result)

        if show or out_dir:
            pass  # TODO

        # advance the progress bar once per sample in the batch
        batch_size = data['img'][0].size(0)
        for _ in range(batch_size):
            prog_bar.update()
    return results
|
||||
|
||||
|
||||
def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
    """Test model with multiple gpus.

    This method tests model with multiple gpus and collects the results
    under two different modes: gpu and cpu modes. By setting 'gpu_collect=True'
    it encodes results to gpu tensors and use gpu communication for results
    collection. On cpu mode it saves the results on different gpus to 'tmpdir'
    and collects them by the rank 0 worker.

    Args:
        model (nn.Module): Model to be tested.
        data_loader (nn.Dataloader): Pytorch data loader.
        tmpdir (str): Path of directory to save the temporary results from
            different gpus under cpu mode.
        gpu_collect (bool): Option to use either gpu or cpu to collect results.

    Returns:
        list: The prediction results.
    """
    model.eval()
    results = []
    dataset = data_loader.dataset
    rank, world_size = get_dist_info()
    # only rank 0 drives the progress bar, to avoid duplicated output
    if rank == 0:
        prog_bar = mmcv.ProgressBar(len(dataset))
    for i, data in enumerate(data_loader):
        with torch.no_grad():
            result = model(return_loss=False, rescale=True, **data)
        results.append(result)

        if rank == 0:
            # every rank processes `batch_size` samples per step, so the bar
            # advances batch_size * world_size per iteration
            batch_size = (
                len(data['img_meta']._data)
                if 'img_meta' in data else data['img'][0].size(0))
            for _ in range(batch_size * world_size):
                prog_bar.update()

    # collect results from all ranks
    if gpu_collect:
        results = collect_results_gpu(results, len(dataset))
    else:
        results = collect_results_cpu(results, len(dataset), tmpdir)
    return results
|
||||
|
||||
|
||||
def collect_results_cpu(result_part, size, tmpdir=None):
    """Collect per-rank results through the filesystem (cpu mode).

    Every rank dumps its partial results into ``tmpdir``; rank 0 then loads
    all parts, interleaves them back into dataset order, and cleans up.

    Args:
        result_part (list): Results produced by this rank.
        size (int): Total dataset size, used to drop samples padded by the
            dataloader.
        tmpdir (str | None): Shared directory for the part files. When None,
            rank 0 creates one and broadcasts its path to all ranks.

    Returns:
        list | None: Ordered results on rank 0; None on all other ranks.
    """
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        dir_tensor = torch.full((MAX_LEN, ),
                                32,
                                dtype=torch.uint8,
                                device='cuda')
        if rank == 0:
            tmpdir = tempfile.mkdtemp()
            # encode the path as a fixed-size uint8 tensor so it can be
            # broadcast to the other ranks
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
            dir_tensor[:len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        # decode and strip the whitespace padding
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, f'part_{i}.pkl')
            part_list.append(mmcv.load(part_file))
        # sort the results: zip interleaves the parts back into the original
        # sample order produced by the distributed sampler
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
|
||||
|
||||
|
||||
def collect_results_gpu(result_part, size):
    """Collect per-rank results through GPU communication (gpu mode).

    Each rank pickles its partial results into a uint8 cuda tensor, pads it
    to the longest part, and ``all_gather``s everything; rank 0 unpickles
    the parts and interleaves them back into dataset order.

    Args:
        result_part (list): Results produced by this rank.
        size (int): Total dataset size, used to drop samples padded by the
            dataloader.

    Returns:
        list | None: Ordered results on rank 0; None (implicitly) on all
        other ranks.
    """
    rank, world_size = get_dist_info()
    # dump result part to tensor with pickle
    part_tensor = torch.tensor(
        bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda')
    # gather all result part tensor shape
    shape_tensor = torch.tensor(part_tensor.shape, device='cuda')
    shape_list = [shape_tensor.clone() for _ in range(world_size)]
    dist.all_gather(shape_list, shape_tensor)
    # padding result part tensor to max length (all_gather requires equal
    # shapes on every rank)
    shape_max = torch.tensor(shape_list).max()
    part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda')
    part_send[:shape_tensor[0]] = part_tensor
    part_recv_list = [
        part_tensor.new_zeros(shape_max) for _ in range(world_size)
    ]
    # gather all result part
    dist.all_gather(part_recv_list, part_send)

    if rank == 0:
        part_list = []
        for recv, shape in zip(part_recv_list, shape_list):
            # trim the padding before unpickling
            part_list.append(
                pickle.loads(recv[:shape[0]].cpu().numpy().tobytes()))
        # sort the results
        ordered_results = []
        for res in zip(*part_list):
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        return ordered_results
|
|
@ -0,0 +1,162 @@
|
|||
import random
|
||||
from collections import OrderedDict
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
|
||||
from mmcv.runner import DistSamplerSeedHook, Runner
|
||||
|
||||
from mmcls.core import (DistEvalHook, DistOptimizerHook, EvalHook,
|
||||
Fp16OptimizerHook, build_optimizer)
|
||||
from mmcls.datasets import build_dataloader, build_dataset
|
||||
from mmcls.utils import get_root_logger
|
||||
|
||||
|
||||
def set_random_seed(seed, deterministic=False):
    """Seed every relevant RNG (python, numpy, torch CPU and all GPUs).

    Args:
        seed (int): Seed to be used.
        deterministic (bool): Whether to set the deterministic option for
            CUDNN backend, i.e., set ``torch.backends.cudnn.deterministic``
            to True and ``torch.backends.cudnn.benchmark`` to False.
            Default: False.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed,
                    torch.cuda.manual_seed_all):
        seed_fn(seed)
    if deterministic:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
|
||||
|
||||
|
||||
def parse_losses(losses):
    """Aggregate raw loss outputs into a total loss plus scalar log vars.

    Args:
        losses (dict): Maps a name to either a tensor or a list of tensors.

    Returns:
        tuple: ``(loss, log_vars)`` where ``loss`` is the sum of every entry
        whose name contains ``'loss'`` and ``log_vars`` maps each name to a
        python scalar (averaged over workers when distributed).

    Raises:
        TypeError: If a value is neither a tensor nor a list of tensors.
    """
    log_vars = OrderedDict()
    for name, value in losses.items():
        if isinstance(value, torch.Tensor):
            log_vars[name] = value.mean()
        elif isinstance(value, list):
            log_vars[name] = sum(item.mean() for item in value)
        else:
            raise TypeError(f'{name} is not a tensor or list of tensors')

    total = sum(v for k, v in log_vars.items() if 'loss' in k)
    log_vars['loss'] = total

    for name in log_vars:
        value = log_vars[name]
        # average each value over all workers when running distributed
        if dist.is_available() and dist.is_initialized():
            value = value.data.clone()
            dist.all_reduce(value.div_(dist.get_world_size()))
        log_vars[name] = value.item()

    return total, log_vars
|
||||
|
||||
|
||||
def batch_processor(model, data, train_mode):
    """Process a data batch.

    This method is required as an argument of Runner, which defines how to
    process a data batch and obtain proper outputs. The first 3 arguments of
    batch_processor are fixed.

    Args:
        model (nn.Module): A PyTorch model.
        data (dict): The data batch in a dict.
        train_mode (bool): Training mode or not. It may be useless for some
            models.

    Returns:
        dict: A dict containing losses and log vars.
    """
    # forward pass, then fold the raw losses into (total, scalar logs)
    loss, log_vars = parse_losses(model(**data))
    num_samples = len(data['img'].data)
    return dict(loss=loss, log_vars=log_vars, num_samples=num_samples)
|
||||
|
||||
|
||||
def train_model(model,
                dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    """Launch training: build dataloaders, wrap the model, run the Runner.

    Args:
        model (nn.Module): The model to train.
        dataset (Dataset | list[Dataset]): Training dataset(s); one
            dataloader is built per dataset.
        cfg (Config): Training config (optimizer, hooks, schedules, ...).
        distributed (bool): Whether to wrap with MMDistributedDataParallel.
        validate (bool): Whether to register an evaluation hook.
        timestamp (str | None): Timestamp used to align log filenames.
        meta (dict | None): Extra metadata forwarded to the Runner.
    """
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]

    # NOTE(review): the template config in this commit defines
    # ``data.imgs_per_gpu`` but this reads ``data.samples_per_gpu`` --
    # confirm which config key is expected.
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters', False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = Runner(
        model,
        batch_processor,
        optimizer,
        cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly walkaround to make the .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting: Fp16OptimizerHook replaces the plain optimizer config
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False)
        eval_cfg = cfg.get('evaluation', {})
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    # resume takes precedence over load_from
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
|
|
@ -0,0 +1,3 @@
|
|||
from .evaluation import * # noqa: F401, F403
|
||||
from .fp16 import * # noqa: F401, F403
|
||||
from .utils import * # noqa: F401, F403
|
|
@ -0,0 +1,3 @@
|
|||
from .eval_hooks import EvalHook
|
||||
|
||||
__all__ = ['EvalHook']
|
|
@ -0,0 +1,74 @@
|
|||
import os.path as osp
|
||||
|
||||
from mmcv.runner import Hook
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
|
||||
class EvalHook(Hook):
    """Evaluation hook.

    Runs single-GPU evaluation over ``dataloader`` every ``interval``
    epochs and publishes the metrics through the runner's log buffer.

    Args:
        dataloader (DataLoader): A PyTorch dataloader.
        interval (int): Evaluation interval (by epochs). Default: 1.
    """

    def __init__(self, dataloader, interval=1, **eval_kwargs):
        if not isinstance(dataloader, DataLoader):
            raise TypeError('dataloader must be a pytorch DataLoader, but got'
                            f' {type(dataloader)}')
        self.dataloader = dataloader
        self.interval = interval
        self.eval_kwargs = eval_kwargs

    def after_train_epoch(self, runner):
        if not self.every_n_epochs(runner, self.interval):
            return
        # imported lazily here, at call time rather than module import
        from mmcls.apis import single_gpu_test
        results = single_gpu_test(runner.model, self.dataloader, show=False)
        self.evaluate(runner, results)

    def evaluate(self, runner, results):
        eval_res = self.dataloader.dataset.evaluate(
            results, logger=runner.logger, **self.eval_kwargs)
        runner.log_buffer.output.update(eval_res)
        runner.log_buffer.ready = True
|
||||
|
||||
|
||||
class DistEvalHook(EvalHook):
    """Distributed evaluation hook.

    Args:
        dataloader (DataLoader): A PyTorch dataloader.
        interval (int): Evaluation interval (by epochs). Default: 1.
        tmpdir (str | None): Temporary directory to save the results of all
            processes. Default: None.
        gpu_collect (bool): Whether to use gpu or cpu to collect results.
            Default: False.
    """

    def __init__(self, dataloader, interval=1, gpu_collect=False,
                 **eval_kwargs):
        if not isinstance(dataloader, DataLoader):
            raise TypeError('dataloader must be a pytorch DataLoader, but got '
                            f'{type(dataloader)}')
        self.dataloader = dataloader
        self.interval = interval
        self.gpu_collect = gpu_collect
        self.eval_kwargs = eval_kwargs

    def after_train_epoch(self, runner):
        if not self.every_n_epochs(runner, self.interval):
            return
        # imported lazily here, at call time rather than module import
        from mmcls.apis import multi_gpu_test
        results = multi_gpu_test(
            runner.model,
            self.dataloader,
            tmpdir=osp.join(runner.work_dir, '.eval_hook'),
            gpu_collect=self.gpu_collect)
        # only rank 0 holds the gathered results, so only it evaluates
        if runner.rank == 0:
            print('\n')
            self.evaluate(runner, results)
|
|
@ -0,0 +1,4 @@
|
|||
from .decorators import auto_fp16, force_fp32
|
||||
from .hooks import Fp16OptimizerHook, wrap_fp16_model
|
||||
|
||||
__all__ = ['auto_fp16', 'force_fp32', 'Fp16OptimizerHook', 'wrap_fp16_model']
|
|
@ -0,0 +1,160 @@
|
|||
import functools
|
||||
from inspect import getfullargspec
|
||||
|
||||
import torch
|
||||
|
||||
from .utils import cast_tensor_type
|
||||
|
||||
|
||||
def auto_fp16(apply_to=None, out_fp32=False):
    """Decorator to enable fp16 training automatically.

    This decorator is useful when you write custom modules and want to support
    mixed precision training. If inputs arguments are fp32 tensors, they will
    be converted to fp16 automatically. Arguments other than fp32 tensors are
    ignored.

    Args:
        apply_to (Iterable, optional): The argument names to be converted.
            `None` indicates all arguments.
        out_fp32 (bool): Whether to convert the output back to fp32.

    Returns:
        callable: A decorator for methods of ``nn.Module`` subclasses.

    :Example:

        class MyModule1(nn.Module)

            # Convert x and y to fp16
            @auto_fp16()
            def forward(self, x, y):
                pass

        class MyModule2(nn.Module):

            # convert pred to fp16
            @auto_fp16(apply_to=('pred', ))
            def do_something(self, pred, others):
                pass
    """

    def auto_fp16_wrapper(old_func):

        @functools.wraps(old_func)
        def new_func(*args, **kwargs):
            # check if the module has set the attribute `fp16_enabled`, if not,
            # just fallback to the original method.
            if not isinstance(args[0], torch.nn.Module):
                raise TypeError('@auto_fp16 can only be used to decorate the '
                                'method of nn.Module')
            if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled):
                return old_func(*args, **kwargs)
            # get the arg spec of the decorated method
            args_info = getfullargspec(old_func)
            # get the argument names to be casted
            args_to_cast = args_info.args if apply_to is None else apply_to
            # convert the args that need to be processed
            new_args = []
            # NOTE: default args are not taken into consideration
            if args:
                arg_names = args_info.args[:len(args)]
                for i, arg_name in enumerate(arg_names):
                    if arg_name in args_to_cast:
                        new_args.append(
                            cast_tensor_type(args[i], torch.float, torch.half))
                    else:
                        new_args.append(args[i])
            # convert the kwargs that need to be processed
            new_kwargs = {}
            if kwargs:
                for arg_name, arg_value in kwargs.items():
                    if arg_name in args_to_cast:
                        new_kwargs[arg_name] = cast_tensor_type(
                            arg_value, torch.float, torch.half)
                    else:
                        new_kwargs[arg_name] = arg_value
            # apply converted arguments to the decorated method
            output = old_func(*new_args, **new_kwargs)
            # cast the results back to fp32 if necessary
            if out_fp32:
                output = cast_tensor_type(output, torch.half, torch.float)
            return output

        return new_func

    return auto_fp16_wrapper
|
||||
|
||||
|
||||
def force_fp32(apply_to=None, out_fp16=False):
    """Decorator to convert input arguments to fp32 in force.

    This decorator is useful when you write custom modules and want to support
    mixed precision training. If there are some inputs that must be processed
    in fp32 mode, then this decorator can handle it. If inputs arguments are
    fp16 tensors, they will be converted to fp32 automatically. Arguments other
    than fp16 tensors are ignored.

    Args:
        apply_to (Iterable, optional): The argument names to be converted.
            `None` indicates all arguments.
        out_fp16 (bool): Whether to convert the output back to fp16.

    Returns:
        callable: A decorator for methods of ``nn.Module`` subclasses.

    :Example:

        class MyModule1(nn.Module)

            # Convert x and y to fp32
            @force_fp32()
            def loss(self, x, y):
                pass

        class MyModule2(nn.Module):

            # convert pred to fp32
            @force_fp32(apply_to=('pred', ))
            def post_process(self, pred, others):
                pass
    """

    def force_fp32_wrapper(old_func):

        @functools.wraps(old_func)
        def new_func(*args, **kwargs):
            # check if the module has set the attribute `fp16_enabled`, if not,
            # just fallback to the original method.
            if not isinstance(args[0], torch.nn.Module):
                raise TypeError('@force_fp32 can only be used to decorate the '
                                'method of nn.Module')
            if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled):
                return old_func(*args, **kwargs)
            # get the arg spec of the decorated method
            args_info = getfullargspec(old_func)
            # get the argument names to be casted
            args_to_cast = args_info.args if apply_to is None else apply_to
            # convert the args that need to be processed
            new_args = []
            if args:
                arg_names = args_info.args[:len(args)]
                for i, arg_name in enumerate(arg_names):
                    if arg_name in args_to_cast:
                        new_args.append(
                            cast_tensor_type(args[i], torch.half, torch.float))
                    else:
                        new_args.append(args[i])
            # convert the kwargs that need to be processed
            new_kwargs = dict()
            if kwargs:
                for arg_name, arg_value in kwargs.items():
                    if arg_name in args_to_cast:
                        new_kwargs[arg_name] = cast_tensor_type(
                            arg_value, torch.half, torch.float)
                    else:
                        new_kwargs[arg_name] = arg_value
            # apply converted arguments to the decorated method
            output = old_func(*new_args, **new_kwargs)
            # cast the results back to fp16 if necessary
            if out_fp16:
                output = cast_tensor_type(output, torch.float, torch.half)
            return output

        return new_func

    return force_fp32_wrapper
|
|
@ -0,0 +1,127 @@
|
|||
import copy
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from mmcv.runner import OptimizerHook
|
||||
|
||||
from ..dist_utils import allreduce_grads
|
||||
from .utils import cast_tensor_type
|
||||
|
||||
|
||||
class Fp16OptimizerHook(OptimizerHook):
    """FP16 optimizer hook.

    The steps of fp16 optimizer is as follows.
    1. Scale the loss value.
    2. BP in the fp16 model.
    3. Copy gradients from fp16 model to fp32 weights.
    4. Update fp32 weights.
    5. Copy updated parameters from fp32 weights to fp16 model.

    Refer to https://arxiv.org/abs/1710.03740 for more details.

    Args:
        grad_clip (dict | None): Passed to ``self.clip_grads``; gradients
            are clipped only when not None.
        coalesce (bool): Forwarded to ``allreduce_grads``.
        bucket_size_mb (int): Forwarded to ``allreduce_grads``.
        loss_scale (float): Scale factor multiplied with loss.
        distributed (bool): Whether to all-reduce gradients across workers.
    """

    def __init__(self,
                 grad_clip=None,
                 coalesce=True,
                 bucket_size_mb=-1,
                 loss_scale=512.,
                 distributed=True):
        self.grad_clip = grad_clip
        self.coalesce = coalesce
        self.bucket_size_mb = bucket_size_mb
        self.loss_scale = loss_scale
        self.distributed = distributed

    def before_run(self, runner):
        """Prepare fp32 master weights and convert the model to fp16."""
        # keep a copy of fp32 weights
        runner.optimizer.param_groups = copy.deepcopy(
            runner.optimizer.param_groups)
        # convert model to fp16
        wrap_fp16_model(runner.model)

    def copy_grads_to_fp32(self, fp16_net, fp32_weights):
        """Copy gradients from fp16 model to fp32 weight copy."""
        for fp32_param, fp16_param in zip(fp32_weights, fp16_net.parameters()):
            if fp16_param.grad is not None:
                # lazily allocate the fp32 grad buffer on first use
                if fp32_param.grad is None:
                    fp32_param.grad = fp32_param.data.new(fp32_param.size())
                fp32_param.grad.copy_(fp16_param.grad)

    def copy_params_to_fp16(self, fp16_net, fp32_weights):
        """Copy updated params from fp32 weight copy to fp16 model."""
        for fp16_param, fp32_param in zip(fp16_net.parameters(), fp32_weights):
            fp16_param.data.copy_(fp32_param.data)

    def after_train_iter(self, runner):
        """Run one scaled backward pass and fp32 master-weight update."""
        # clear grads of last iteration
        runner.model.zero_grad()
        runner.optimizer.zero_grad()
        # scale the loss value
        scaled_loss = runner.outputs['loss'] * self.loss_scale
        scaled_loss.backward()
        # copy fp16 grads in the model to fp32 params in the optimizer
        fp32_weights = []
        for param_group in runner.optimizer.param_groups:
            fp32_weights += param_group['params']
        self.copy_grads_to_fp32(runner.model, fp32_weights)
        # allreduce grads
        if self.distributed:
            allreduce_grads(fp32_weights, self.coalesce, self.bucket_size_mb)
        # scale the gradients back
        for param in fp32_weights:
            if param.grad is not None:
                param.grad.div_(self.loss_scale)
        if self.grad_clip is not None:
            self.clip_grads(fp32_weights)
        # update fp32 params
        runner.optimizer.step()
        # copy fp32 params to the fp16 model
        self.copy_params_to_fp16(runner.model, fp32_weights)
|
||||
|
||||
|
||||
def wrap_fp16_model(model):
    """Convert a model to fp16, keeping normalization layers in fp32.

    Args:
        model (nn.Module): Model converted in place.
    """
    # convert all parameters/buffers to half precision
    model.half()
    # patch the normalization layers to make it work in fp32 mode
    patch_norm_fp32(model)
    # flag every submodule that opts in via a `fp16_enabled` attribute
    for module in model.modules():
        if hasattr(module, 'fp16_enabled'):
            module.fp16_enabled = True
|
||||
|
||||
|
||||
def patch_norm_fp32(module):
    """Recursively keep BatchNorm/GroupNorm layers in fp32 inside an fp16
    model, casting their inputs/outputs at the forward boundary.

    Args:
        module (nn.Module): Module patched in place.

    Returns:
        nn.Module: The same module, for chaining.
    """
    norm_types = (nn.modules.batchnorm._BatchNorm, nn.GroupNorm)
    if isinstance(module, norm_types):
        module.float()
        module.forward = patch_forward_method(module.forward, torch.half,
                                              torch.float)
    for child in module.children():
        patch_norm_fp32(child)
    return module
|
||||
|
||||
|
||||
def patch_forward_method(func, src_type, dst_type, convert_output=True):
    """Patch the forward method of a module.

    Args:
        func (callable): The original forward method.
        src_type (torch.dtype): Type of input arguments to be converted from.
        dst_type (torch.dtype): Type of input arguments to be converted to.
        convert_output (bool): Whether to convert the output back to src_type.

    Returns:
        callable: The patched forward method.
    """

    def new_forward(*args, **kwargs):
        cast_args = cast_tensor_type(args, src_type, dst_type)
        cast_kwargs = cast_tensor_type(kwargs, src_type, dst_type)
        output = func(*cast_args, **cast_kwargs)
        if convert_output:
            output = cast_tensor_type(output, dst_type, src_type)
        return output

    return new_forward
|
|
@ -0,0 +1,23 @@
|
|||
from collections import abc
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
|
||||
def cast_tensor_type(inputs, src_type, dst_type):
    """Recursively cast tensors inside a nested container to ``dst_type``.

    Strings and numpy arrays pass through untouched; mappings and other
    iterables are rebuilt with the same container type; anything else is
    returned as-is.

    Args:
        inputs: A tensor, str, ndarray, mapping, iterable, or scalar.
        src_type (torch.dtype): Source dtype (kept for symmetry with callers).
        dst_type (torch.dtype): Target dtype for tensors.

    Returns:
        Same structure as ``inputs`` with tensors cast to ``dst_type``.
    """
    if isinstance(inputs, torch.Tensor):
        return inputs.to(dst_type)
    if isinstance(inputs, (str, np.ndarray)):
        return inputs
    if isinstance(inputs, abc.Mapping):
        return type(inputs)({
            key: cast_tensor_type(value, src_type, dst_type)
            for key, value in inputs.items()
        })
    if isinstance(inputs, abc.Iterable):
        return type(inputs)(
            cast_tensor_type(item, src_type, dst_type) for item in inputs)
    return inputs
|
|
@ -0,0 +1,3 @@
|
|||
from .dist_utils import DistOptimizerHook
|
||||
|
||||
__all__ = ['DistOptimizerHook']
|
|
@ -0,0 +1,16 @@
|
|||
from mmcv.runner import OptimizerHook
|
||||
|
||||
|
||||
class DistOptimizerHook(OptimizerHook):
    """Optimizer hook used for distributed training.

    Args:
        grad_clip (dict | None): Passed to ``self.clip_grads``; gradients
            are clipped only when not None.
        coalesce (bool): Stored for gradient allreduce configuration.
        bucket_size_mb (int): Stored for gradient allreduce configuration.
    """

    def __init__(self, grad_clip=None, coalesce=True, bucket_size_mb=-1):
        self.grad_clip = grad_clip
        self.coalesce = coalesce
        self.bucket_size_mb = bucket_size_mb

    def after_train_iter(self, runner):
        """Backward + (optional) gradient clipping + optimizer step."""
        runner.optimizer.zero_grad()
        runner.outputs['loss'].backward()
        if self.grad_clip is not None:
            self.clip_grads(runner.model.parameters())
        runner.optimizer.step()
|
|
@ -0,0 +1,9 @@
|
|||
from .base_dataset import BaseDataset
|
||||
from .builder import build_dataloader, build_dataset
|
||||
from .pipelines import Compose
|
||||
from .samplers import DistributedSampler
|
||||
|
||||
__all__ = [
|
||||
'BaseDataset', 'build_dataloader', 'build_dataset', 'Compose',
|
||||
'DistributedSampler'
|
||||
]
|
|
@ -0,0 +1,39 @@
|
|||
import copy
|
||||
from abc import ABCMeta, abstractmethod
|
||||
|
||||
from torch.utils.data import Dataset
|
||||
|
||||
from .pipelines import Compose
|
||||
|
||||
|
||||
class BaseDataset(Dataset, metaclass=ABCMeta):
    """Abstract base class for datasets.

    Subclasses must implement :meth:`load_annotations`, which returns the
    per-sample records stored in ``self.data_infos``.

    Args:
        ann_file (str): Path to the annotation file.
        pipeline (list): Transform configs/callables composed into
            ``self.pipeline``.
        data_prefix (str): Prefix of data paths.
        test_mode (bool): If True, ``__getitem__`` prepares test data;
            otherwise it prepares training data.
    """

    def __init__(self, ann_file, pipeline, data_prefix, test_mode):
        super(BaseDataset, self).__init__()

        self.ann_file = ann_file
        self.data_prefix = data_prefix
        self.test_mode = test_mode
        self.pipeline = Compose(pipeline)
        self.data_infos = self.load_annotations()

    @abstractmethod
    def load_annotations(self):
        """Load annotations; must return a sequence of per-sample records."""
        pass

    def prepare_train_data(self, idx):
        # deepcopy so pipeline transforms cannot mutate the cached records
        results = copy.deepcopy(self.data_infos[idx])
        return self.pipeline(results)

    def prepare_test_data(self, idx):
        results = copy.deepcopy(self.data_infos[idx])
        return self.pipeline(results)

    def __len__(self):
        return len(self.data_infos)

    def __getitem__(self, idx):
        # BUG FIX: the branches were swapped — test_mode must select the
        # test-data path, and training mode the train-data path.
        if self.test_mode:
            return self.prepare_test_data(idx)
        else:
            return self.prepare_train_data(idx)
|
|
@ -0,0 +1,100 @@
|
|||
import platform
|
||||
import random
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
from mmcv.parallel import collate
|
||||
from mmcv.runner import get_dist_info
|
||||
from mmcv.utils import Registry, build_from_cfg
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from .samplers import DistributedSampler
|
||||
|
||||
# Raise the soft limit on open file descriptors on non-Windows platforms:
# many DataLoader workers can exhaust the default per-process limit.
if platform.system() != 'Windows':
    # https://github.com/pytorch/pytorch/issues/973
    import resource
    rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
    hard_limit = rlimit[1]
    # cap at 4096 but never exceed the hard limit
    soft_limit = min(4096, hard_limit)
    resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit))

# Registries that dataset and pipeline classes register themselves into.
DATASETS = Registry('dataset')
PIPELINES = Registry('pipeline')
|
||||
|
||||
|
||||
def build_dataset(cfg, default_args=None):
    """Build a dataset from a config.

    A list/tuple of configs yields a ``ConcatDataset``; a config of type
    ``RepeatDataset`` wraps its inner dataset; any other config is built
    from the ``DATASETS`` registry.
    """
    # local import to avoid a circular import at module load time
    from .dataset_wrappers import ConcatDataset, RepeatDataset
    if isinstance(cfg, (list, tuple)):
        parts = [build_dataset(sub_cfg, default_args) for sub_cfg in cfg]
        return ConcatDataset(parts)
    if cfg['type'] == 'RepeatDataset':
        inner = build_dataset(cfg['dataset'], default_args)
        return RepeatDataset(inner, cfg['times'])
    return build_from_cfg(cfg, DATASETS, default_args)
|
||||
|
||||
|
||||
def build_dataloader(dataset,
                     samples_per_gpu,
                     workers_per_gpu,
                     num_gpus=1,
                     dist=True,
                     shuffle=True,
                     seed=None,
                     **kwargs):
    """Build PyTorch DataLoader.

    In distributed training, each GPU/process has a dataloader.
    In non-distributed training, there is only one dataloader for all GPUs.

    Args:
        dataset (Dataset): A PyTorch dataset.
        samples_per_gpu (int): Number of training samples on each GPU, i.e.,
            batch size of each GPU.
        workers_per_gpu (int): How many subprocesses to use for data loading
            for each GPU.
        num_gpus (int): Number of GPUs. Only used in non-distributed training.
        dist (bool): Distributed training/test or not. Default: True.
        shuffle (bool): Whether to shuffle the data at every epoch.
            Default: True.
        seed (int | None): Base seed for deterministic per-worker seeding;
            if None, workers are not explicitly seeded. Default: None.
        kwargs: any keyword argument to be used to initialize DataLoader

    Returns:
        DataLoader: A PyTorch dataloader.
    """
    rank, world_size = get_dist_info()
    if dist:
        sampler = DistributedSampler(
            dataset, world_size, rank, shuffle=shuffle)
        # the sampler owns shuffling; DataLoader must not shuffle again
        shuffle = False
        batch_size = samples_per_gpu
        num_workers = workers_per_gpu
    else:
        sampler = None
        batch_size = num_gpus * samples_per_gpu
        num_workers = num_gpus * workers_per_gpu

    # each worker derives a distinct deterministic seed from
    # (num_workers, rank, worker_id, seed) — see worker_init_fn
    init_fn = partial(
        worker_init_fn, num_workers=num_workers, rank=rank,
        seed=seed) if seed is not None else None

    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        num_workers=num_workers,
        collate_fn=partial(collate, samples_per_gpu=samples_per_gpu),
        shuffle=shuffle,
        worker_init_fn=init_fn,
        **kwargs)

    return data_loader
|
||||
|
||||
|
||||
def worker_init_fn(worker_id, num_workers, rank, seed):
|
||||
# The seed of each worker equals to
|
||||
# num_worker * rank + worker_id + user_seed
|
||||
worker_seed = num_workers * rank + worker_id + seed
|
||||
np.random.seed(worker_seed)
|
||||
random.seed(worker_seed)
|
|
@ -0,0 +1,3 @@
|
|||
from .compose import Compose
|
||||
|
||||
__all__ = ['Compose']
|
|
@ -0,0 +1,42 @@
|
|||
from collections.abc import Sequence
|
||||
|
||||
from mmcv.utils import build_from_cfg
|
||||
|
||||
from ..registry import PIPELINES
|
||||
|
||||
|
||||
@PIPELINES.register_module
class Compose(object):
    """Compose a data pipeline with a sequence of transforms.

    Args:
        transforms (list[dict | callable]):
            Either config dicts of transforms or transform objects.
    """

    def __init__(self, transforms):
        assert isinstance(transforms, Sequence)
        self.transforms = []
        for transform in transforms:
            if isinstance(transform, dict):
                # build the transform object from its config dict
                self.transforms.append(build_from_cfg(transform, PIPELINES))
            elif callable(transform):
                self.transforms.append(transform)
            else:
                raise TypeError('transform must be callable or a dict, but got'
                                f' {type(transform)}')

    def __call__(self, data):
        """Apply each transform in order; return None if any yields None."""
        for transform in self.transforms:
            data = transform(data)
            if data is None:
                return None
        return data

    def __repr__(self):
        body = ''.join(f'\n    {t}' for t in self.transforms)
        return f'{self.__class__.__name__}({body}\n)'
|
|
@ -0,0 +1,3 @@
|
|||
from .distributed_sampler import DistributedSampler
|
||||
|
||||
__all__ = ['DistributedSampler']
|
|
@ -0,0 +1,28 @@
|
|||
import torch
|
||||
from torch.utils.data import DistributedSampler as _DistributedSampler
|
||||
|
||||
|
||||
class DistributedSampler(_DistributedSampler):
    """DistributedSampler with a switchable ``shuffle`` flag.

    Unlike the torch base class, shuffling can be disabled entirely
    (useful for evaluation) while keeping epoch-seeded determinism
    when it is enabled.
    """

    def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
        super().__init__(dataset, num_replicas=num_replicas, rank=rank)
        self.shuffle = shuffle

    def __iter__(self):
        total = len(self.dataset)
        if self.shuffle:
            # deterministically shuffle based on the current epoch
            generator = torch.Generator()
            generator.manual_seed(self.epoch)
            indices = torch.randperm(total, generator=generator).tolist()
        else:
            indices = list(range(total))

        # pad with leading indices so the length divides evenly
        indices += indices[:(self.total_size - len(indices))]
        assert len(indices) == self.total_size

        # each rank takes a strided slice of the padded index list
        rank_indices = indices[self.rank:self.total_size:self.num_replicas]
        assert len(rank_indices) == self.num_samples

        return iter(rank_indices)
|
|
@ -0,0 +1,4 @@
|
|||
from .builder import build_model
|
||||
from .registry import MODELS
|
||||
|
||||
__all__ = ['build_model', 'MODELS']
|
|
@ -0,0 +1,18 @@
|
|||
import torch.nn as nn
|
||||
from mmcv.utils import Registry, build_from_cfg
|
||||
|
||||
# Use the single global model registry from mmcls.models.registry instead of
# creating a second, independent Registry('model') here: the package
# __init__ exports MODELS from .registry, so classes registered through
# that path must be visible to build()/build_model() in this module.
from .registry import MODELS
|
||||
|
||||
|
||||
def build(cfg, registry, default_args=None):
    """Build one module — or an ``nn.Sequential`` of modules — from config.

    Args:
        cfg (dict | list[dict]): A single config or a list of configs.
        registry (Registry): Registry to look the module type up in.
        default_args (dict | None): Default kwargs merged into each config.
    """
    if not isinstance(cfg, list):
        return build_from_cfg(cfg, registry, default_args)
    modules = [build_from_cfg(sub_cfg, registry, default_args)
               for sub_cfg in cfg]
    return nn.Sequential(*modules)
|
||||
|
||||
|
||||
def build_model(cfg, train_cfg=None, test_cfg=None):
    """Build a model from config, injecting train/test cfg as default args."""
    default_args = dict(train_cfg=train_cfg, test_cfg=test_cfg)
    return build(cfg, MODELS, default_args)
|
|
@ -0,0 +1,3 @@
|
|||
from mmcv.utils import Registry
|
||||
|
||||
MODELS = Registry('model')
|
|
@ -0,0 +1,4 @@
|
|||
from .collect_env import collect_env
|
||||
from .logger import get_root_logger
|
||||
|
||||
__all__ = ['collect_env', 'get_root_logger']
|
|
@ -0,0 +1,61 @@
|
|||
import os.path as osp
|
||||
import subprocess
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
|
||||
import cv2
|
||||
import mmcv
|
||||
import torch
|
||||
import torchvision
|
||||
|
||||
import mmcls
|
||||
|
||||
|
||||
def collect_env():
    """Collect environment information for logging and bug reports.

    Returns:
        dict: Platform, Python, CUDA/GPU, compiler and library versions.
    """
    env_info = {}
    env_info['sys.platform'] = sys.platform
    env_info['Python'] = sys.version.replace('\n', '')

    cuda_available = torch.cuda.is_available()
    env_info['CUDA available'] = cuda_available

    if cuda_available:
        from torch.utils.cpp_extension import CUDA_HOME
        env_info['CUDA_HOME'] = CUDA_HOME

        if CUDA_HOME is not None and osp.isdir(CUDA_HOME):
            try:
                nvcc = osp.join(CUDA_HOME, 'bin/nvcc')
                nvcc = subprocess.check_output(
                    f'"{nvcc}" -V | tail -n1', shell=True)
                nvcc = nvcc.decode('utf-8').strip()
            except subprocess.SubprocessError:
                nvcc = 'Not Available'
            env_info['NVCC'] = nvcc

        devices = defaultdict(list)
        for k in range(torch.cuda.device_count()):
            devices[torch.cuda.get_device_name(k)].append(str(k))
        for name, devids in devices.items():
            env_info['GPU ' + ','.join(devids)] = name

    # BUG FIX: the gcc probe was unguarded, so a missing gcc raised
    # CalledProcessError and crashed the whole env collection; report
    # 'Not Available' instead, mirroring the nvcc handling above.
    try:
        gcc = subprocess.check_output('gcc --version | head -n1', shell=True)
        gcc = gcc.decode('utf-8').strip()
    except subprocess.SubprocessError:
        gcc = 'Not Available'
    env_info['GCC'] = gcc

    env_info['PyTorch'] = torch.__version__
    env_info['PyTorch compiling details'] = torch.__config__.show()

    env_info['TorchVision'] = torchvision.__version__

    env_info['OpenCV'] = cv2.__version__

    env_info['MMCV'] = mmcv.__version__
    env_info['mmcls'] = mmcls.__version__

    return env_info
|
||||
|
||||
|
||||
# Print the collected environment info when run as a script.
if __name__ == '__main__':
    for name, val in collect_env().items():
        print(f'{name}: {val}')
|
|
@ -0,0 +1,7 @@
|
|||
import logging
|
||||
|
||||
from mmcv.utils import get_logger
|
||||
|
||||
|
||||
def get_root_logger(log_file=None, log_level=logging.INFO):
    """Return the root 'mmcls' logger, optionally logging to a file.

    Args:
        log_file (str | None): Path of a file to also write log records to.
        log_level (int): Logging level for the logger. Default: INFO.
    """
    logger = get_logger('mmcls', log_file, log_level)
    return logger
|
|
@ -0,0 +1,4 @@
|
|||
mmcv>=0.3.0
|
||||
numpy
|
||||
torch>=1.1
|
||||
torchvision
|
|
@ -0,0 +1,22 @@
|
|||
[bdist_wheel]
|
||||
universal=1
|
||||
|
||||
[aliases]
|
||||
test=pytest
|
||||
|
||||
[tool:pytest]
|
||||
addopts=tests/
|
||||
|
||||
[yapf]
|
||||
based_on_style = pep8
|
||||
blank_line_before_nested_class_or_def = true
|
||||
split_before_expression_after_opening_paren = true
|
||||
|
||||
[isort]
|
||||
line_length = 79
|
||||
multi_line_output = 0
|
||||
known_standard_library = pkg_resources,setuptools
|
||||
known_first_party = mmcls
|
||||
known_third_party = cv2,mmcv,numpy,torch,torchvision
|
||||
no_lines_before = STDLIB,LOCALFOLDER
|
||||
default_section = THIRDPARTY
|
|
@ -0,0 +1,111 @@
|
|||
import os
|
||||
import subprocess
|
||||
import time
|
||||
from setuptools import find_packages, setup
|
||||
|
||||
|
||||
def readme():
    """Read README.md and return its content as the long description."""
    with open('README.md', encoding='utf-8') as f:
        return f.read()
|
||||
|
||||
|
||||
version_file = 'mmcls/version.py'
|
||||
|
||||
|
||||
def get_git_hash():
    """Return the current git commit SHA, or 'unknown' when git is absent."""

    def _minimal_ext_cmd(cmd):
        # construct a minimal, language-neutral environment for git
        env = {
            key: os.environ[key]
            for key in ('SYSTEMROOT', 'PATH', 'HOME')
            if key in os.environ
        }
        # LANGUAGE is used on win32
        env.update(LANGUAGE='C', LANG='C', LC_ALL='C')
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env)
        return proc.communicate()[0]

    try:
        out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'])
        sha = out.strip().decode('ascii')
    except OSError:
        sha = 'unknown'

    return sha
|
||||
|
||||
|
||||
def get_hash():
    """Return a short source id: git SHA prefix, packaged-version suffix,
    or 'unknown' when neither is available."""
    if os.path.exists('.git'):
        return get_git_hash()[:7]
    if os.path.exists(version_file):
        try:
            from mmcls.version import __version__
        except ImportError:
            raise ImportError('Unable to get git version')
        # the part after '+' in 'x.y.z+<sha>' is the recorded hash
        return __version__.split('+')[-1]
    return 'unknown'
|
||||
|
||||
|
||||
def write_version_py():
    """Generate mmcls/version.py from mmcls/VERSION plus the git hash."""
    # NOTE: this template is written verbatim into the generated module.
    content = """# GENERATED VERSION FILE
# TIME: {}
__version__ = '{}'
short_version = '{}'
version_info = ({})
"""
    sha = get_hash()
    # mmcls/VERSION holds the bare semantic version, e.g. '0.1.0'
    with open('mmcls/VERSION', 'r') as f:
        SHORT_VERSION = f.read().strip()
    # '0.1.0' -> '0, 1, 0' for the version_info tuple literal
    VERSION_INFO = ', '.join(SHORT_VERSION.split('.'))
    VERSION = SHORT_VERSION + '+' + sha

    version_file_str = content.format(time.asctime(), VERSION, SHORT_VERSION,
                                      VERSION_INFO)
    with open(version_file, 'w') as f:
        f.write(version_file_str)
|
||||
|
||||
|
||||
def get_version():
    """Execute the generated version file and return its ``__version__``."""
    with open(version_file, 'r') as f:
        source = f.read()
    # exec into this frame's locals, then read the assigned name back
    exec(compile(source, version_file, 'exec'))
    return locals()['__version__']
|
||||
|
||||
|
||||
def get_requirements(filename='requirements.txt'):
    """Read a requirements file located next to this script into a list."""
    here = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(here, filename), 'r') as f:
        requires = [line.replace('\n', '') for line in f]
    return requires
|
||||
|
||||
|
||||
# Script entry point: regenerate mmcls/version.py, then hand off to
# setuptools with the metadata below.
if __name__ == '__main__':
    write_version_py()
    setup(
        name='mmcls',
        version=get_version(),
        description='A template for pytorch projects.',
        long_description=readme(),
        packages=find_packages(exclude=('configs', 'tools', 'demo')),
        # ship prebuilt extension binaries if any exist under mmcls/ops
        package_data={'mmcls.ops': ['*/*.so']},
        classifiers=[
            'Development Status :: 4 - Beta',
            'License :: OSI Approved :: Apache Software License',
            'Operating System :: OS Independent',
            'Programming Language :: Python :: 3',
            'Programming Language :: Python :: 3.5',
            'Programming Language :: Python :: 3.6',
            'Programming Language :: Python :: 3.7',
        ],
        license='Apache License 2.0',
        setup_requires=['pytest-runner', 'cython', 'numpy'],
        tests_require=['pytest', 'xdoctest'],
        install_requires=get_requirements(),
        zip_safe=False)
|
|
@ -0,0 +1,10 @@
|
|||
#!/usr/bin/env bash
# Distributed testing launcher.
# Usage: ./dist_test.sh CONFIG CHECKPOINT GPUS [extra args forwarded to test.py]

CONFIG=$1
CHECKPOINT=$2
GPUS=$3
PORT=${PORT:-29500}

# BUG FIX: quote "$0" and "${@:4}" so a script path or forwarded argument
# containing spaces is not word-split.
PYTHONPATH="$(dirname "$0")/..":$PYTHONPATH \
python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
    "$(dirname "$0")"/test.py $CONFIG $CHECKPOINT --launcher pytorch "${@:4}"
|
|
@ -0,0 +1,9 @@
|
|||
#!/usr/bin/env bash
# Distributed training launcher.
# Usage: ./dist_train.sh CONFIG GPUS [extra args forwarded to train.py]

CONFIG=$1
GPUS=$2
PORT=${PORT:-29500}

# BUG FIX: quote "$0" and "${@:3}" so a script path or forwarded argument
# containing spaces is not word-split.
PYTHONPATH="$(dirname "$0")/..":$PYTHONPATH \
python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
    "$(dirname "$0")"/train.py $CONFIG --launcher pytorch "${@:3}"
|
|
@ -0,0 +1,24 @@
|
|||
#!/usr/bin/env bash
# Slurm testing launcher.
# Usage: GPUS=8 ./slurm_test.sh PARTITION JOB_NAME CONFIG CHECKPOINT [extra test.py args]
# Env overrides: GPUS, GPUS_PER_NODE, CPUS_PER_TASK, SRUN_ARGS.

set -x

PARTITION=$1
JOB_NAME=$2
CONFIG=$3
CHECKPOINT=$4
GPUS=${GPUS:-8}                      # total number of tasks (one per GPU)
GPUS_PER_NODE=${GPUS_PER_NODE:-8}    # GPUs requested on each node
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
PY_ARGS=${@:5}                       # remaining args forwarded to test.py
SRUN_ARGS=${SRUN_ARGS:-""}           # extra args forwarded to srun

# PY_ARGS/SRUN_ARGS are intentionally unquoted below so they word-split
# into separate arguments.
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
    --cpus-per-task=${CPUS_PER_TASK} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u tools/test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS}
|
|
@ -0,0 +1,24 @@
|
|||
#!/usr/bin/env bash
# Slurm training launcher.
# Usage: GPUS=8 ./slurm_train.sh PARTITION JOB_NAME CONFIG WORK_DIR [extra train.py args]
# Env overrides: GPUS, GPUS_PER_NODE, CPUS_PER_TASK, SRUN_ARGS.

set -x

PARTITION=$1
JOB_NAME=$2
CONFIG=$3
WORK_DIR=$4
GPUS=${GPUS:-8}                      # total number of tasks (one per GPU)
GPUS_PER_NODE=${GPUS_PER_NODE:-8}    # GPUs requested on each node
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
SRUN_ARGS=${SRUN_ARGS:-""}           # extra args forwarded to srun
PY_ARGS=${@:5}                       # remaining args forwarded to train.py

# PY_ARGS/SRUN_ARGS are intentionally unquoted below so they word-split
# into separate arguments.
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
    --cpus-per-task=${CPUS_PER_TASK} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u tools/train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS}
|
|
@ -0,0 +1,94 @@
|
|||
import argparse
|
||||
import os
|
||||
|
||||
import mmcv
|
||||
import torch
|
||||
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
|
||||
from mmcv.runner import get_dist_info, init_dist, load_checkpoint
|
||||
|
||||
from mmcls.core import multi_gpu_test, single_gpu_test, wrap_fp16_model
|
||||
from mmcls.datasets import build_dataloader, build_dataset
|
||||
from mmcls.models import build_model
|
||||
|
||||
|
||||
def parse_args():
    """Parse command-line arguments for the test script."""
    parser = argparse.ArgumentParser(description='mmcls test model')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument('--out', help='output result file')
    parser.add_argument(
        '--eval',
        type=str,
        nargs='+',
        choices=['proposal', 'proposal_fast', 'bbox', 'segm', 'keypoints'],
        help='eval types')
    parser.add_argument(
        '--gpu_collect',
        action='store_true',
        help='whether to use gpu to collect results')
    parser.add_argument('--tmpdir', help='tmp dir for writing some results')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    # torch.distributed.launch passes --local_rank; mirror it into the
    # environment for code that reads LOCAL_RANK directly.
    os.environ.setdefault('LOCAL_RANK', str(args.local_rank))
    return args
|
||||
|
||||
|
||||
def main():
    """Entry point: build dataset/model from config and run (dist) testing."""
    args = parse_args()

    cfg = mmcv.Config.fromfile(args.config)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    cfg.model.pretrained = None
    cfg.data.test.test_mode = True

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    # build the dataloader
    # TODO: support multiple images per gpu (only minor changes are needed)
    dataset = build_dataset(cfg.data.test)
    data_loader = build_dataloader(
        dataset,
        # BUG FIX: this keyword was `imgs_per_gpu`, which build_dataloader
        # does not accept — its (required) parameter is `samples_per_gpu`,
        # so the call raised TypeError.
        samples_per_gpu=1,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False)

    # build the model and load checkpoint
    model = build_model(cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        wrap_fp16_model(model)
    _ = load_checkpoint(model, args.checkpoint, map_location='cpu')

    if not distributed:
        model = MMDataParallel(model, device_ids=[0])
        outputs = single_gpu_test(model, data_loader)
    else:
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False)
        outputs = multi_gpu_test(model, data_loader, args.tmpdir,
                                 args.gpu_collect)

    rank, _ = get_dist_info()
    if args.out and rank == 0:
        print(f'\nwriting results to {args.out}')
        mmcv.dump(outputs, args.out)
|
||||
|
||||
|
||||
# Script entry point.
if __name__ == '__main__':
    main()
|
|
@ -0,0 +1,159 @@
|
|||
import argparse
|
||||
import copy
|
||||
import os
|
||||
import os.path as osp
|
||||
import time
|
||||
|
||||
import mmcv
|
||||
import torch
|
||||
from mmcv import Config, DictAction
|
||||
from mmcv.runner import init_dist
|
||||
|
||||
from mmcls import __version__
|
||||
from mmcls.apis import set_random_seed, train_model
|
||||
from mmcls.datasets import build_dataset
|
||||
from mmcls.models import build_model
|
||||
from mmcls.utils import collect_env, get_root_logger
|
||||
|
||||
|
||||
def parse_args():
    """Parse command-line arguments for the training script."""
    parser = argparse.ArgumentParser(description='Train a model')
    parser.add_argument('config', help='train config file path')
    parser.add_argument('--work-dir', help='the dir to save logs and models')
    parser.add_argument(
        '--resume-from', help='the checkpoint file to resume from')
    parser.add_argument(
        '--no-validate',
        action='store_true',
        help='whether not to evaluate the checkpoint during training')
    # --gpus and --gpu-ids are mutually exclusive ways to select devices
    group_gpus = parser.add_mutually_exclusive_group()
    group_gpus.add_argument(
        '--gpus',
        type=int,
        help='number of gpus to use '
        '(only applicable to non-distributed training)')
    group_gpus.add_argument(
        '--gpu-ids',
        type=int,
        nargs='+',
        help='ids of gpus to use '
        '(only applicable to non-distributed training)')
    parser.add_argument('--seed', type=int, default=None, help='random seed')
    parser.add_argument(
        '--deterministic',
        action='store_true',
        help='whether to set deterministic options for CUDNN backend.')
    parser.add_argument(
        '--options', nargs='+', action=DictAction, help='arguments in dict')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    parser.add_argument(
        '--autoscale-lr',
        action='store_true',
        help='automatically scale lr with the number of gpus')
    args = parser.parse_args()
    # mirror --local_rank into the environment for LOCAL_RANK consumers
    os.environ.setdefault('LOCAL_RANK', str(args.local_rank))

    return args
|
||||
|
||||
|
||||
def main():
    """Entry point: parse args, finalize config, set up logging/seeds,
    build datasets and model, then launch training."""
    args = parse_args()

    cfg = Config.fromfile(args.config)
    # allow ad-hoc config overrides from the command line
    if args.options is not None:
        cfg.merge_from_dict(args.options)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # work_dir is determined in this priority: CLI > segment in file > filename
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        cfg.work_dir = osp.join('./work_dirs',
                                osp.splitext(osp.basename(args.config))[0])
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    if args.gpu_ids is not None:
        cfg.gpu_ids = args.gpu_ids
    else:
        cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)

    if args.autoscale_lr:
        # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
        cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    # create work_dir
    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
    # init the logger before other steps
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # init the meta dict to record some important information such as
    # environment info and seed, which will be logged
    meta = dict()
    # log env info
    env_info_dict = collect_env()
    env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
    dash_line = '-' * 60 + '\n'
    logger.info('Environment info:\n' + dash_line + env_info + '\n' +
                dash_line)
    meta['env_info'] = env_info

    # log some basic info
    logger.info(f'Distributed training: {distributed}')
    logger.info(f'Config:\n{cfg.pretty_text}')

    # set random seeds
    if args.seed is not None:
        logger.info(f'Set random seed to {args.seed}, '
                    f'deterministic: {args.deterministic}')
        set_random_seed(args.seed, deterministic=args.deterministic)
    cfg.seed = args.seed
    meta['seed'] = args.seed

    model = build_model(
        cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)

    datasets = [build_dataset(cfg.data.train)]
    # a 2-stage workflow additionally builds a validation dataset, which
    # reuses the *training* pipeline (intentional: same preprocessing)
    if len(cfg.workflow) == 2:
        val_dataset = copy.deepcopy(cfg.data.val)
        val_dataset.pipeline = cfg.data.train.pipeline
        datasets.append(build_dataset(val_dataset))
    if cfg.checkpoint_config is not None:
        # save mmcls version, config file content and class names in
        # checkpoints as meta data
        # NOTE(review): BaseDataset does not define CLASSES in this repo —
        # presumably concrete dataset classes set it; confirm.
        cfg.checkpoint_config.meta = dict(
            mmcls_version=__version__,
            config=cfg.pretty_text,
            CLASSES=datasets[0].CLASSES)
    # add an attribute for visualization convenience
    model.CLASSES = datasets[0].CLASSES
    train_model(
        model,
        datasets,
        cfg,
        distributed=distributed,
        validate=(not args.no_validate),
        timestamp=timestamp,
        meta=meta)
|
||||
|
||||
|
||||
# Script entry point.
if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue