From a8b528de3c3bad64318998315b1c10f267b80e84 Mon Sep 17 00:00:00 2001
From: Linyiqi
Date: Fri, 11 Mar 2022 00:25:34 +0800
Subject: [PATCH] [Enhance] Formatting non distributed training and inference
 and Supporting CPU training. (#42)

* [Docs] update batch size

* Fix bug in non-distributed multi-gpu training/testing

* support cpu training

* update cpu training and testing
---
 docs/en/get_started.md                 | 19 ++++++++++++++
 docs/zh_cn/get_started.md              | 19 ++++++++++++++
 mmfewshot/__init__.py                  |  2 +-
 mmfewshot/classification/apis/train.py | 14 +++++-----
 mmfewshot/detection/apis/train.py      |  4 +--
 tools/classification/test.py           | 36 ++++++++++++++++++++++----
 tools/classification/train.py          | 24 ++++++++++++++---
 tools/detection/test.py                | 24 +++++++++++++++--
 tools/detection/train.py               | 25 ++++++++++++++----
 9 files changed, 142 insertions(+), 25 deletions(-)

diff --git a/docs/en/get_started.md b/docs/en/get_started.md
index 2d7ada1..70e1d6a 100644
--- a/docs/en/get_started.md
+++ b/docs/en/get_started.md
@@ -1,6 +1,7 @@
 ## Test a model

 - single GPU
+- CPU
 - single node multiple GPU
 - multiple node

@@ -10,6 +11,10 @@ You can use the following commands to infer a dataset.
 # single-gpu
 python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]

+# CPU: disable GPUs and run the single-gpu testing script
+export CUDA_VISIBLE_DEVICES=-1
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+
 # multi-gpu
 ./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [optional arguments]

@@ -46,6 +51,20 @@ python tools/train.py ${CONFIG_FILE} [optional arguments]

 If you want to specify the working directory in the command, you can add an argument `--work_dir ${YOUR_WORK_DIR}`.

+### Train on CPU
+
+The process of training on the CPU is the same as single-GPU training; we only need to disable the GPUs before starting.
+
+```shell
+export CUDA_VISIBLE_DEVICES=-1
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+**Note**:
+
+We do not recommend training on the CPU because it is too slow; this feature is supported so that users can conveniently debug on machines without a GPU.
+
+
 ### Train with multiple GPUs

 ```shell
diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started.md
index 2d7ada1..70e1d6a 100644
--- a/docs/zh_cn/get_started.md
+++ b/docs/zh_cn/get_started.md
@@ -1,6 +1,7 @@
 ## Test a model

 - single GPU
+- CPU
 - single node multiple GPU
 - multiple node

@@ -10,6 +11,10 @@ You can use the following commands to infer a dataset.
 # single-gpu
 python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]

+# CPU: disable GPUs and run the single-gpu testing script
+export CUDA_VISIBLE_DEVICES=-1
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+
 # multi-gpu
 ./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [optional arguments]

@@ -46,6 +51,20 @@ python tools/train.py ${CONFIG_FILE} [optional arguments]

 If you want to specify the working directory in the command, you can add an argument `--work_dir ${YOUR_WORK_DIR}`.

+### Train on CPU
+
+The process of training on the CPU is the same as single-GPU training; we only need to disable the GPUs before starting.
+
+```shell
+export CUDA_VISIBLE_DEVICES=-1
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+**Note**:
+
+We do not recommend training on the CPU because it is too slow; this feature is supported so that users can conveniently debug on machines without a GPU.
+
+
 ### Train with multiple GPUs

 ```shell
diff --git a/mmfewshot/__init__.py b/mmfewshot/__init__.py
index 4190edb..cf4c748 100644
--- a/mmfewshot/__init__.py
+++ b/mmfewshot/__init__.py
@@ -32,7 +32,7 @@ assert (digit_version(mmcv_minimum_version) <= mmcv_version
     f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.'

 mmdet_minimum_version = '2.16.0'
-mmdet_maximum_version = '2.21.0'
+mmdet_maximum_version = '2.23.0'
 mmdet_version = digit_version(mmdet.__version__)

diff --git a/mmfewshot/classification/apis/train.py b/mmfewshot/classification/apis/train.py
index 6c9615c..c3e0905 100644
--- a/mmfewshot/classification/apis/train.py
+++ b/mmfewshot/classification/apis/train.py
@@ -1,4 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+import warnings
 from typing import Dict, Union

 import torch
@@ -22,7 +23,7 @@ def train_model(model: Union[MMDataParallel, MMDistributedDataParallel],
                 distributed: bool = False,
                 validate: bool = False,
                 timestamp: str = None,
-                device: str = 'cuda',
+                device: str = None,
                 meta: Dict = None) -> None:
     logger = get_root_logger(log_level=cfg.log_level)

@@ -54,13 +55,14 @@ def train_model(model: Union[MMDataParallel, MMDistributedDataParallel],
             broadcast_buffers=False,
             find_unused_parameters=find_unused_parameters)
     else:
-        if device == 'cuda':
-            model = MMDataParallel(
-                model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
-        elif device == 'cpu':
+        if device == 'cpu':
+            warnings.warn(
+                'The argument `device` is deprecated. To use cpu to train, '
+                'please refer to https://mmclassification.readthedocs.io/en'
+                '/latest/getting_started.html#train-a-model')
             model = model.cpu()
         else:
-            raise ValueError(F'unsupported device name {device}.')
+            model = MMDataParallel(model, device_ids=cfg.gpu_ids)

     # build runner
     optimizer = build_optimizer(model, cfg.optimizer)
diff --git a/mmfewshot/detection/apis/train.py b/mmfewshot/detection/apis/train.py
index ae40c2c..e4e7ef7 100644
--- a/mmfewshot/detection/apis/train.py
+++ b/mmfewshot/detection/apis/train.py
@@ -55,8 +55,8 @@ def train_detector(model: nn.Module,
             broadcast_buffers=False,
             find_unused_parameters=find_unused_parameters)
     else:
-        model = MMDataParallel(
-            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
+        # Please use MMCV >= 1.4.4 for CPU training!
+        model = MMDataParallel(model, device_ids=cfg.gpu_ids)

     # build runner
     optimizer = build_optimizer(model, cfg.optimizer)
diff --git a/tools/classification/test.py b/tools/classification/test.py
index cecd67f..9d9ee2b 100644
--- a/tools/classification/test.py
+++ b/tools/classification/test.py
@@ -3,6 +3,7 @@ import argparse
 import os
 import os.path as osp
 import time
+import warnings

 import mmcv
 import torch
@@ -64,10 +65,19 @@ def parse_args():
         help='whether to set deterministic options for CUDNN backend.')
     parser.add_argument('--local_rank', type=int, default=0)
     parser.add_argument(
-        '--device',
-        choices=['cpu', 'cuda'],
-        default='cuda',
-        help='device used for testing')
+        '--device', default=None, help='device used for testing. (Deprecated)')
+    parser.add_argument(
+        '--gpu-ids',
+        type=int,
+        nargs='+',
+        help='(Deprecated, please use --gpu-id) ids of gpus to use '
+        '(only applicable to non-distributed testing)')
+    parser.add_argument(
+        '--gpu-id',
+        type=int,
+        default=0,
+        help='id of gpu to use '
+        '(only applicable to non-distributed testing)')
     parser.add_argument(
         '--show_task_results',
         action='store_true',
     args = parser.parse_args()
     if 'LOCAL_RANK' not in os.environ:
         os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+    if args.device:
+        warnings.warn(
+            '--device is deprecated. To use cpu to test, please '
+            'refer to https://mmclassification.readthedocs.io/en/latest/'
+            'getting_started.html#inference-with-pretrained-models')
+
+    assert args.metrics or args.out, \
+        'Please specify at least one of output path and evaluation metrics.'
     return args


 def main():
     # use config filename as default work_dir if cfg.work_dir is None
     cfg.work_dir = osp.join('./work_dirs',
                             osp.splitext(osp.basename(args.config))[0])
-
+    if args.gpu_ids is not None:
+        cfg.gpu_ids = args.gpu_ids[0:1]
+        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id` '
+                      'because we only support single GPU mode in '
+                      'non-distributed testing. Using the first GPU '
+                      'in `gpu_ids` now.')
+    else:
+        cfg.gpu_ids = [args.gpu_id]
     # init distributed env first, since logger depends on the dist info.
     if args.launcher == 'none':
         distributed = False
diff --git a/tools/classification/train.py b/tools/classification/train.py
index 83e5a64..9634875 100644
--- a/tools/classification/train.py
+++ b/tools/classification/train.py
@@ -4,6 +4,7 @@ import copy
 import os
 import os.path as osp
 import time
+import warnings

 import cv2
 import mmcv
@@ -45,7 +46,13 @@ def parse_args():
         '--gpu-ids',
         type=int,
         nargs='+',
-        help='ids of gpus to use '
+        help='(Deprecated, please use --gpu-id) ids of gpus to use '
         '(only applicable to non-distributed training)')
+    group_gpus.add_argument(
+        '--gpu-id',
+        type=int,
+        default=0,
+        help='id of gpu to use '
+        '(only applicable to non-distributed training)')
     parser.add_argument('--seed', type=int, default=None, help='random seed')
     parser.add_argument(
@@ -87,10 +94,19 @@ def main():
         osp.splitext(osp.basename(args.config))[0])
     if args.resume_from is not None:
         cfg.resume_from = args.resume_from
+    if args.gpus is not None:
+        cfg.gpu_ids = range(1)
+        warnings.warn('`--gpus` is deprecated because we only support '
+                      'single GPU mode in non-distributed training. '
+                      'Use `gpus=1` now.')
     if args.gpu_ids is not None:
-        cfg.gpu_ids = args.gpu_ids
-    else:
-        cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
+        cfg.gpu_ids = args.gpu_ids[0:1]
+        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id` '
+                      'because we only support single GPU mode in '
+                      'non-distributed training. Using the first GPU '
+                      'in `gpu_ids` now.')
+    if args.gpus is None and args.gpu_ids is None:
+        cfg.gpu_ids = [args.gpu_id]

     # init distributed env first, since logger depends on the dist info.
     if args.launcher == 'none':
diff --git a/tools/detection/test.py b/tools/detection/test.py
index abfd9f2..4158884 100644
--- a/tools/detection/test.py
+++ b/tools/detection/test.py
@@ -27,6 +27,18 @@ def parse_args():
         nargs='+',
         help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
         ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
+    parser.add_argument(
+        '--gpu-ids',
+        type=int,
+        nargs='+',
+        help='(Deprecated, please use --gpu-id) ids of gpus to use '
+        '(only applicable to non-distributed testing)')
+    parser.add_argument(
+        '--gpu-id',
+        type=int,
+        default=0,
+        help='id of gpu to use '
+        '(only applicable to non-distributed testing)')
     parser.add_argument('--show', action='store_true', help='show results')
     parser.add_argument(
         '--show-dir', help='directory where painted images will be saved')
@@ -116,7 +128,14 @@ def main():
     # currently only support single images testing
     samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
     assert samples_per_gpu == 1, 'currently only support single images testing'
-
+    if args.gpu_ids is not None:
+        cfg.gpu_ids = args.gpu_ids[0:1]
+        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id` '
+                      'because we only support single GPU mode in '
+                      'non-distributed testing. Using the first GPU '
+                      'in `gpu_ids` now.')
+    else:
+        cfg.gpu_ids = [args.gpu_id]
     # init distributed env first, since logger depends on the dist info.
     if args.launcher == 'none':
         distributed = False
@@ -176,7 +195,8 @@ def main():
         shuffle=False)

     if not distributed:
-        model = MMDataParallel(model, device_ids=[0])
+        # Please use MMCV >= 1.4.4 for CPU testing!
+        model = MMDataParallel(model, device_ids=cfg.gpu_ids)
         show_kwargs = dict(show_score_thr=args.show_score_thr)
     if cfg.data.get('model_init', None) is not None:
         from mmfewshot.detection.apis import (single_gpu_model_init,
diff --git a/tools/detection/train.py b/tools/detection/train.py
index fa0c28a..09d83b1 100644
--- a/tools/detection/train.py
+++ b/tools/detection/train.py
@@ -48,8 +48,14 @@ def parse_args():
         '--gpu-ids',
         type=int,
         nargs='+',
-        help='ids of gpus to use '
+        help='(Deprecated, please use --gpu-id) ids of gpus to use '
         '(only applicable to non-distributed training)')
+    parser.add_argument(
+        '--gpu-id',
+        type=int,
+        default=0,
+        help='id of gpu to use '
+        '(only applicable to non-distributed training)')
     parser.add_argument('--seed', type=int, default=None, help='random seed')
     parser.add_argument(
         '--deterministic',
@@ -119,15 +125,24 @@ def main():
         osp.splitext(osp.basename(args.config))[0])
     if args.resume_from is not None:
         cfg.resume_from = args.resume_from
+    if args.gpus is not None:
+        cfg.gpu_ids = range(1)
+        warnings.warn('`--gpus` is deprecated because we only support '
+                      'single GPU mode in non-distributed training. '
+                      'Use `gpus=1` now.')
     if args.gpu_ids is not None:
-        cfg.gpu_ids = args.gpu_ids
-    else:
-        cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
+        cfg.gpu_ids = args.gpu_ids[0:1]
+        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id` '
+                      'because we only support single GPU mode in '
+                      'non-distributed training. Using the first GPU '
+                      'in `gpu_ids` now.')
+    if args.gpus is None and args.gpu_ids is None:
+        cfg.gpu_ids = [args.gpu_id]

     # init distributed env first, since logger depends on the dist info.
     if args.launcher == 'none':
         distributed = False
-        rank, world_size = get_dist_info()
+        rank = 0
     else:
         distributed = True
         init_dist(args.launcher, **cfg.dist_params)
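
For quick reference, a minimal usage sketch of the updated entry points once this patch is applied; `${CONFIG_FILE}` and `${CHECKPOINT_FILE}` are placeholders, and the detection scripts stand in for any of the four updated tools:

```shell
# non-distributed run pinned to one device via the new --gpu-id flag
python tools/detection/train.py ${CONFIG_FILE} --gpu-id 0

# CPU-only debugging: hide all GPUs and reuse the same single-GPU scripts;
# requires MMCV >= 1.4.4, which lets MMDataParallel fall back to the CPU
export CUDA_VISIBLE_DEVICES=-1
python tools/detection/train.py ${CONFIG_FILE}
python tools/detection/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE}
```

The deprecated `--gpus` and `--gpu-ids` flags still parse, but each now collapses to a single device and emits a deprecation warning.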