[Enhance] New-style CPU training and inference. (#674)
* [Enhance] New-style CPU training and inference.
* Add version check in CPU training/test

pull/679/head
parent
5de480ea9e
commit
8ef5aeda40
|
@ -58,6 +58,7 @@ python demo/image_demo.py demo/demo.JPEG configs/resnet/resnet50_8xb32_in1k.py \
|
|||
### Inference and test a dataset
|
||||
|
||||
- single GPU
|
||||
- CPU
|
||||
- single node multiple GPU
|
||||
- multiple node
|
||||
|
||||
|
@ -67,6 +68,10 @@ You can use the following commands to infer a dataset.
|
|||
# single-gpu
|
||||
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--metrics ${METRICS}] [--out ${RESULT_FILE}]
|
||||
|
||||
# CPU: disable GPUs and run single-gpu testing script
|
||||
export CUDA_VISIBLE_DEVICES=-1
|
||||
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--metrics ${METRICS}] [--out ${RESULT_FILE}]
|
||||
|
||||
# multi-gpu
|
||||
./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [--metrics ${METRICS}] [--out ${RESULT_FILE}]
|
||||
|
||||
|
@ -111,6 +116,20 @@ python tools/train.py ${CONFIG_FILE} [optional arguments]
|
|||
|
||||
If you want to specify the working directory in the command, you can add an argument `--work-dir ${YOUR_WORK_DIR}`.
|
||||
|
||||
### Train with CPU
|
||||
|
||||
The process of training on the CPU is consistent with single GPU training. We just need to disable GPUs before the training process.
|
||||
|
||||
```shell
|
||||
export CUDA_VISIBLE_DEVICES=-1
|
||||
```
|
||||
|
||||
And then run the script [above](#train-with-a-single-gpu).
|
||||
|
||||
```{warning}
|
||||
We do not recommend using the CPU for training because it is too slow. This feature is supported so that users can conveniently debug on machines without a GPU.
|
||||
```
|
||||
|
||||
### Train with multiple GPUs
|
||||
|
||||
```shell
|
||||
|
|
|
@ -58,6 +58,7 @@ python demo/image_demo.py demo/demo.JPEG configs/resnet/resnet50_8xb32_in1k.py \
|
|||
### 数据集的推理与测试
|
||||
|
||||
- 支持单 GPU
|
||||
- 支持 CPU
|
||||
- 支持单节点多 GPU
|
||||
- 支持多节点
|
||||
|
||||
|
@ -67,6 +68,10 @@ python demo/image_demo.py demo/demo.JPEG configs/resnet/resnet50_8xb32_in1k.py \
|
|||
# 单 GPU
|
||||
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--metrics ${METRICS}] [--out ${RESULT_FILE}]
|
||||
|
||||
# CPU: 禁用 GPU 并运行单 GPU 测试脚本
|
||||
export CUDA_VISIBLE_DEVICES=-1
|
||||
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--metrics ${METRICS}] [--out ${RESULT_FILE}]
|
||||
|
||||
# 多 GPU
|
||||
./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [--metrics ${METRICS}] [--out ${RESULT_FILE}]
|
||||
|
||||
|
@ -109,6 +114,20 @@ python tools/train.py ${CONFIG_FILE} [optional arguments]
|
|||
|
||||
如果用户想在命令中指定工作目录,则需要增加参数 `--work-dir ${YOUR_WORK_DIR}`
|
||||
|
||||
### 使用 CPU 训练
|
||||
|
||||
使用 CPU 训练的流程和使用单 GPU 训练的流程一致,我们仅需要在训练流程开始前禁用 GPU。
|
||||
|
||||
```shell
|
||||
export CUDA_VISIBLE_DEVICES=-1
|
||||
```
|
||||
|
||||
之后运行单 GPU 训练脚本即可。
|
||||
|
||||
```{warning}
|
||||
我们不推荐用户使用 CPU 进行训练,这太过缓慢。我们支持这个功能是为了方便用户在没有 GPU 的机器上进行调试。
|
||||
```
|
||||
|
||||
### 使用多个 GPU 进行训练
|
||||
|
||||
```shell
|
||||
|
@ -148,7 +167,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 ./tools/dist_train.sh ${CONFIG_FILE} 4
|
|||
CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 ./tools/dist_train.sh ${CONFIG_FILE} 4
|
||||
```
|
||||
|
||||
如果用户在 slurm 集群下启动多个训练任务,则需要修改配置文件(通常是配置文件的倒数第 6 行)中的 `dist_params` 变量,以设置不同的通信端口。
|
||||
如果用户在 slurm 集群下启动多个训练任务,则需要修改配置文件中的 `dist_params` 变量,以设置不同的通信端口。
|
||||
|
||||
在 `config1.py` 中,
|
||||
|
||||
|
|
|
@ -89,7 +89,7 @@ def train_model(model,
|
|||
distributed=False,
|
||||
validate=False,
|
||||
timestamp=None,
|
||||
device='cuda',
|
||||
device=None,
|
||||
meta=None):
|
||||
logger = get_root_logger()
|
||||
|
||||
|
@ -122,13 +122,19 @@ def train_model(model,
|
|||
broadcast_buffers=False,
|
||||
find_unused_parameters=find_unused_parameters)
|
||||
else:
|
||||
if device == 'cuda':
|
||||
model = MMDataParallel(
|
||||
model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
|
||||
elif device == 'cpu':
|
||||
if device == 'cpu':
|
||||
warnings.warn(
|
||||
'The argument `device` is deprecated. To use cpu to train, '
|
||||
'please refers to https://mmclassification.readthedocs.io/en'
|
||||
'/latest/getting_started.html#train-a-model')
|
||||
model = model.cpu()
|
||||
else:
|
||||
raise ValueError(F'unsupported device name {device}.')
|
||||
model = MMDataParallel(model, device_ids=cfg.gpu_ids)
|
||||
if not model.device_ids:
|
||||
from mmcv import digit_version, __version__
|
||||
assert digit_version(__version__) >= (1, 4, 4), \
|
||||
'To train with CPU, please confirm your mmcv version ' \
|
||||
'is not lower than v1.4.4'
|
||||
|
||||
# build runner
|
||||
optimizer = build_optimizer(model, cfg.optimizer)
|
||||
|
|
|
@ -88,17 +88,20 @@ def parse_args():
|
|||
action=DictAction,
|
||||
help='custom options for show_result. key-value pair in xxx=yyy.'
|
||||
'Check available options in `model.show_result`.')
|
||||
parser.add_argument(
|
||||
'--device', default=None, help='device used for testing. (Deprecated)')
|
||||
parser.add_argument(
|
||||
'--gpu-ids',
|
||||
type=int,
|
||||
nargs='+',
|
||||
help='ids of gpus to use '
|
||||
'(only applicable to non-distributed testing)')
|
||||
parser.add_argument(
|
||||
'--launcher',
|
||||
choices=['none', 'pytorch', 'slurm', 'mpi'],
|
||||
default='none',
|
||||
help='job launcher')
|
||||
parser.add_argument('--local_rank', type=int, default=0)
|
||||
parser.add_argument(
|
||||
'--device',
|
||||
choices=['cpu', 'cuda'],
|
||||
default='cuda',
|
||||
help='device used for testing')
|
||||
args = parser.parse_args()
|
||||
if 'LOCAL_RANK' not in os.environ:
|
||||
os.environ['LOCAL_RANK'] = str(args.local_rank)
|
||||
|
@ -111,6 +114,15 @@ def parse_args():
|
|||
warnings.warn('--options is deprecated in favor of --cfg-options')
|
||||
args.cfg_options = args.options
|
||||
|
||||
if args.device:
|
||||
warnings.warn(
|
||||
'--device is deprecated. To use cpu to test, please '
|
||||
'refers to https://mmclassification.readthedocs.io/en/latest/'
|
||||
'getting_started.html#inference-with-pretrained-models')
|
||||
|
||||
assert args.metrics or args.out, \
|
||||
'Please specify at least one of output path and evaluation metrics.'
|
||||
|
||||
return args
|
||||
|
||||
|
||||
|
@ -130,12 +142,19 @@ def main():
|
|||
cfg.model.pretrained = None
|
||||
cfg.data.test.test_mode = True
|
||||
|
||||
assert args.metrics or args.out, \
|
||||
'Please specify at least one of output path and evaluation metrics.'
|
||||
if args.gpu_ids is not None:
|
||||
cfg.gpu_ids = args.gpu_ids
|
||||
else:
|
||||
cfg.gpu_ids = range(1)
|
||||
|
||||
# init distributed env first, since logger depends on the dist info.
|
||||
if args.launcher == 'none':
|
||||
distributed = False
|
||||
if len(cfg.gpu_ids) > 1:
|
||||
warnings.warn(f'The gpu-ids is reset from {cfg.gpu_ids} to '
|
||||
f'{cfg.gpu_ids[0:1]} to avoid potential error in '
|
||||
'non-distribute testing time.')
|
||||
cfg.gpu_ids = cfg.gpu_ids[0:1]
|
||||
else:
|
||||
distributed = True
|
||||
init_dist(args.launcher, **cfg.dist_params)
|
||||
|
@ -171,7 +190,11 @@ def main():
|
|||
if args.device == 'cpu':
|
||||
model = model.cpu()
|
||||
else:
|
||||
model = MMDataParallel(model, device_ids=[0])
|
||||
model = MMDataParallel(model, device_ids=cfg.gpu_ids)
|
||||
if not model.device_ids:
|
||||
assert mmcv.digit_version(mmcv.__version__) >= (1, 4, 4), \
|
||||
'To test with CPU, please confirm your mmcv version ' \
|
||||
'is not lower than v1.4.4'
|
||||
model.CLASSES = CLASSES
|
||||
show_kwargs = {} if args.show_options is None else args.show_options
|
||||
outputs = single_gpu_test(model, data_loader, args.show, args.show_dir,
|
||||
|
|
|
@ -29,7 +29,8 @@ def parse_args():
|
|||
action='store_true',
|
||||
help='whether not to evaluate the checkpoint during training')
|
||||
group_gpus = parser.add_mutually_exclusive_group()
|
||||
group_gpus.add_argument('--device', help='device used for training')
|
||||
group_gpus.add_argument(
|
||||
'--device', help='device used for training. (Deprecated)')
|
||||
group_gpus.add_argument(
|
||||
'--gpus',
|
||||
type=int,
|
||||
|
@ -81,6 +82,12 @@ def parse_args():
|
|||
warnings.warn('--options is deprecated in favor of --cfg-options')
|
||||
args.cfg_options = args.options
|
||||
|
||||
if args.device:
|
||||
warnings.warn(
|
||||
'--device is deprecated. To use cpu to train, please '
|
||||
'refers to https://mmclassification.readthedocs.io/en/latest/'
|
||||
'getting_started.html#train-a-model')
|
||||
|
||||
return args
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue