[Enhance] Format non-distributed training and inference, and support CPU training. (#42)

* [Docs] Update batch size
* Fix bug in non-distributed multi-GPU training/testing
* Support CPU training
* Update CPU training and testing

parent 501a6db963
commit a8b528de3c

@@ -1,6 +1,7 @@
 ## Test a model
 
 - single GPU
+- CPU
 - single node multiple GPU
 - multiple node
 
@@ -10,6 +11,10 @@ You can use the following commands to infer a dataset.
 # single-gpu
 python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
 
+# CPU: disable GPUs and run single-gpu testing script
+export CUDA_VISIBLE_DEVICES=-1
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+
 # multi-gpu
 ./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [optional arguments]
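The CPU path above works because hiding all devices makes the single-GPU script fall back to CPU. A minimal sketch of the underlying check (plain PyTorch behaviour, not mmfewshot-specific code):

```python
import os

# Hide every GPU before torch initialises CUDA, mirroring
# `export CUDA_VISIBLE_DEVICES=-1` from the shell snippet above.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import torch  # noqa: E402

# With no visible devices, the single-gpu testing script ends up on CPU.
assert not torch.cuda.is_available()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'testing will run on: {device}')  # -> cpu
```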
@@ -46,6 +51,20 @@ python tools/train.py ${CONFIG_FILE} [optional arguments]
 
 If you want to specify the working directory in the command, you can add an argument `--work_dir ${YOUR_WORK_DIR}`.
 
+### Train on CPU
+
+The process of training on the CPU is consistent with single GPU training. We just need to disable GPUs before the training process.
+
+```shell
+export CUDA_VISIBLE_DEVICES=-1
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+**Note**:
+
+We do not recommend users to use CPU for training because it is too slow. We support this feature to allow users to debug on machines without GPU for convenience.
+
 ### Train with multiple GPUs
 
 ```shell
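For completeness, the same environment override can be applied when launching the training script programmatically. A hedged sketch (the config path is only a placeholder, not a file from the repo):

```python
import os
import subprocess

# Equivalent of `export CUDA_VISIBLE_DEVICES=-1` followed by
# `python tools/train.py ${CONFIG_FILE}`; GPUs stay hidden only
# for the child process, not for the current shell.
env = dict(os.environ, CUDA_VISIBLE_DEVICES='-1')
subprocess.run(
    ['python', 'tools/train.py', 'configs/example_config.py'],  # placeholder config
    env=env,
    check=True,
)
```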
@@ -32,7 +32,7 @@ assert (digit_version(mmcv_minimum_version) <= mmcv_version
     f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.'
 
 mmdet_minimum_version = '2.16.0'
-mmdet_maximum_version = '2.21.0'
+mmdet_maximum_version = '2.23.0'
 mmdet_version = digit_version(mmdet.__version__)
 
 
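The hunk above only widens the accepted MMDetection range to 2.23.0; the surrounding assert (partially visible as context) keeps enforcing it. A hedged sketch of that guard using the same `digit_version` helper, with an illustrative error message:

```python
import mmdet
from mmcv.utils import digit_version

mmdet_minimum_version = '2.16.0'
mmdet_maximum_version = '2.23.0'  # raised from 2.21.0 by this commit
mmdet_version = digit_version(mmdet.__version__)

# digit_version turns '2.23.0' into a comparable tuple, so the window
# check below is a plain tuple comparison.
assert (digit_version(mmdet_minimum_version) <= mmdet_version
        <= digit_version(mmdet_maximum_version)), \
    f'MMDetection {mmdet.__version__} is incompatible; please install ' \
    f'mmdet>={mmdet_minimum_version}, <={mmdet_maximum_version}.'
```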
@@ -1,4 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+import warnings
 from typing import Dict, Union
 
 import torch
@@ -22,7 +23,7 @@ def train_model(model: Union[MMDataParallel, MMDistributedDataParallel],
                 distributed: bool = False,
                 validate: bool = False,
                 timestamp: str = None,
-                device: str = 'cuda',
+                device: str = None,
                 meta: Dict = None) -> None:
     logger = get_root_logger(log_level=cfg.log_level)
 
@@ -54,13 +55,14 @@ def train_model(model: Union[MMDataParallel, MMDistributedDataParallel],
             broadcast_buffers=False,
             find_unused_parameters=find_unused_parameters)
     else:
-        if device == 'cuda':
-            model = MMDataParallel(
-                model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
-        elif device == 'cpu':
+        if device == 'cpu':
+            warnings.warn(
+                'The argument `device` is deprecated. To use cpu to train, '
+                'please refers to https://mmclassification.readthedocs.io/en'
+                '/latest/getting_started.html#train-a-model')
             model = model.cpu()
         else:
             raise ValueError(F'unsupported device name {device}.')
+        model = MMDataParallel(model, device_ids=cfg.gpu_ids)
-
     # build runner
     optimizer = build_optimizer(model, cfg.optimizer)
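Taken together, the hunks above deprecate the `device` argument and route the non-distributed case through `MMDataParallel(model, device_ids=cfg.gpu_ids)`. A condensed sketch of the resulting control flow, not the verbatim function body (the helper name `wrap_model` is invented for illustration):

```python
import warnings

import torch
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel


def wrap_model(model, cfg, distributed=False, device=None):
    """Sketch of how the model wrapper is chosen after this change."""
    if distributed:
        # Launched via a dist launcher: one process per GPU.
        return MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False)
    if device == 'cpu':
        # Deprecated path kept for backward compatibility; prefer
        # hiding GPUs with CUDA_VISIBLE_DEVICES=-1 instead.
        warnings.warn('The argument `device` is deprecated; '
                      'set CUDA_VISIBLE_DEVICES=-1 to train on CPU.')
        model = model.cpu()
    # Single-GPU, or CPU with MMCV >= 1.4.4.
    return MMDataParallel(model, device_ids=cfg.gpu_ids)
```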
@@ -55,8 +55,8 @@ def train_detector(model: nn.Module,
             broadcast_buffers=False,
             find_unused_parameters=find_unused_parameters)
     else:
-        model = MMDataParallel(
-            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
+        # Please use MMCV >= 1.4.4 for CPU training!
+        model = MMDataParallel(model, device_ids=cfg.gpu_ids)
 
     # build runner
     optimizer = build_optimizer(model, cfg.optimizer)
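The new comment points at the MMCV 1.4.4 requirement for CPU execution through `MMDataParallel`. A hedged sketch of how one could surface that requirement up front (the error message wording is illustrative):

```python
import mmcv
import torch
from mmcv.utils import digit_version

# CPU training/testing via MMDataParallel needs MMCV >= 1.4.4,
# per the comment added in the hunk above.
if not torch.cuda.is_available():
    if digit_version(mmcv.__version__) < digit_version('1.4.4'):
        raise RuntimeError(
            f'MMCV {mmcv.__version__} is too old for CPU training; '
            'please upgrade to mmcv-full >= 1.4.4.')
```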
@@ -3,6 +3,7 @@ import argparse
 import os
 import os.path as osp
 import time
+import warnings
 
 import mmcv
 import torch
@@ -64,10 +65,19 @@ def parse_args():
         help='whether to set deterministic options for CUDNN backend.')
     parser.add_argument('--local_rank', type=int, default=0)
     parser.add_argument(
-        '--device',
-        choices=['cpu', 'cuda'],
-        default='cuda',
-        help='device used for testing')
+        '--device', default=None, help='device used for testing. (Deprecated)')
+    parser.add_argument(
+        '--gpu-ids',
+        type=int,
+        nargs='+',
+        help='(Deprecated, please use --gpu-id) ids of gpus to use '
+        '(only applicable to non-distributed testing)')
+    parser.add_argument(
+        '--gpu-id',
+        type=int,
+        default=0,
+        help='id of gpu to use '
+        '(only applicable to non-distributed testing)')
     parser.add_argument(
         '--show_task_results',
         action='store_true',
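The option surface after this hunk: `--device` stays only as a deprecated flag, `--gpu-ids` is deprecated, and `--gpu-id` (default 0) becomes the supported way to pick a GPU. A stand-alone sketch reproducing just these three options:

```python
import argparse

# Only the flags touched by the hunk above; the real parser has many more.
parser = argparse.ArgumentParser(description='sketch of the updated test flags')
parser.add_argument(
    '--device', default=None, help='device used for testing. (Deprecated)')
parser.add_argument(
    '--gpu-ids',
    type=int,
    nargs='+',
    help='(Deprecated, please use --gpu-id) ids of gpus to use')
parser.add_argument(
    '--gpu-id', type=int, default=0, help='id of gpu to use')

args = parser.parse_args(['--gpu-id', '1'])
print(args.gpu_id, args.gpu_ids, args.device)  # -> 1 None None
```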
@@ -75,6 +85,15 @@ def parse_args():
     args = parser.parse_args()
     if 'LOCAL_RANK' not in os.environ:
         os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+    if args.device:
+        warnings.warn(
+            '--device is deprecated. To use cpu to test, please '
+            'refers to https://mmclassification.readthedocs.io/en/latest/'
+            'getting_started.html#inference-with-pretrained-models')
+
     assert args.metrics or args.out, \
         'Please specify at least one of output path and evaluation metrics.'
     return args
@@ -96,7 +115,14 @@ def main():
         # use config filename as default work_dir if cfg.work_dir is None
         cfg.work_dir = osp.join('./work_dirs',
                                 osp.splitext(osp.basename(args.config))[0])
-
+    if args.gpu_ids is not None:
+        cfg.gpu_ids = args.gpu_ids[0:1]
+        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
+                      'Because we only support single GPU mode in '
+                      'non-distributed testing. Use the first GPU '
+                      'in `gpu_ids` now.')
+    else:
+        cfg.gpu_ids = [args.gpu_id]
     # init distributed env first, since logger depends on the dist info.
     if args.launcher == 'none':
         distributed = False
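The `main()` change boils down to a small fallback: a deprecated `--gpu-ids` list is truncated to its first entry, otherwise `--gpu-id` is used. A hedged, stand-alone sketch of that rule (the function name is invented):

```python
import warnings


def resolve_test_gpu_ids(gpu_ids=None, gpu_id=0):
    """Keep exactly one GPU id for non-distributed testing."""
    if gpu_ids is not None:
        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`; '
                      'only the first id is used in non-distributed testing.')
        return gpu_ids[0:1]
    return [gpu_id]


print(resolve_test_gpu_ids([3, 5]))  # -> [3]
print(resolve_test_gpu_ids())        # -> [0]
```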
@@ -4,6 +4,7 @@ import copy
 import os
 import os.path as osp
 import time
+import warnings
 
 import cv2
 import mmcv
@@ -45,7 +46,13 @@ def parse_args():
         '--gpu-ids',
         type=int,
         nargs='+',
-        help='ids of gpus to use '
+        help='(Deprecated, please use --gpu-id) ids of gpus to use '
         '(only applicable to non-distributed training)')
+    group_gpus.add_argument(
+        '--gpu-id',
+        type=int,
+        default=0,
+        help='id of gpu to use '
+        '(only applicable to non-distributed training)')
     parser.add_argument('--seed', type=int, default=None, help='random seed')
     parser.add_argument(
@@ -87,10 +94,19 @@ def main():
                                 osp.splitext(osp.basename(args.config))[0])
     if args.resume_from is not None:
         cfg.resume_from = args.resume_from
+    if args.gpus is not None:
+        cfg.gpu_ids = range(1)
+        warnings.warn('`--gpus` is deprecated because we only support '
+                      'single GPU mode in non-distributed training. '
+                      'Use `gpus=1` now.')
     if args.gpu_ids is not None:
-        cfg.gpu_ids = args.gpu_ids
-    else:
-        cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
+        cfg.gpu_ids = args.gpu_ids[0:1]
+        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
+                      'Because we only support single GPU mode in '
+                      'non-distributed training. Use the first GPU '
+                      'in `gpu_ids` now.')
+    if args.gpus is None and args.gpu_ids is None:
+        cfg.gpu_ids = [args.gpu_id]
 
     # init distributed env first, since logger depends on the dist info.
     if args.launcher == 'none':
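Training adds one more deprecated flag to the mix (`--gpus`), and the hunk above resolves the three options sequentially rather than with a single if/else chain. A hedged sketch that mirrors that order (the function name is invented):

```python
import warnings


def resolve_train_gpu_ids(gpus=None, gpu_ids=None, gpu_id=0):
    """Mirror the sequential resolution in main(): later checks win."""
    resolved = None
    if gpus is not None:
        warnings.warn('`--gpus` is deprecated because only single GPU mode '
                      'is supported in non-distributed training.')
        resolved = range(1)
    if gpu_ids is not None:
        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`; '
                      'only the first id is used.')
        resolved = gpu_ids[0:1]
    if gpus is None and gpu_ids is None:
        resolved = [gpu_id]
    return resolved


print(resolve_train_gpu_ids(gpu_ids=[2, 3]))  # -> [2]
print(resolve_train_gpu_ids())                # -> [0]
```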
@@ -27,6 +27,18 @@ def parse_args():
         nargs='+',
         help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
         ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
+    parser.add_argument(
+        '--gpu-ids',
+        type=int,
+        nargs='+',
+        help='(Deprecated, please use --gpu-id) ids of gpus to use '
+        '(only applicable to non-distributed training)')
+    parser.add_argument(
+        '--gpu-id',
+        type=int,
+        default=0,
+        help='id of gpu to use '
+        '(only applicable to non-distributed testing)')
     parser.add_argument('--show', action='store_true', help='show results')
     parser.add_argument(
         '--show-dir', help='directory where painted images will be saved')
@@ -116,7 +128,14 @@ def main():
     # currently only support single images testing
     samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
     assert samples_per_gpu == 1, 'currently only support single images testing'
-
+    if args.gpu_ids is not None:
+        cfg.gpu_ids = args.gpu_ids[0:1]
+        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
+                      'Because we only support single GPU mode in '
+                      'non-distributed testing. Use the first GPU '
+                      'in `gpu_ids` now.')
+    else:
+        cfg.gpu_ids = [args.gpu_id]
     # init distributed env first, since logger depends on the dist info.
     if args.launcher == 'none':
         distributed = False
@@ -176,7 +195,8 @@ def main():
         shuffle=False)
 
     if not distributed:
-        model = MMDataParallel(model, device_ids=[0])
+        # Please use MMCV >= 1.4.4 for CPU testing!
+        model = MMDataParallel(model, device_ids=cfg.gpu_ids)
         show_kwargs = dict(show_score_thr=args.show_score_thr)
         if cfg.data.get('model_init', None) is not None:
             from mmfewshot.detection.apis import (single_gpu_model_init,
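In non-distributed testing the model wrapper now honours `cfg.gpu_ids` instead of hard-coding device 0, which is also what lets CPU-only runs work with MMCV >= 1.4.4 (per the comment in the hunk above). A minimal hedged sketch of the difference:

```python
from mmcv.parallel import MMDataParallel


def wrap_for_test(model, gpu_ids):
    # Before this commit: MMDataParallel(model, device_ids=[0]) -- always GPU 0.
    # After: the id picked via --gpu-id / --gpu-ids is respected, and with
    # MMCV >= 1.4.4 the wrapper can also run when no GPU is visible.
    return MMDataParallel(model, device_ids=gpu_ids)
```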
@@ -48,8 +48,14 @@ def parse_args():
         '--gpu-ids',
         type=int,
         nargs='+',
-        help='ids of gpus to use '
+        help='(Deprecated, please use --gpu-id) ids of gpus to use '
         '(only applicable to non-distributed training)')
+    parser.add_argument(
+        '--gpu-id',
+        type=int,
+        default=0,
+        help='id of gpu to use '
+        '(only applicable to non-distributed testing)')
     parser.add_argument('--seed', type=int, default=None, help='random seed')
     parser.add_argument(
         '--deterministic',
@@ -119,15 +125,24 @@ def main():
                                 osp.splitext(osp.basename(args.config))[0])
     if args.resume_from is not None:
         cfg.resume_from = args.resume_from
+    if args.gpus is not None:
+        cfg.gpu_ids = range(1)
+        warnings.warn('`--gpus` is deprecated because we only support '
+                      'single GPU mode in non-distributed training. '
+                      'Use `gpus=1` now.')
     if args.gpu_ids is not None:
-        cfg.gpu_ids = args.gpu_ids
-    else:
-        cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
+        cfg.gpu_ids = args.gpu_ids[0:1]
+        warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
+                      'Because we only support single GPU mode in '
+                      'non-distributed training. Use the first GPU '
+                      'in `gpu_ids` now.')
+    if args.gpus is None and args.gpu_ids is None:
+        cfg.gpu_ids = [args.gpu_id]
 
     # init distributed env first, since logger depends on the dist info.
     if args.launcher == 'none':
         distributed = False
-        rank, world_size = get_dist_info()
+        rank = 0
     else:
         distributed = True
         init_dist(args.launcher, **cfg.dist_params)
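The last hunk also stops querying `get_dist_info()` in the non-distributed branch and simply fixes `rank = 0`. A hedged sketch of why the two are equivalent when no launcher is used (this relies on MMCV's documented fallback, stated here as an assumption):

```python
from mmcv.runner import get_dist_info

# Without init_dist(), no process group exists, so get_dist_info()
# falls back to rank 0 and world size 1 -- hence `rank = 0` is enough.
rank, world_size = get_dist_info()
assert (rank, world_size) == (0, 1)
```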