mirror of https://github.com/open-mmlab/mmocr.git
[Feat] Support specify gpu for testing and training with gpu-id instead of gpu-ids and gpus (#756)
* support set gpu number for testing and deprecate gpu_ids and gpus with gpu_id * update gpu-ids gpus and gpu-id docstring * add args.gpus process * fix typo Co-authored-by: Tong Gao <gaotongxiao@gmail.com>pull/758/head
parent
a809a52c78
commit
e0ff11819b
|
@ -27,20 +27,21 @@ CUDA_VISIBLE_DEVICES= python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [AR
|
|||
|
||||
|
||||
|
||||
| ARGS | Type | Description |
|
||||
| -------------- | --------------------- | ----------------------------------------------------------- |
|
||||
| `--out` | str | Output result file in pickle format. |
|
||||
| `--fuse-conv-bn` | bool | Whether to fuse conv and bn layers, which slightly increases the inference speed. |
|
||||
| `--format-only` | bool | Format the output results without performing evaluation. It is useful when you want to format the results to a specific format and submit them to the test server.|
|
||||
| `--eval` | 'hmean-ic13', 'hmean-iou', 'acc' | The evaluation metrics, which depends on the task. For text detection, the metric should be either 'hmean-ic13' or 'hmean-iou'. For text recognition, the metric should be 'acc'. |
|
||||
| `--show` | bool | Whether to show results. |
|
||||
| `--show-dir` | str | Directory where the output images will be saved. |
|
||||
| `--show-score-thr` | float | Score threshold (default: 0.3). |
|
||||
| `--gpu-collect` | bool | Whether to use gpu to collect results. |
|
||||
| `--tmpdir` | str | The tmp directory used for collecting results from multiple workers, available when gpu-collect is not specified. |
|
||||
| `--cfg-options` | str | Override some settings in the used config, the key-value pair in xxx=yyy format will be merged into the config file. If the value to be overwritten is a list, it should be of the form of either key="[a,b]" or key=a,b. The argument also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]". Note that the quotation marks are necessary and that no white space is allowed.|
|
||||
| `--eval-options` | str |Custom options for evaluation, the key-value pair in xxx=yyy format will be kwargs for dataset.evaluate() function.|
|
||||
| `--launcher` | 'none', 'pytorch', 'slurm', 'mpi' | Options for job launcher. |
|
||||
| ARGS | Type | Description |
|
||||
| ------------------ | --------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `--out` | str | Output result file in pickle format. |
|
||||
| `--fuse-conv-bn`   | bool                              | Whether to fuse conv and bn layers, which slightly increases the inference speed.                                                                                                                                                                                                                                                                                                        |
|
||||
| `--format-only` | bool | Format the output results without performing evaluation. It is useful when you want to format the results to a specific format and submit them to the test server. |
|
||||
| `--gpu-id` | int | GPU id to use. Only applicable to non-distributed training. |
|
||||
| `--eval` | 'hmean-ic13', 'hmean-iou', 'acc' | The evaluation metrics, which depends on the task. For text detection, the metric should be either 'hmean-ic13' or 'hmean-iou'. For text recognition, the metric should be 'acc'. |
|
||||
| `--show` | bool | Whether to show results. |
|
||||
| `--show-dir` | str | Directory where the output images will be saved. |
|
||||
| `--show-score-thr` | float | Score threshold (default: 0.3). |
|
||||
| `--gpu-collect` | bool | Whether to use gpu to collect results. |
|
||||
| `--tmpdir` | str | The tmp directory used for collecting results from multiple workers, available when gpu-collect is not specified. |
|
||||
| `--cfg-options` | str | Override some settings in the used config, the key-value pair in xxx=yyy format will be merged into the config file. If the value to be overwritten is a list, it should be of the form of either key="[a,b]" or key=a,b. The argument also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]". Note that the quotation marks are necessary and that no white space is allowed. |
|
||||
| `--eval-options` | str | Custom options for evaluation, the key-value pair in xxx=yyy format will be kwargs for dataset.evaluate() function. |
|
||||
| `--launcher` | 'none', 'pytorch', 'slurm', 'mpi' | Options for job launcher. |
|
||||
|
||||
|
||||
## Testing with Multiple GPUs
|
||||
|
@ -54,10 +55,10 @@ You can use the following command to test a dataset with multiple GPUs.
|
|||
```
|
||||
|
||||
|
||||
| Arguments | Type | Description |
|
||||
| -------------- | --------------------- | ----------------------------------------------------------- |
|
||||
| `PORT` | int | The master port that will be used by the machine with rank 0. Defaults to 29500. |
|
||||
| `PY_ARGS` | str | Arguments to be parsed by `tools/test.py`. |
|
||||
| Arguments | Type | Description |
|
||||
| --------- | ---- | -------------------------------------------------------------------------------- |
|
||||
| `PORT` | int | The master port that will be used by the machine with rank 0. Defaults to 29500. |
|
||||
| `PY_ARGS` | str | Arguments to be parsed by `tools/test.py`. |
|
||||
|
||||
|
||||
For example,
|
||||
|
@ -75,12 +76,12 @@ If you run MMOCR on a cluster managed with [Slurm](https://slurm.schedmd.com/),
|
|||
[GPUS=${GPUS}] [GPUS_PER_NODE=${GPUS_PER_NODE}] [SRUN_ARGS=${SRUN_ARGS}] ./tools/slurm_test.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} ${CHECKPOINT_FILE} [PY_ARGS]
|
||||
```
|
||||
|
||||
| Arguments | Type | Description |
|
||||
| -------------- | --------------------- | ----------------------------------------------------------- |
|
||||
| `GPUS` | int | The number of GPUs to be used by this task. Defaults to 8. |
|
||||
| `GPUS_PER_NODE` | int | The number of GPUs to be allocated per node. Defaults to 8. |
|
||||
| `SRUN_ARGS` | str | Arguments to be parsed by srun. Available options can be found [here](https://slurm.schedmd.com/srun.html). |
|
||||
| `PY_ARGS` | str | Arguments to be parsed by `tools/test.py`. |
|
||||
| Arguments | Type | Description |
|
||||
| --------------- | ---- | ----------------------------------------------------------------------------------------------------------- |
|
||||
| `GPUS` | int | The number of GPUs to be used by this task. Defaults to 8. |
|
||||
| `GPUS_PER_NODE` | int | The number of GPUs to be allocated per node. Defaults to 8. |
|
||||
| `SRUN_ARGS` | str | Arguments to be parsed by srun. Available options can be found [here](https://slurm.schedmd.com/srun.html). |
|
||||
| `PY_ARGS` | str | Arguments to be parsed by `tools/test.py`. |
|
||||
|
||||
|
||||
Here is an example of using 8 GPUs to test an example model on the 'dev' partition with job name 'test_job'.
|
||||
|
|
|
@ -19,22 +19,21 @@ CUDA_VISIBLE_DEVICES= python tools/train.py ${CONFIG_FILE} [ARGS]
|
|||
|
||||
:::
|
||||
|
||||
|
||||
|
||||
| ARGS | Type | Description |
|
||||
| -------------- | --------------------- | ----------------------------------------------------------- |
|
||||
| `--work-dir` | str | The target folder to save logs and checkpoints. Defaults to `./work_dirs`. |
|
||||
| `--load-from` | str | The checkpoint file to load from. |
|
||||
| `--resume-from` | bool | The checkpoint file to resume the training from.|
|
||||
| `--no-validate` | bool | Disable checkpoint evaluation during training. Defaults to `False`. |
|
||||
| `--gpus` | int | Numbers of gpus to use. Only applicable to non-distributed GPU training. |
|
||||
| `--gpu-ids` | int*N | A list of GPU ids to use. Only applicable to non-distributed GPU training. |
|
||||
| `--seed` | int | Random seed. |
|
||||
| `--deterministic` | bool | Whether to set deterministic options for CUDNN backend. |
|
||||
| `--cfg-options` | str | Override some settings in the used config, the key-value pair in xxx=yyy format will be merged into the config file. If the value to be overwritten is a list, it should be of the form of either key="[a,b]" or key=a,b. The argument also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]". Note that the quotation marks are necessary and that no white space is allowed.|
|
||||
| `--launcher` | 'none', 'pytorch', 'slurm', 'mpi' | Options for job launcher. |
|
||||
| `--local_rank` | int |Used for distributed training.|
|
||||
| `--mc-config` | str |Memory cache config for image loading speed-up during training.|
|
||||
| ARGS | Type | Description |
|
||||
| ----------------- | --------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `--work-dir` | str | The target folder to save logs and checkpoints. Defaults to `./work_dirs`. |
|
||||
| `--load-from` | str | The checkpoint file to load from. |
|
||||
| `--resume-from` | bool | The checkpoint file to resume the training from. |
|
||||
| `--no-validate` | bool | Disable checkpoint evaluation during training. Defaults to `False`. |
|
||||
| `--gpus` | int | **Deprecated, please use --gpu-id.** Numbers of gpus to use. Only applicable to non-distributed training. |
|
||||
| `--gpu-ids` | int*N | **Deprecated, please use --gpu-id.** A list of GPU ids to use. Only applicable to non-distributed training. |
|
||||
| `--gpu-id` | int | The GPU id to use. Only applicable to non-distributed training. |
|
||||
| `--seed` | int | Random seed. |
|
||||
| `--deterministic` | bool | Whether to set deterministic options for CUDNN backend. |
|
||||
| `--cfg-options` | str | Override some settings in the used config, the key-value pair in xxx=yyy format will be merged into the config file. If the value to be overwritten is a list, it should be of the form of either key="[a,b]" or key=a,b. The argument also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]". Note that the quotation marks are necessary and that no white space is allowed. |
|
||||
| `--launcher` | 'none', 'pytorch', 'slurm', 'mpi' | Options for job launcher. |
|
||||
| `--local_rank` | int | Used for distributed training. |
|
||||
| `--mc-config` | str | Memory cache config for image loading speed-up during training. |
|
||||
|
||||
## Training on Multiple Machines
|
||||
|
||||
|
@ -44,10 +43,10 @@ MMOCR implements **distributed** training with `MMDistributedDataParallel`. (Ple
|
|||
[PORT={PORT}] ./tools/dist_train.sh ${CONFIG_FILE} ${WORK_DIR} ${GPU_NUM} [PY_ARGS]
|
||||
```
|
||||
|
||||
| Arguments | Type | Description |
|
||||
| -------------- | --------------------- | ----------------------------------------------------------- |
|
||||
| `PORT` | int | The master port that will be used by the machine with rank 0. Defaults to 29500. **Note:** If you are launching multiple distributed training jobs on a single machine, you need to specify different ports for each job to avoid port conflicts.|
|
||||
| `PY_ARGS` | str | Arguments to be parsed by `tools/train.py`. |
|
||||
| Arguments | Type | Description |
|
||||
| --------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `PORT`    | int  | The master port that will be used by the machine with rank 0. Defaults to 29500. **Note:** If you are launching multiple distributed training jobs on a single machine, you need to specify different ports for each job to avoid port conflicts. |
|
||||
| `PY_ARGS` | str | Arguments to be parsed by `tools/train.py`. |
|
||||
|
||||
|
||||
|
||||
|
@ -59,13 +58,13 @@ If you run MMOCR on a cluster managed with [Slurm](https://slurm.schedmd.com/),
|
|||
[GPUS=${GPUS}] [GPUS_PER_NODE=${GPUS_PER_NODE}] [CPUS_PER_TASK=${CPUS_PER_TASK}] [SRUN_ARGS=${SRUN_ARGS}] ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} ${WORK_DIR} [PY_ARGS]
|
||||
```
|
||||
|
||||
| Arguments | Type | Description |
|
||||
| -------------- | --------------------- | ----------------------------------------------------------- |
|
||||
| `GPUS` | int | The number of GPUs to be used by this task. Defaults to 8. |
|
||||
| `GPUS_PER_NODE` | int | The number of GPUs to be allocated per node. Defaults to 8. |
|
||||
| `CPUS_PER_TASK` | int | The number of CPUs to be allocated per task. Defaults to 5. |
|
||||
| `SRUN_ARGS` | str | Arguments to be parsed by srun. Available options can be found [here](https://slurm.schedmd.com/srun.html). |
|
||||
| `PY_ARGS` | str | Arguments to be parsed by `tools/train.py`. |
|
||||
| Arguments | Type | Description |
|
||||
| --------------- | ---- | ----------------------------------------------------------------------------------------------------------- |
|
||||
| `GPUS` | int | The number of GPUs to be used by this task. Defaults to 8. |
|
||||
| `GPUS_PER_NODE` | int | The number of GPUs to be allocated per node. Defaults to 8. |
|
||||
| `CPUS_PER_TASK` | int | The number of CPUs to be allocated per task. Defaults to 5. |
|
||||
| `SRUN_ARGS` | str | Arguments to be parsed by srun. Available options can be found [here](https://slurm.schedmd.com/srun.html). |
|
||||
| `PY_ARGS` | str | Arguments to be parsed by `tools/train.py`. |
|
||||
|
||||
Here is an example of using 8 GPUs to train a text detection model on the dev partition.
|
||||
|
||||
|
|
|
@ -32,6 +32,12 @@ def parse_args():
|
|||
action='store_true',
|
||||
help='Whether to fuse conv and bn, this will slightly increase'
|
||||
'the inference speed.')
|
||||
parser.add_argument(
|
||||
'--gpu-id',
|
||||
type=int,
|
||||
default=0,
|
||||
help='id of gpu to use '
|
||||
'(only applicable to non-distributed testing)')
|
||||
parser.add_argument(
|
||||
'--format-only',
|
||||
action='store_true',
|
||||
|
@ -152,6 +158,7 @@ def main():
|
|||
|
||||
# init distributed env first, since logger depends on the dist info.
|
||||
if args.launcher == 'none':
|
||||
cfg.gpu_ids = [args.gpu_id]
|
||||
distributed = False
|
||||
else:
|
||||
distributed = True
|
||||
|
@ -195,7 +202,7 @@ def main():
|
|||
model = fuse_conv_bn(model)
|
||||
|
||||
if not distributed:
|
||||
model = MMDataParallel(model, device_ids=[0])
|
||||
model = MMDataParallel(model, device_ids=cfg.gpu_ids)
|
||||
is_kie = cfg.model.type in ['SDMGR']
|
||||
outputs = single_gpu_test(model, data_loader, args.show, args.show_dir,
|
||||
is_kie, args.show_score_thr)
|
||||
|
|
|
@ -37,14 +37,20 @@ def parse_args():
|
|||
group_gpus.add_argument(
|
||||
'--gpus',
|
||||
type=int,
|
||||
help='Number of gpus to use '
|
||||
help='(Deprecated, please use --gpu-id) number of gpus to use '
|
||||
'(only applicable to non-distributed training).')
|
||||
group_gpus.add_argument(
|
||||
'--gpu-ids',
|
||||
type=int,
|
||||
nargs='+',
|
||||
help='ids of gpus to use '
|
||||
'(only applicable to non-distributed training).')
|
||||
help='(Deprecated, please use --gpu-id) ids of gpus to use '
|
||||
'(only applicable to non-distributed training)')
|
||||
group_gpus.add_argument(
|
||||
'--gpu-id',
|
||||
type=int,
|
||||
default=0,
|
||||
help='id of gpu to use '
|
||||
'(only applicable to non-distributed training)')
|
||||
parser.add_argument('--seed', type=int, default=None, help='Random seed.')
|
||||
parser.add_argument(
|
||||
'--deterministic',
|
||||
|
@ -114,10 +120,19 @@ def main():
|
|||
cfg.load_from = args.load_from
|
||||
if args.resume_from is not None:
|
||||
cfg.resume_from = args.resume_from
|
||||
if args.gpus is not None:
|
||||
cfg.gpu_ids = range(1)
|
||||
warnings.warn('`--gpus` is deprecated because we only support '
|
||||
'single GPU mode in non-distributed training. '
|
||||
'Use `gpus=1` now.')
|
||||
if args.gpu_ids is not None:
|
||||
cfg.gpu_ids = args.gpu_ids
|
||||
else:
|
||||
cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
|
||||
cfg.gpu_ids = args.gpu_ids[0:1]
|
||||
warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
|
||||
'Because we only support single GPU mode in '
|
||||
'non-distributed training. Use the first GPU '
|
||||
'in `gpu_ids` now.')
|
||||
if args.gpus is None and args.gpu_ids is None:
|
||||
cfg.gpu_ids = [args.gpu_id]
|
||||
|
||||
# init distributed env first, since logger depends on the dist info.
|
||||
if args.launcher == 'none':
|
||||
|
|
Loading…
Reference in New Issue