Fix bug in gpu_ids in distributed training (#107)
* update gpu_ids in distributed training
* move linear scaling rule after getting correct gpu_ids
* Remove support for autoscale_lr
parent: b1e91f256b
commit: f355f15485
@@ -7,7 +7,7 @@ import time
 import mmcv
 import torch
 from mmcv import Config, DictAction
-from mmcv.runner import init_dist
+from mmcv.runner import get_dist_info, init_dist

 from mmcls import __version__
 from mmcls.apis import set_random_seed, train_model
@@ -51,10 +51,6 @@ def parse_args():
         default='none',
         help='job launcher')
     parser.add_argument('--local_rank', type=int, default=0)
-    parser.add_argument(
-        '--autoscale-lr',
-        action='store_true',
-        help='automatically scale lr with the number of gpus')
     args = parser.parse_args()
     if 'LOCAL_RANK' not in os.environ:
         os.environ['LOCAL_RANK'] = str(args.local_rank)
@@ -87,16 +83,14 @@ def main():
     else:
         cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)

-    if args.autoscale_lr:
-        # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
-        cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8
-
     # init distributed env first, since logger depends on the dist info.
     if args.launcher == 'none':
         distributed = False
     else:
         distributed = True
         init_dist(args.launcher, **cfg.dist_params)
+        _, world_size = get_dist_info()
+        cfg.gpu_ids = range(world_size)

     # create work_dir
     mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
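Taken together, the change can be summarized in a short standalone sketch of the corrected flow. This is only an illustration built from the snippets in the diff above (the same cfg, args.launcher, args.gpus, and cfg.dist_params objects); the helper name resolve_gpu_ids is hypothetical and not part of the actual script, and re-applying the linear scaling rule manually is shown as an assumption now that --autoscale-lr is gone.

# Minimal sketch, assuming the cfg/args objects from the diff above;
# not the complete training script.
from mmcv.runner import get_dist_info, init_dist


def resolve_gpu_ids(cfg, args):
    if args.launcher == 'none':
        distributed = False
        # Non-distributed run: gpu_ids still comes from --gpus (default 1).
        cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)
        # The fix: derive gpu_ids from the world size reported by the
        # distributed backend instead of trusting the --gpus argument.
        _, world_size = get_dist_info()
        cfg.gpu_ids = range(world_size)
    return distributed


# If the linear scaling rule (https://arxiv.org/abs/1706.02677) is still
# wanted, it would now be applied by hand *after* gpu_ids is correct,
# since --autoscale-lr has been removed (hypothetical usage):
# cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8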