From f3a49b56a4db48b195a6571173a498d866a9a40b Mon Sep 17 00:00:00 2001
From: FreyWang
Date: Sat, 11 Sep 2021 14:33:01 +0800
Subject: [PATCH] [Fix] Fix iter bug when resuming checkpoint in distributed train (#866)

* [Fix] Fix iter bug when resuming checkpoint in distributed train

* fix lint error

Signed-off-by: FreyWang
---
 tools/train.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/train.py b/tools/train.py
index 490b3ff5f..f81470537 100644
--- a/tools/train.py
+++ b/tools/train.py
@@ -7,7 +7,7 @@ import time

 import mmcv
 import torch
-from mmcv.runner import init_dist
+from mmcv.runner import get_dist_info, init_dist
 from mmcv.utils import Config, DictAction, get_git_hash

 from mmseg import __version__
@@ -94,6 +94,9 @@ def main():
     else:
         distributed = True
         init_dist(args.launcher, **cfg.dist_params)
+        # gpu_ids is used to calculate iter when resuming checkpoint,
+        _, world_size = get_dist_info()
+        cfg.gpu_ids = range(world_size)

     # create work_dir
     mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))