[Fix] Fix iter bug when resuming checkpoint in distributed train (#866)

* [Fix] Fix iter bug when resuming checkpoint in distributed train

* fix lint error

Signed-off-by: FreyWang <wangwxyz@qq.com>
FreyWang, 2021-09-11 14:33:01 +08:00, committed by GitHub
parent 598f5c7fa8
commit f3a49b56a4

@@ -7,7 +7,7 @@ import time

 import mmcv
 import torch
-from mmcv.runner import init_dist
+from mmcv.runner import get_dist_info, init_dist
 from mmcv.utils import Config, DictAction, get_git_hash

 from mmseg import __version__
@@ -94,6 +94,9 @@ def main():
     else:
         distributed = True
         init_dist(args.launcher, **cfg.dist_params)
+        # gpu_ids is used to calculate iter when resuming checkpoint
+        _, world_size = get_dist_info()
+        cfg.gpu_ids = range(world_size)

     # create work_dir
     mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
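
For context on why setting cfg.gpu_ids fixes the iter bug: when resuming, mmcv's BaseRunner.resume reads gpu_ids from the config stored in the checkpoint meta and rescales the per-process iteration counter if the number of GPUs has changed. Before this fix, a distributed launch never updated cfg.gpu_ids, so it kept its single-GPU default and the rescaling wrongly divided the resumed iter by the world size. Below is a minimal sketch of that rescaling, assuming this mmcv behavior; rescale_resumed_iter is a hypothetical helper written for illustration, not mmcv's API:

def rescale_resumed_iter(saved_iter, previous_gpu_ids, world_size):
    # Each process runs total_samples / (samples_per_gpu * num_gpus)
    # iterations per epoch, so the per-process counter scales inversely
    # with the number of GPUs used when the checkpoint was written.
    if previous_gpu_ids and len(previous_gpu_ids) != world_size:
        return int(saved_iter * len(previous_gpu_ids) / world_size)
    return saved_iter

# Bug: an 8-GPU distributed job left cfg.gpu_ids at its default length of 1,
# so the checkpoint recorded one GPU and resuming shrank the counter 8x:
assert rescale_resumed_iter(1000, range(1), 8) == 125
# Fix: cfg.gpu_ids = range(world_size) records all 8 GPUs, so resuming on
# the same 8 GPUs leaves the counter unchanged:
assert rescale_resumed_iter(1000, range(8), 8) == 1000

With cfg.gpu_ids derived from get_dist_info(), the gpu_ids recorded in the checkpoint matches the actual world size, so the recomputed iter is correct whether the job resumes on the same or a different number of GPUs.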