[Fix] Fix iter bug when resuming checkpoint in distributed train (#866)

* [Fix] Fix iter bug when resuming checkpoint in distributed train

* fix lint error

Signed-off-by: FreyWang <wangwxyz@qq.com>
FreyWang, 2021-09-11 14:33:01 +08:00, committed by GitHub
parent 598f5c7fa8
commit f3a49b56a4

@@ -7,7 +7,7 @@ import time

 import mmcv
 import torch
-from mmcv.runner import init_dist
+from mmcv.runner import get_dist_info, init_dist
 from mmcv.utils import Config, DictAction, get_git_hash

 from mmseg import __version__
@@ -94,6 +94,9 @@ def main():
     else:
         distributed = True
         init_dist(args.launcher, **cfg.dist_params)
+        # gpu_ids is used to calculate iter when resuming checkpoint
+        _, world_size = get_dist_info()
+        cfg.gpu_ids = range(world_size)

     # create work_dir
     mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
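
For context on why setting cfg.gpu_ids fixes the iter bug: when resuming, mmcv's BaseRunner.resume reads gpu_ids from the config stored in the checkpoint meta and rescales the per-process iteration counter if the number of GPUs has changed. Before this fix, a distributed launch never updated cfg.gpu_ids, so it kept its single-GPU default and the rescaling wrongly divided the resumed iter by the world size. Below is a minimal sketch of that rescaling, assuming this mmcv behavior; rescale_resumed_iter is a hypothetical helper written for illustration, not mmcv's API:

def rescale_resumed_iter(saved_iter, previous_gpu_ids, world_size):
    # Each process runs total_samples / (samples_per_gpu * num_gpus)
    # iterations per epoch, so the per-process counter scales inversely
    # with the number of GPUs used when the checkpoint was written.
    if previous_gpu_ids and len(previous_gpu_ids) != world_size:
        return int(saved_iter * len(previous_gpu_ids) / world_size)
    return saved_iter

# Bug: an 8-GPU distributed job left cfg.gpu_ids at its default length of 1,
# so the checkpoint recorded one GPU and resuming shrank the counter 8x:
assert rescale_resumed_iter(1000, range(1), 8) == 125
# Fix: cfg.gpu_ids = range(world_size) records all 8 GPUs, so resuming on
# the same 8 GPUs leaves the counter unchanged:
assert rescale_resumed_iter(1000, range(8), 8) == 1000

With cfg.gpu_ids derived from get_dist_info(), the gpu_ids recorded in the checkpoint matches the actual world size, so the recomputed iter is correct whether the job resumes on the same or a different number of GPUs.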