mirror of https://github.com/open-mmlab/mmengine.git
[Fix] Fix some code and docstring in LogProcessor and LoggerHook (#213)
* fix logger hook
* fix to record multi-level logs, log train/a/b to a/b
* fix loggerhook json path
* fix unit test
* change runner.train_dataloader to runner.train_loop.dataloader
* clean debug code
* refine comments and docstring
* fix unit test
This commit is contained in: parent 5c5c03e648 · commit 859f4d1580
@ -20,7 +20,7 @@ class LoggerHook(Hook):
  ``LoggerHook`` is used to record logs formatted by ``LogProcessor`` during
  training/validation/testing phase. It is used to control following
- behaviers:
+ behaviors:

  - The frequency of logs update in terminal, local, tensorboad wandb.etc.
  - The frequency of show experiment information in terminal.
@ -41,10 +41,10 @@ class LoggerHook(Hook):
          of ``out_dir`` and the last level directory of
          ``runner.work_dir``. For example, if the input ``our_dir`` is
          ``./tmp`` and ``runner.work_dir`` is ``./work_dir/cur_exp``,
-         then the log will be saved in ``./tmp/cur_exp``. Deafule to None.
-     out_suffix (Tuple[str] or str): Those filenames ending with
-         ``out_suffix`` will be copied to ``out_dir``. Defaults to
-         ('.log.json', '.log', '.py').
+         then the log will be saved in ``./tmp/cur_exp``. Defaults to None.
+     out_suffix (Tuple[str] or str): Those files in ``runner._log_dir``
+         ending with ``out_suffix`` will be copied to ``out_dir``. Defaults
+         to ('json', '.log', '.py').
      keep_local (bool): Whether to keep local logs in the local machine
          when :attr:`out_dir` is specified. If False, the local log will be
          removed. Defaults to True.
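The ``out_dir`` composition described in this docstring can be sketched as follows (illustrative only; the hook's actual implementation lives in ``before_run`` and is not part of this hunk):

>>> import os.path as osp
>>> out_dir, work_dir = './tmp', './work_dir/cur_exp'
>>> # join out_dir with the last-level directory of work_dir
>>> osp.join(out_dir, osp.basename(osp.normpath(work_dir)))
'./tmp/cur_exp'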
@ -53,7 +53,7 @@ class LoggerHook(Hook):
          Defaults to None.

  Examples:
-     >>> # A simplest LoggerHook config.
+     >>> # The simplest LoggerHook config.
      >>> logger_hook_cfg = dict(interval=20)
  """
  priority = 'BELOW_NORMAL'
@ -64,7 +64,8 @@ class LoggerHook(Hook):
      ignore_last: bool = True,
      interval_exp_name: int = 1000,
      out_dir: Optional[Union[str, Path]] = None,
-     out_suffix: Union[Sequence[str], str] = ('.log.json', '.log', '.py'),
+     out_suffix: Union[Sequence[str],
+                       str] = ('.json', '.log', '.py', 'yaml'),
      keep_local: bool = True,
      file_client_args: Optional[dict] = None,
  ):
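Note the new default keeps ``'yaml'`` with no leading dot, while the docstring above says ``('json', '.log', '.py')``. Assuming suffix filtering is plain ``str.endswith`` matching (which is how ``scandir``-style filters typically work), a dotless suffix still matches:

>>> 'config.yaml'.endswith(('.json', '.log', '.py', 'yaml'))
True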
@ -86,6 +87,7 @@ class LoggerHook(Hook):

      self.keep_local = keep_local
      self.file_client_args = file_client_args
+     self.json_log_path: Optional[str] = None
      if self.out_dir is not None:
          self.file_client = FileClient.infer_client(file_client_args,
                                                     self.out_dir)
@ -106,36 +108,32 @@ class LoggerHook(Hook):
          (f'Text logs will be saved to {self.out_dir} by '
           f'{self.file_client.name} after the training process.'))

-     self.json_log_path = osp.join(runner.work_dir,
-                                   f'{runner.timestamp}.log.json')
-     self.yaml_log_path = osp.join(runner.work_dir,
-                                   f'{runner.timestamp}.log.json')
+     self.json_log_path = f'{runner.timestamp}.json'
  def after_train_iter(self,
                       runner,
                       batch_idx: int,
                       data_batch: DATA_BATCH = None,
                       outputs: Optional[dict] = None) -> None:
-     """Record training logs after training iteration.
+     """Record logs after training iteration.

      Args:
          runner (Runner): The runner of the training process.
          batch_idx (int): The index of the current batch in the train loop.
          data_batch (Sequence[dict], optional): Data from dataloader.
              Defaults to None.
-         outputs (dict, optional): Outputs from model.
-             Defaults to None.
+         outputs (dict, optional): Outputs from model. Defaults to None.
      """
      # Print experiment name every n iterations.
      if self.every_n_iters(runner,
                            self.interval_exp_name) or (self.end_of_epoch(
-                               runner.train_dataloader, batch_idx)):
+                               runner.train_loop.dataloader, batch_idx)):
          exp_info = f'Exp name: {runner.experiment_name}'
          runner.logger.info(exp_info)
      if self.every_n_inner_iters(batch_idx, self.interval):
          tag, log_str = runner.log_processor.get_log_after_iter(
              runner, batch_idx, 'train')
-     elif (self.end_of_epoch(runner.train_dataloader, batch_idx)
+     elif (self.end_of_epoch(runner.train_loop.dataloader, batch_idx)
            and not self.ignore_last):
          # `runner.max_iters` may not be divisible by `self.interval`. if
          # `self.ignore_last==True`, the log of remaining iterations will
@ -146,8 +144,8 @@ class LoggerHook(Hook):
      else:
          return
      runner.logger.info(log_str)
-     # TODO compatible with visualizer.
-     runner.visualizer.add_scalars(tag, step=runner.iter + 1)
+     runner.visualizer.add_scalars(
+         tag, step=runner.iter + 1, file_path=self.json_log_path)

  def after_val_iter(
      self,
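This is the "loggerhook json path" fix from the commit message: ``json_log_path`` is now the bare file name built in ``before_run`` and handed to the visualizer via ``file_path``, rather than an absolute ``*.log.json`` path under ``runner.work_dir`` (the removed code even assigned the same ``.log.json`` name to ``yaml_log_path``). A rough sketch of the resulting values, with an assumed timestamp; how the backend resolves the relative name is up to the visualizer:

>>> timestamp = '20220429'          # assumed value of runner.timestamp
>>> json_log_path = f'{timestamp}.json'
>>> json_log_path
'20220429.json'
>>> # runner.visualizer.add_scalars(tag, step=iter + 1, file_path=json_log_path)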
@ -155,17 +153,18 @@ class LoggerHook(Hook):
          batch_idx: int,
          data_batch: DATA_BATCH = None,
          outputs: Optional[Sequence[BaseDataElement]] = None) -> None:
-     """Record validation logs after validation iteration.
+     """Record logs after validation iteration.

      Args:
-         runner (Runner): The runner of the training process.
-         batch_idx (int): The index of the current batch in the train loop.
-         data_batch (Sequence[Tuple[Any, BaseDataElement]], optional):
-             Data from dataloader. Defaults to None.
+         runner (Runner): The runner of the validation process.
+         batch_idx (int): The index of the current batch in the validation
+             loop.
+         data_batch (Sequence[dict], optional): Data from dataloader.
+             Defaults to None.
          outputs (sequence, optional): Outputs from model. Defaults to None.
      """
      if self.every_n_inner_iters(batch_idx, self.interval):
-         tag, log_str = runner.log_processor.get_log_after_iter(
+         _, log_str = runner.log_processor.get_log_after_iter(
              runner, batch_idx, 'val')
          runner.logger.info(log_str)
@ -175,39 +174,39 @@ class LoggerHook(Hook):
          batch_idx: int,
          data_batch: DATA_BATCH = None,
          outputs: Optional[Sequence[BaseDataElement]] = None) -> None:
-     """Record testing logs after iteration.
+     """Record logs after testing iteration.

      Args:
-         runner (Runner): The runner of the training process.
-         batch_idx (int): The index of the current batch in the train loop.
-         data_batch (Sequence[Tuple[Any, BaseDataElement]], optional):
-             Data from dataloader. Defaults to None.
+         runner (Runner): The runner of the testing process.
+         batch_idx (int): The index of the current batch in the test loop.
+         data_batch (Sequence[dict], optional): Data from dataloader.
+             Defaults to None.
          outputs (sequence, optional): Outputs from model. Defaults to None.
      """
      if self.every_n_inner_iters(batch_idx, self.interval):
-         tag, log_str = runner.log_processor.get_log_after_iter(
+         _, log_str = runner.log_processor.get_log_after_iter(
              runner, batch_idx, 'test')
          runner.logger.info(log_str)
  def after_val_epoch(self, runner) -> None:
-     """Record validation logs after validation epoch.
+     """Record logs after validation epoch.

      Args:
-         runner (Runner): The runner of the training process.
+         runner (Runner): The runner of the validation process.
      """
      tag, log_str = runner.log_processor.get_log_after_epoch(
          runner, len(runner.val_dataloader), 'val')
      runner.logger.info(log_str)
-     # TODO compatible with visualizer.
-     runner.visualizer.add_scalars(tag, step=runner.iter + 1)
+     runner.visualizer.add_scalars(
+         tag, step=runner.iter, file_path=self.json_log_path)
  def after_test_epoch(self, runner) -> None:
-     """Record testing logs after test epoch.
+     """Record logs after testing epoch.

      Args:
-         runner (Runner): The runner of the training process.
+         runner (Runner): The runner of the testing process.
      """
-     tag, log_str = runner.log_processor.get_log_after_epoch(
+     _, log_str = runner.log_processor.get_log_after_epoch(
          runner, len(runner.val_dataloader), 'test')
      runner.logger.info(log_str)
@ -215,13 +214,14 @@ class LoggerHook(Hook):
      """Copy logs to ``self.out_dir`` if ``self.out_dir is not None``

      Args:
-         runner (Runner): The runner of the training process.
+         runner (Runner): The runner of the training/testing/validation
+             process.
      """
      # copy or upload logs to self.out_dir
      if self.out_dir is None:
          return
-     for filename in scandir(runner.work_dir, self.out_suffix, True):
-         local_filepath = osp.join(runner.work_dir, filename)
+     for filename in scandir(runner._log_dir, self.out_suffix, True):
+         local_filepath = osp.join(runner._log_dir, filename)
          out_filepath = self.file_client.join_path(self.out_dir, filename)
          with open(local_filepath, 'r') as f:
              self.file_client.put_text(f.read(), out_filepath)
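``after_run`` now scans ``runner._log_dir`` (the timestamped subdirectory of ``work_dir``) rather than ``work_dir`` itself. A self-contained sketch of the same filter-and-copy pattern, using plain local I/O instead of ``FileClient`` (``copy_logs`` is a hypothetical helper, not mmengine API):

>>> import os, os.path as osp, shutil
>>> def copy_logs(log_dir, out_dir, out_suffix=('.json', '.log', '.py')):
...     # non-recursive sketch; the real hook scans recursively via scandir
...     for fname in os.listdir(log_dir):
...         if fname.endswith(tuple(out_suffix)):
...             shutil.copy(osp.join(log_dir, fname), osp.join(out_dir, fname))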
@ -181,7 +181,7 @@ class LogProcessor:
      """Format log string after validation or testing epoch.

      Args:
-         runner (Runner): The runner of training phase.
+         runner (Runner): The runner of validation/testing phase.
          batch_idx (int): The index of the current batch in the current
              loop.
          mode (str): Current mode of runner.
@ -200,10 +200,12 @@ class LogProcessor:
      custom_cfg_copy = self._parse_windows_size(runner, batch_idx)
      # tag is used to write log information to different backends.
      tag = self._collect_scalars(custom_cfg_copy, runner, mode)
-     # validation log string needs cur epoch/iteration and max
-     # epochs/iterations. test log string only needs length of test
-     # dataloader.
-     cur_iter = self._get_iter(runner, batch_idx)
+     # By epoch:
+     #     Epoch(val) [10][1000/1000] ...
+     #     Epoch(test) [1000/1000] ...
+     # By iteration:
+     #     Iteration(val) [1000/1000] ...
+     #     Iteration(test) [1000/1000] ...
      if self.by_epoch:
          if mode == 'val':
              cur_epoch = self._get_epoch(runner, mode)
@ -214,12 +216,9 @@ class LogProcessor:
              f'Epoch({mode}) [{dataloader_len}/{dataloader_len}] ')

      else:
-         if mode == 'train':
-             log_str = (f'Iter({mode}) [{cur_iter}/'
-                        f'{runner.train_loop.max_iters}] ')
-         else:
-             log_str = (
-                 f'Iter({mode}) [{dataloader_len}/{dataloader_len}] ')
+         log_str = (f'Iter({mode}) [{dataloader_len}/{dataloader_len}] ')
      # `time` and `data_time` will not be recorded in after epoch log
      # message.
      log_items = []
      for name, val in tag.items():
          if name in ('time', 'data_time'):
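For concreteness, the epoch- and iteration-style prefixes enumerated in the comments above evaluate as follows (assumed values):

>>> cur_epoch, n = 10, 1000
>>> f'Epoch(val) [{cur_epoch}][{n}/{n}] '
'Epoch(val) [10][1000/1000] '
>>> f'Iter(val) [{n}/{n}] '
'Iter(val) [1000/1000] '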
@ -237,9 +236,9 @@ class LogProcessor:
      Args:
          custom_cfg (List[dict]): A copy of ``self.custom_cfg`` with int
              ``window_size``.
-         runner (Runner): The runner of the training process.
-         mode (str): 'train' or 'val', which means the prefix attached by
-             runner.
+         runner (Runner): The runner of the training/testing/validation
+             process.
+         mode (str): Current mode of runner.

      Returns:
          dict: Statistical values of logs.
@ -253,7 +252,7 @@ class LogProcessor:
      # according to mode.
      for prefix_key, log_buffer in history_scalars.items():
          if prefix_key.startswith(mode):
-             key = prefix_key.split('/')[-1]
+             key = prefix_key.partition('/')[-1]
              mode_history_scalars[key] = log_buffer
      for key in mode_history_scalars:
          # Update the latest learning rate and smoothed time logs.
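This one-line change is the multi-level log fix from the commit message ("log train/a/b to a/b"): ``split('/')[-1]`` keeps only the last path segment, while ``partition('/')[-1]`` strips just the mode prefix and preserves the rest of the key:

>>> 'train/a/b'.split('/')[-1]
'b'
>>> 'train/a/b'.partition('/')[-1]
'a/b'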
@ -287,25 +286,23 @@ class LogProcessor:
          ' is False.'

      def _check_repeated_log_name():
-         check_dict = dict()
+         # The `log_name` of the same data_src should not be repeated.
+         # If `log_name` is not specified, `data_src` will be overwritten.
+         # But only allowed to be overwritten once.
+         check_set = set()
          for log_cfg in self.custom_cfg:
              assert 'data_src' in log_cfg
              data_src = log_cfg['data_src']
              log_name = log_cfg.get('log_name', data_src)
-             check_dict.setdefault(data_src,
-                                   dict(log_names=set(), log_counts=0))
-             check_dict[data_src]['log_names'].add(log_name)
-             check_dict[data_src]['log_counts'] += 1
-             assert (len(
-                 check_dict[data_src]
-                 ['log_names']) == check_dict[data_src]['log_counts']), (
-                     f'If you want to statistic {data_src} with multiple '
-                     'statistics method, please check `log_name` is unique'
-                     f'and {data_src} will not be overwritten twice. See '
-                     f'more information in the docstring of `LogProcessor`')
+             assert log_name not in check_set, (
+                 f'Found duplicate {log_name} for {data_src}. Please check'
+                 'your `custom_cfg` for `log_processor`. You should '
+                 f'neither define duplicate `{log_name}` for {data_src} '
+                 f'nor do not define any {log_name} for multiple '
+                 f'{data_src}, See more information in the docstring of '
+                 'LogProcessor')
+             check_set.add(log_name)

      _check_repeated_log_name()
      _check_window_size()
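To illustrate what the simplified set-based check accepts and rejects, here are two hypothetical ``custom_cfg`` lists (field names follow ``LogProcessor``'s ``data_src``/``log_name`` convention; the values are not from this commit):

>>> # Accepted: one data_src smoothed two ways, each with a unique log_name.
>>> ok_cfg = [
...     dict(data_src='loss', log_name='loss_mean', window_size=10),
...     dict(data_src='loss', log_name='loss_max', window_size=10),
... ]
>>> # Rejected: both entries omit log_name, so both fall back to 'loss'
>>> # and the second would overwrite the first; the assert fires.
>>> bad_cfg = [
...     dict(data_src='loss', window_size=10),
...     dict(data_src='loss', window_size=20),
... ]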
@ -314,7 +311,8 @@ class LogProcessor:
      """Parse window_size defined in custom_cfg to int value.

      Args:
-         runner (Runner): The runner of the training process.
+         runner (Runner): The runner of the training/testing/validation
+             process.
          batch_idx (int): The iteration index of current dataloader.
      """
      custom_cfg_copy = copy.deepcopy(self.custom_cfg)
@ -337,7 +335,8 @@ class LogProcessor:
      for a given device.

      Args:
-         runner (Runner): The runner of the training process.
+         runner (Runner): The runner of the training/testing/validation
+             process.

      Returns:
          The maximum GPU memory occupied by tensors in megabytes for a given
@ -352,11 +351,12 @@ class LogProcessor:
          return int(mem_mb.item())

      def _get_iter(self, runner, batch_idx: int = None) -> int:
-         """Get current training iteration step.
+         """Get current iteration index.

          Args:
-             runner (Runner): The runner of the training process.
-             batch_idx (int, optional): The interaction index of current
+             runner (Runner): The runner of the training/testing/validation
+                 process.
+             batch_idx (int, optional): The iteration index of current
                  dataloader. Defaults to None.

          Returns:
@ -372,8 +372,9 @@ class LogProcessor:
      """Get current epoch according to mode.

      Args:
-         runner (Runner): The runner of the training/validation process.
-         mode (str): Current mode of runner, "train" or "val".
+         runner (Runner): The runner of the training/testing/validation
+             process.
+         mode (str): Current mode of runner.

      Returns:
          int: The current epoch.
@ -395,7 +396,7 @@ class LogProcessor:
      Args:
          runner (Runner): The runner of the training/validation/testing
              process.
-         mode (str): Current mode of runner, "train", "val" or test.
+         mode (str): Current mode of runner.

      Returns:
          BaseLoop: Current loop of runner.
@ -64,7 +64,9 @@ class Runner:
      Args:
          model (:obj:`torch.nn.Module` or dict): The model to be run. It can be
              a dict used for build a model.
-         work_dir (str): The working directory to save checkpoints and logs.
+         work_dir (str): The working directory to save checkpoints. The logs
+             will be saved in the subdirectory of `work_dir` named
+             :attr:`timestamp`.
          train_dataloader (Dataloader or dict, optional): A dataloader object or
              a dict to build a dataloader. If ``None`` is given, it means
              skipping training steps. Defaults to None.
@ -637,7 +639,7 @@ class Runner:
          MMLogger: A MMLogger object build from ``logger``.
      """
      if log_file is None:
-         log_file = osp.join(self._log_dir, f'{self._experiment_name}.log')
+         log_file = osp.join(self._log_dir, f'{self.timestamp}.log')

      log_cfg = dict(log_level=log_level, log_file=log_file, **kwargs)
      log_cfg.setdefault('name', self._experiment_name)
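Taken together with the ``LoggerHook`` changes, the Runner docstring now implies this layout for text logs (a sketch with an assumed timestamp, not output from the commit):

>>> import os.path as osp
>>> timestamp = '20220429'                     # assumed value of runner.timestamp
>>> log_dir = osp.join('work_dir', timestamp)  # runner._log_dir
>>> osp.join(log_dir, f'{timestamp}.log')      # text log from build_logger
'work_dir/20220429/20220429.log'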
@ -15,7 +15,7 @@ class TestLoggerHook:
      assert logger_hook.interval == 10
      assert logger_hook.ignore_last
      assert logger_hook.interval_exp_name == 1000
-     assert logger_hook.out_suffix == ('.log.json', '.log', '.py')
+     assert logger_hook.out_suffix == ('.json', '.log', '.py', 'yaml')
      assert logger_hook.keep_local
      assert logger_hook.file_client_args is None
      assert isinstance(logger_hook.file_client.client, HardDiskBackend)
@ -29,36 +29,42 @@ class TestLoggerHook:
  def test_before_run(self):
      runner = MagicMock()
      runner.iter = 10
-     runner.timestamp = 'timestamp'
+     runner.timestamp = '20220429'
+     runner._log_dir = f'work_dir/{runner.timestamp}'
      runner.work_dir = 'work_dir'
      runner.logger = MagicMock()
      logger_hook = LoggerHook(out_dir='out_dir')
      logger_hook.before_run(runner)
      assert logger_hook.out_dir == osp.join('out_dir', 'work_dir')
-     assert logger_hook.json_log_path == osp.join('work_dir',
-                                                  'timestamp.log.json')
+     assert logger_hook.json_log_path == f'{runner.timestamp}.json'

  def test_after_run(self, tmp_path):
-     # Test
+     timestamp = '20220429'
      out_dir = tmp_path / 'out_dir'
      out_dir.mkdir()
      work_dir = tmp_path / 'work_dir'
      work_dir.mkdir()
-     work_dir_json = work_dir / 'tmp.log.json'
+     log_dir = work_dir / timestamp
+     log_dir.mkdir()
+     log_dir_json = log_dir / 'tmp.log.json'
      runner = MagicMock()
-     runner.work_dir = work_dir
+     runner._log_dir = str(log_dir)
+     runner.timestamp = timestamp
+     runner.work_dir = str(work_dir)
      # Test without out_dir.
      logger_hook = LoggerHook()
      logger_hook.after_run(runner)
      # Test with out_dir and make sure json file has been moved to out_dir.
-     json_f = open(work_dir_json, 'w')
+     json_f = open(log_dir_json, 'w')
      json_f.close()
-     logger_hook = LoggerHook(out_dir=str(tmp_path), keep_local=False)
-     logger_hook.out_dir = str(out_dir)
+     logger_hook = LoggerHook(out_dir=str(out_dir), keep_local=False)
      logger_hook.before_run(runner)
      logger_hook.after_run(runner)
      # Verify that the file has been moved to `out_dir`.
-     assert not osp.exists(str(work_dir_json))
-     assert osp.exists(str(out_dir / 'tmp.log.json'))
+     assert not osp.exists(str(log_dir_json))
+     assert osp.exists(str(out_dir / 'work_dir' / 'tmp.log.json'))
  def test_after_train_iter(self):
      # Test LoggerHook by iter.
@ -89,7 +95,7 @@ class TestLoggerHook:
      runner.log_processor.get_log_after_iter = MagicMock(
          return_value=(dict(), 'log_str'))
      logger_hook = LoggerHook(ignore_last=False)
-     runner.train_dataloader = [0] * 5
+     runner.train_loop.dataloader = [0] * 5
      logger_hook.after_train_iter(runner, batch_idx=4)
      runner.log_processor.get_log_after_iter.assert_called()