[Fix] Fix some code and docstring in LogProcessor and LoggerHook (#213)

* fix logger hook

* fix to record multi-level logs, log train/a/b to a/b

* fix loggerhook json path

* fix unit test

* change runner.train_dataloader to runner.train_loop.dataloader

* clean debug code

* refine comments and docstring

* fix unit test
This commit is contained in:
Mashiro 2022-05-06 14:33:33 +08:00 committed by GitHub
parent 5c5c03e648
commit 859f4d1580
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 97 additions and 88 deletions

View File

@ -20,7 +20,7 @@ class LoggerHook(Hook):
``LoggerHook`` is used to record logs formatted by ``LogProcessor`` during
training/validation/testing phase. It is used to control following
behaviers:
behaviors:
- The frequency of logs update in terminal, local, tensorboad wandb.etc.
- The frequency of show experiment information in terminal.
@ -41,10 +41,10 @@ class LoggerHook(Hook):
of ``out_dir`` and the last level directory of
``runner.work_dir``. For example, if the input ``our_dir`` is
``./tmp`` and ``runner.work_dir`` is ``./work_dir/cur_exp``,
then the log will be saved in ``./tmp/cur_exp``. Deafule to None.
out_suffix (Tuple[str] or str): Those filenames ending with
``out_suffix`` will be copied to ``out_dir``. Defaults to
('.log.json', '.log', '.py').
then the log will be saved in ``./tmp/cur_exp``. Defaults to None.
out_suffix (Tuple[str] or str): Those files in ``runner._log_dir``
ending with ``out_suffix`` will be copied to ``out_dir``. Defaults
to ('json', '.log', '.py').
keep_local (bool): Whether to keep local logs in the local machine
when :attr:`out_dir` is specified. If False, the local log will be
removed. Defaults to True.
@ -53,7 +53,7 @@ class LoggerHook(Hook):
Defaults to None.
Examples:
>>> # A simplest LoggerHook config.
>>> # The simplest LoggerHook config.
>>> logger_hook_cfg = dict(interval=20)
"""
priority = 'BELOW_NORMAL'
@ -64,7 +64,8 @@ class LoggerHook(Hook):
ignore_last: bool = True,
interval_exp_name: int = 1000,
out_dir: Optional[Union[str, Path]] = None,
out_suffix: Union[Sequence[str], str] = ('.log.json', '.log', '.py'),
out_suffix: Union[Sequence[str],
str] = ('.json', '.log', '.py', 'yaml'),
keep_local: bool = True,
file_client_args: Optional[dict] = None,
):
@ -86,6 +87,7 @@ class LoggerHook(Hook):
self.keep_local = keep_local
self.file_client_args = file_client_args
self.json_log_path: Optional[str] = None
if self.out_dir is not None:
self.file_client = FileClient.infer_client(file_client_args,
self.out_dir)
@ -106,36 +108,32 @@ class LoggerHook(Hook):
(f'Text logs will be saved to {self.out_dir} by '
f'{self.file_client.name} after the training process.'))
self.json_log_path = osp.join(runner.work_dir,
f'{runner.timestamp}.log.json')
self.yaml_log_path = osp.join(runner.work_dir,
f'{runner.timestamp}.log.json')
self.json_log_path = f'{runner.timestamp}.json'
def after_train_iter(self,
runner,
batch_idx: int,
data_batch: DATA_BATCH = None,
outputs: Optional[dict] = None) -> None:
"""Record training logs after training iteration.
"""Record logs after training iteration.
Args:
runner (Runner): The runner of the training process.
batch_idx (int): The index of the current batch in the train loop.
data_batch (Sequence[dict], optional): Data from dataloader.
Defaults to None.
outputs (dict, optional): Outputs from model.
Defaults to None.
outputs (dict, optional): Outputs from model. Defaults to None.
"""
# Print experiment name every n iterations.
if self.every_n_iters(runner,
self.interval_exp_name) or (self.end_of_epoch(
runner.train_dataloader, batch_idx)):
runner.train_loop.dataloader, batch_idx)):
exp_info = f'Exp name: {runner.experiment_name}'
runner.logger.info(exp_info)
if self.every_n_inner_iters(batch_idx, self.interval):
tag, log_str = runner.log_processor.get_log_after_iter(
runner, batch_idx, 'train')
elif (self.end_of_epoch(runner.train_dataloader, batch_idx)
elif (self.end_of_epoch(runner.train_loop.dataloader, batch_idx)
and not self.ignore_last):
# `runner.max_iters` may not be divisible by `self.interval`. if
# `self.ignore_last==True`, the log of remaining iterations will
@ -146,8 +144,8 @@ class LoggerHook(Hook):
else:
return
runner.logger.info(log_str)
# TODO compatible with visualizer.
runner.visualizer.add_scalars(tag, step=runner.iter + 1)
runner.visualizer.add_scalars(
tag, step=runner.iter + 1, file_path=self.json_log_path)
def after_val_iter(
self,
@ -155,17 +153,18 @@ class LoggerHook(Hook):
batch_idx: int,
data_batch: DATA_BATCH = None,
outputs: Optional[Sequence[BaseDataElement]] = None) -> None:
"""Record validation logs after validation iteration.
"""Record logs after validation iteration.
Args:
runner (Runner): The runner of the training process.
batch_idx (int): The index of the current batch in the train loop.
data_batch (Sequence[Tuple[Any, BaseDataElement]], optional):
Data from dataloader. Defaults to None.
runner (Runner): The runner of the validation process.
batch_idx (int): The index of the current batch in the validation
loop.
data_batch (Sequence[dict], optional): Data from dataloader.
Defaults to None.
outputs (sequence, optional): Outputs from model. Defaults to None.
"""
if self.every_n_inner_iters(batch_idx, self.interval):
tag, log_str = runner.log_processor.get_log_after_iter(
_, log_str = runner.log_processor.get_log_after_iter(
runner, batch_idx, 'val')
runner.logger.info(log_str)
@ -175,39 +174,39 @@ class LoggerHook(Hook):
batch_idx: int,
data_batch: DATA_BATCH = None,
outputs: Optional[Sequence[BaseDataElement]] = None) -> None:
"""Record testing logs after iteration.
"""Record logs after testing iteration.
Args:
runner (Runner): The runner of the training process.
batch_idx (int): The index of the current batch in the train loop.
data_batch (Sequence[Tuple[Any, BaseDataElement]], optional):
Data from dataloader. Defaults to None.
runner (Runner): The runner of the testing process.
batch_idx (int): The index of the current batch in the test loop.
data_batch (Sequence[dict], optional): Data from dataloader.
Defaults to None.
outputs (sequence, optional): Outputs from model. Defaults to None.
"""
if self.every_n_inner_iters(batch_idx, self.interval):
tag, log_str = runner.log_processor.get_log_after_iter(
_, log_str = runner.log_processor.get_log_after_iter(
runner, batch_idx, 'test')
runner.logger.info(log_str)
def after_val_epoch(self, runner) -> None:
"""Record validation logs after validation epoch.
"""Record logs after validation epoch.
Args:
runner (Runner): The runner of the training process.
runner (Runner): The runner of the validation process.
"""
tag, log_str = runner.log_processor.get_log_after_epoch(
runner, len(runner.val_dataloader), 'val')
runner.logger.info(log_str)
# TODO compatible with visualizer.
runner.visualizer.add_scalars(tag, step=runner.iter + 1)
runner.visualizer.add_scalars(
tag, step=runner.iter, file_path=self.json_log_path)
def after_test_epoch(self, runner) -> None:
"""Record testing logs after test epoch.
"""Record logs after testing epoch.
Args:
runner (Runner): The runner of the training process.
runner (Runner): The runner of the testing process.
"""
tag, log_str = runner.log_processor.get_log_after_epoch(
_, log_str = runner.log_processor.get_log_after_epoch(
runner, len(runner.val_dataloader), 'test')
runner.logger.info(log_str)
@ -215,13 +214,14 @@ class LoggerHook(Hook):
"""Copy logs to ``self.out_dir`` if ``self.out_dir is not None``
Args:
runner (Runner): The runner of the training process.
runner (Runner): The runner of the training/testing/validation
process.
"""
# copy or upload logs to self.out_dir
if self.out_dir is None:
return
for filename in scandir(runner.work_dir, self.out_suffix, True):
local_filepath = osp.join(runner.work_dir, filename)
for filename in scandir(runner._log_dir, self.out_suffix, True):
local_filepath = osp.join(runner._log_dir, filename)
out_filepath = self.file_client.join_path(self.out_dir, filename)
with open(local_filepath, 'r') as f:
self.file_client.put_text(f.read(), out_filepath)

View File

@ -181,7 +181,7 @@ class LogProcessor:
"""Format log string after validation or testing epoch.
Args:
runner (Runner): The runner of training phase.
runner (Runner): The runner of validation/testing phase.
batch_idx (int): The index of the current batch in the current
loop.
mode (str): Current mode of runner.
@ -200,10 +200,12 @@ class LogProcessor:
custom_cfg_copy = self._parse_windows_size(runner, batch_idx)
# tag is used to write log information to different backends.
tag = self._collect_scalars(custom_cfg_copy, runner, mode)
# validation log string needs cur epoch/iteration and max
# epochs/iterations. test log string only needs length of test
# dataloader.
cur_iter = self._get_iter(runner, batch_idx)
# By epoch:
# Epoch(val) [10][1000/1000] ...
# Epoch(test) [1000/1000] ...
# By iteration:
# Iteration(val) [1000/1000] ...
# Iteration(test) [1000/1000] ...
if self.by_epoch:
if mode == 'val':
cur_epoch = self._get_epoch(runner, mode)
@ -214,12 +216,9 @@ class LogProcessor:
f'Epoch({mode}) [{dataloader_len}/{dataloader_len}] ')
else:
if mode == 'train':
log_str = (f'Iter({mode}) [{cur_iter}/'
f'{runner.train_loop.max_iters}] ')
else:
log_str = (
f'Iter({mode}) [{dataloader_len}/{dataloader_len}] ')
log_str = (f'Iter({mode}) [{dataloader_len}/{dataloader_len}] ')
# `time` and `data_time` will not be recorded in after epoch log
# message.
log_items = []
for name, val in tag.items():
if name in ('time', 'data_time'):
@ -237,9 +236,9 @@ class LogProcessor:
Args:
custom_cfg (List[dict]): A copy of ``self.custom_cfg`` with int
``window_size``.
runner (Runner): The runner of the training process.
mode (str): 'train' or 'val', which means the prefix attached by
runner.
runner (Runner): The runner of the training/testing/validation
process.
mode (str): Current mode of runner.
Returns:
dict: Statistical values of logs.
@ -253,7 +252,7 @@ class LogProcessor:
# according to mode.
for prefix_key, log_buffer in history_scalars.items():
if prefix_key.startswith(mode):
key = prefix_key.split('/')[-1]
key = prefix_key.partition('/')[-1]
mode_history_scalars[key] = log_buffer
for key in mode_history_scalars:
# Update the latest learning rate and smoothed time logs.
@ -287,25 +286,23 @@ class LogProcessor:
' is False.'
def _check_repeated_log_name():
check_dict = dict()
# The `log_name` of the same data_src should not be repeated.
# If `log_name` is not specified, `data_src` will be overwritten.
# But only allowed to be overwritten once.
check_set = set()
for log_cfg in self.custom_cfg:
assert 'data_src' in log_cfg
data_src = log_cfg['data_src']
log_name = log_cfg.get('log_name', data_src)
check_dict.setdefault(data_src,
dict(log_names=set(), log_counts=0))
check_dict[data_src]['log_names'].add(log_name)
check_dict[data_src]['log_counts'] += 1
assert (len(
check_dict[data_src]
['log_names']) == check_dict[data_src]['log_counts']), (
f'If you want to statistic {data_src} with multiple '
'statistics method, please check `log_name` is unique'
f'and {data_src} will not be overwritten twice. See '
f'more information in the docstring of `LogProcessor`')
assert log_name not in check_set, (
f'Found duplicate {log_name} for {data_src}. Please check'
'your `custom_cfg` for `log_processor`. You should '
f'neither define duplicate `{log_name}` for {data_src} '
f'nor do not define any {log_name} for multiple '
f'{data_src}, See more information in the docstring of '
'LogProcessor')
check_set.add(log_name)
_check_repeated_log_name()
_check_window_size()
@ -314,7 +311,8 @@ class LogProcessor:
"""Parse window_size defined in custom_cfg to int value.
Args:
runner (Runner): The runner of the training process.
runner (Runner): The runner of the training/testing/validation
process.
batch_idx (int): The iteration index of current dataloader.
"""
custom_cfg_copy = copy.deepcopy(self.custom_cfg)
@ -337,7 +335,8 @@ class LogProcessor:
for a given device.
Args:
runner (Runner): The runner of the training process.
runner (Runner): The runner of the training/testing/validation
process.
Returns:
The maximum GPU memory occupied by tensors in megabytes for a given
@ -352,11 +351,12 @@ class LogProcessor:
return int(mem_mb.item())
def _get_iter(self, runner, batch_idx: int = None) -> int:
"""Get current training iteration step.
"""Get current iteration index.
Args:
runner (Runner): The runner of the training process.
batch_idx (int, optional): The interaction index of current
runner (Runner): The runner of the training/testing/validation
process.
batch_idx (int, optional): The iteration index of current
dataloader. Defaults to None.
Returns:
@ -372,8 +372,9 @@ class LogProcessor:
"""Get current epoch according to mode.
Args:
runner (Runner): The runner of the training/validation process.
mode (str): Current mode of runner, "train" or "val".
runner (Runner): The runner of the training/testing/validation
process.
mode (str): Current mode of runner.
Returns:
int: The current epoch.
@ -395,7 +396,7 @@ class LogProcessor:
Args:
runner (Runner): The runner of the training/validation/testing
process.
mode (str): Current mode of runner, "train", "val" or test.
mode (str): Current mode of runner.
Returns:
BaseLoop: Current loop of runner.

View File

@ -64,7 +64,9 @@ class Runner:
Args:
model (:obj:`torch.nn.Module` or dict): The model to be run. It can be
a dict used for build a model.
work_dir (str): The working directory to save checkpoints and logs.
work_dir (str): The working directory to save checkpoints. The logs
will be saved in the subdirectory of `work_dir` named
:attr:`timestamp`.
train_dataloader (Dataloader or dict, optional): A dataloader object or
a dict to build a dataloader. If ``None`` is given, it means
skipping training steps. Defaults to None.
@ -637,7 +639,7 @@ class Runner:
MMLogger: A MMLogger object build from ``logger``.
"""
if log_file is None:
log_file = osp.join(self._log_dir, f'{self._experiment_name}.log')
log_file = osp.join(self._log_dir, f'{self.timestamp}.log')
log_cfg = dict(log_level=log_level, log_file=log_file, **kwargs)
log_cfg.setdefault('name', self._experiment_name)

View File

@ -15,7 +15,7 @@ class TestLoggerHook:
assert logger_hook.interval == 10
assert logger_hook.ignore_last
assert logger_hook.interval_exp_name == 1000
assert logger_hook.out_suffix == ('.log.json', '.log', '.py')
assert logger_hook.out_suffix == ('.json', '.log', '.py', 'yaml')
assert logger_hook.keep_local
assert logger_hook.file_client_args is None
assert isinstance(logger_hook.file_client.client, HardDiskBackend)
@ -29,36 +29,42 @@ class TestLoggerHook:
def test_before_run(self):
runner = MagicMock()
runner.iter = 10
runner.timestamp = 'timestamp'
runner.timestamp = '20220429'
runner._log_dir = f'work_dir/{runner.timestamp}'
runner.work_dir = 'work_dir'
runner.logger = MagicMock()
logger_hook = LoggerHook(out_dir='out_dir')
logger_hook.before_run(runner)
assert logger_hook.out_dir == osp.join('out_dir', 'work_dir')
assert logger_hook.json_log_path == osp.join('work_dir',
'timestamp.log.json')
assert logger_hook.json_log_path == f'{runner.timestamp}.json'
def test_after_run(self, tmp_path):
# Test
timestamp = '20220429'
out_dir = tmp_path / 'out_dir'
out_dir.mkdir()
work_dir = tmp_path / 'work_dir'
work_dir.mkdir()
work_dir_json = work_dir / 'tmp.log.json'
log_dir = work_dir / timestamp
log_dir.mkdir()
log_dir_json = log_dir / 'tmp.log.json'
runner = MagicMock()
runner.work_dir = work_dir
runner._log_dir = str(log_dir)
runner.timestamp = timestamp
runner.work_dir = str(work_dir)
# Test without out_dir.
logger_hook = LoggerHook()
logger_hook.after_run(runner)
# Test with out_dir and make sure json file has been moved to out_dir.
json_f = open(work_dir_json, 'w')
json_f = open(log_dir_json, 'w')
json_f.close()
logger_hook = LoggerHook(out_dir=str(tmp_path), keep_local=False)
logger_hook = LoggerHook(out_dir=str(out_dir), keep_local=False)
logger_hook.out_dir = str(out_dir)
logger_hook.before_run(runner)
logger_hook.after_run(runner)
# Verify that the file has been moved to `out_dir`.
assert not osp.exists(str(work_dir_json))
assert osp.exists(str(out_dir / 'tmp.log.json'))
assert not osp.exists(str(log_dir_json))
assert osp.exists(str(out_dir / 'work_dir' / 'tmp.log.json'))
def test_after_train_iter(self):
# Test LoggerHook by iter.
@ -89,7 +95,7 @@ class TestLoggerHook:
runner.log_processor.get_log_after_iter = MagicMock(
return_value=(dict(), 'log_str'))
logger_hook = LoggerHook(ignore_last=False)
runner.train_dataloader = [0] * 5
runner.train_loop.dataloader = [0] * 5
logger_hook.after_train_iter(runner, batch_idx=4)
runner.log_processor.get_log_after_iter.assert_called()