[Fix] Fix some code and docstring in LogProcessor and LoggerHook (#213)

* fix logger hook

* fix to record multi-level logs, log train/a/b to a/b

* fix loggerhook json path

* fix unit test

* change runner.train_dataloader to runner.train_loop.dataloader

* clean debug code

* refine comments and docstrings

* fix unit test
Mashiro 2022-05-06 14:33:33 +08:00 committed by GitHub
parent 5c5c03e648
commit 859f4d1580
4 changed files with 97 additions and 88 deletions

View File

@@ -20,7 +20,7 @@ class LoggerHook(Hook):
     ``LoggerHook`` is used to record logs formatted by ``LogProcessor`` during
     training/validation/testing phase. It is used to control following
-    behaviers:
+    behaviors:
     - The frequency of logs update in terminal, local, tensorboad wandb.etc.
     - The frequency of show experiment information in terminal.
@@ -41,10 +41,10 @@ class LoggerHook(Hook):
             of ``out_dir`` and the last level directory of
             ``runner.work_dir``. For example, if the input ``our_dir`` is
             ``./tmp`` and ``runner.work_dir`` is ``./work_dir/cur_exp``,
-            then the log will be saved in ``./tmp/cur_exp``. Deafule to None.
-        out_suffix (Tuple[str] or str): Those filenames ending with
-            ``out_suffix`` will be copied to ``out_dir``. Defaults to
-            ('.log.json', '.log', '.py').
+            then the log will be saved in ``./tmp/cur_exp``. Defaults to None.
+        out_suffix (Tuple[str] or str): Those files in ``runner._log_dir``
+            ending with ``out_suffix`` will be copied to ``out_dir``. Defaults
+            to ('json', '.log', '.py').
         keep_local (bool): Whether to keep local logs in the local machine
             when :attr:`out_dir` is specified. If False, the local log will be
             removed. Defaults to True.
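A quick illustration of the path composition this docstring describes; a minimal standalone sketch, not the hook's actual implementation, assuming the ``./tmp`` and ``./work_dir/cur_exp`` values from the docstring example:

    import os.path as osp

    out_dir = './tmp'
    work_dir = './work_dir/cur_exp'
    # The hook joins `out_dir` with the last level of `work_dir`.
    print(osp.join(out_dir, osp.basename(work_dir)))  # ./tmp/cur_exp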
@ -53,7 +53,7 @@ class LoggerHook(Hook):
Defaults to None. Defaults to None.
Examples: Examples:
>>> # A simplest LoggerHook config. >>> # The simplest LoggerHook config.
>>> logger_hook_cfg = dict(interval=20) >>> logger_hook_cfg = dict(interval=20)
""" """
priority = 'BELOW_NORMAL' priority = 'BELOW_NORMAL'
@@ -64,7 +64,8 @@ class LoggerHook(Hook):
                  ignore_last: bool = True,
                  interval_exp_name: int = 1000,
                  out_dir: Optional[Union[str, Path]] = None,
-                 out_suffix: Union[Sequence[str], str] = ('.log.json', '.log', '.py'),
+                 out_suffix: Union[Sequence[str],
+                                   str] = ('.json', '.log', '.py', 'yaml'),
                  keep_local: bool = True,
                  file_client_args: Optional[dict] = None,
                  ):
@@ -86,6 +87,7 @@ class LoggerHook(Hook):
         self.keep_local = keep_local
         self.file_client_args = file_client_args
+        self.json_log_path: Optional[str] = None
         if self.out_dir is not None:
             self.file_client = FileClient.infer_client(file_client_args,
                                                        self.out_dir)
@@ -106,36 +108,32 @@ class LoggerHook(Hook):
                 (f'Text logs will be saved to {self.out_dir} by '
                  f'{self.file_client.name} after the training process.'))

-        self.json_log_path = osp.join(runner.work_dir,
-                                      f'{runner.timestamp}.log.json')
-        self.yaml_log_path = osp.join(runner.work_dir,
-                                      f'{runner.timestamp}.log.json')
+        self.json_log_path = f'{runner.timestamp}.json'

     def after_train_iter(self,
                          runner,
                          batch_idx: int,
                          data_batch: DATA_BATCH = None,
                          outputs: Optional[dict] = None) -> None:
-        """Record training logs after training iteration.
+        """Record logs after training iteration.

         Args:
             runner (Runner): The runner of the training process.
             batch_idx (int): The index of the current batch in the train loop.
             data_batch (Sequence[dict], optional): Data from dataloader.
                 Defaults to None.
-            outputs (dict, optional): Outputs from model.
-                Defaults to None.
+            outputs (dict, optional): Outputs from model. Defaults to None.
         """
         # Print experiment name every n iterations.
         if self.every_n_iters(runner,
                               self.interval_exp_name) or (self.end_of_epoch(
-                                  runner.train_dataloader, batch_idx)):
+                                  runner.train_loop.dataloader, batch_idx)):
             exp_info = f'Exp name: {runner.experiment_name}'
             runner.logger.info(exp_info)
         if self.every_n_inner_iters(batch_idx, self.interval):
             tag, log_str = runner.log_processor.get_log_after_iter(
                 runner, batch_idx, 'train')
-        elif (self.end_of_epoch(runner.train_dataloader, batch_idx)
+        elif (self.end_of_epoch(runner.train_loop.dataloader, batch_idx)
               and not self.ignore_last):
             # `runner.max_iters` may not be divisible by `self.interval`. if
             # `self.ignore_last==True`, the log of remaining iterations will
@@ -146,8 +144,8 @@ class LoggerHook(Hook):
         else:
             return
         runner.logger.info(log_str)
-        # TODO compatible with visualizer.
-        runner.visualizer.add_scalars(tag, step=runner.iter + 1)
+        runner.visualizer.add_scalars(
+            tag, step=runner.iter + 1, file_path=self.json_log_path)

     def after_val_iter(
             self,
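For readers skimming the diff, the cadence that ``after_train_iter`` implements can be reproduced in isolation. A minimal sketch, assuming a hypothetical 5-batch dataloader and ``interval=2``; it mirrors the branch structure above without the runner machinery:

    interval = 2
    ignore_last = False  # when False, the tail of the epoch is also logged
    dataloader = [0] * 5  # stand-in for runner.train_loop.dataloader

    for batch_idx in range(len(dataloader)):
        if (batch_idx + 1) % interval == 0:
            print(f'interval log at batch {batch_idx}')
        elif batch_idx + 1 == len(dataloader) and not ignore_last:
            # `max_iters` may not be divisible by `interval`; this branch
            # flushes the remaining iterations at the end of the epoch.
            print(f'end-of-epoch log at batch {batch_idx}')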
@@ -155,17 +153,18 @@ class LoggerHook(Hook):
             batch_idx: int,
             data_batch: DATA_BATCH = None,
             outputs: Optional[Sequence[BaseDataElement]] = None) -> None:
-        """Record validation logs after validation iteration.
+        """Record logs after validation iteration.

         Args:
-            runner (Runner): The runner of the training process.
-            batch_idx (int): The index of the current batch in the train loop.
-            data_batch (Sequence[Tuple[Any, BaseDataElement]], optional):
-                Data from dataloader. Defaults to None.
+            runner (Runner): The runner of the validation process.
+            batch_idx (int): The index of the current batch in the validation
+                loop.
+            data_batch (Sequence[dict], optional): Data from dataloader.
+                Defaults to None.
             outputs (sequence, optional): Outputs from model. Defaults to None.
         """
         if self.every_n_inner_iters(batch_idx, self.interval):
-            tag, log_str = runner.log_processor.get_log_after_iter(
+            _, log_str = runner.log_processor.get_log_after_iter(
                 runner, batch_idx, 'val')
             runner.logger.info(log_str)
@@ -175,39 +174,39 @@ class LoggerHook(Hook):
             batch_idx: int,
             data_batch: DATA_BATCH = None,
             outputs: Optional[Sequence[BaseDataElement]] = None) -> None:
-        """Record testing logs after iteration.
+        """Record logs after testing iteration.

         Args:
-            runner (Runner): The runner of the training process.
-            batch_idx (int): The index of the current batch in the train loop.
-            data_batch (Sequence[Tuple[Any, BaseDataElement]], optional):
-                Data from dataloader. Defaults to None.
+            runner (Runner): The runner of the testing process.
+            batch_idx (int): The index of the current batch in the test loop.
+            data_batch (Sequence[dict], optional): Data from dataloader.
+                Defaults to None.
             outputs (sequence, optional): Outputs from model. Defaults to None.
         """
         if self.every_n_inner_iters(batch_idx, self.interval):
-            tag, log_str = runner.log_processor.get_log_after_iter(
+            _, log_str = runner.log_processor.get_log_after_iter(
                 runner, batch_idx, 'test')
             runner.logger.info(log_str)

     def after_val_epoch(self, runner) -> None:
-        """Record validation logs after validation epoch.
+        """Record logs after validation epoch.

         Args:
-            runner (Runner): The runner of the training process.
+            runner (Runner): The runner of the validation process.
         """
         tag, log_str = runner.log_processor.get_log_after_epoch(
             runner, len(runner.val_dataloader), 'val')
         runner.logger.info(log_str)
-        # TODO compatible with visualizer.
-        runner.visualizer.add_scalars(tag, step=runner.iter + 1)
+        runner.visualizer.add_scalars(
+            tag, step=runner.iter, file_path=self.json_log_path)

     def after_test_epoch(self, runner) -> None:
-        """Record testing logs after test epoch.
+        """Record logs after testing epoch.

         Args:
-            runner (Runner): The runner of the training process.
+            runner (Runner): The runner of the testing process.
         """
-        tag, log_str = runner.log_processor.get_log_after_epoch(
+        _, log_str = runner.log_processor.get_log_after_epoch(
             runner, len(runner.val_dataloader), 'test')
         runner.logger.info(log_str)
@@ -215,13 +214,14 @@ class LoggerHook(Hook):
         """Copy logs to ``self.out_dir`` if ``self.out_dir is not None``

         Args:
-            runner (Runner): The runner of the training process.
+            runner (Runner): The runner of the training/testing/validation
+                process.
         """
         # copy or upload logs to self.out_dir
         if self.out_dir is None:
             return
-        for filename in scandir(runner.work_dir, self.out_suffix, True):
-            local_filepath = osp.join(runner.work_dir, filename)
+        for filename in scandir(runner._log_dir, self.out_suffix, True):
+            local_filepath = osp.join(runner._log_dir, filename)
             out_filepath = self.file_client.join_path(self.out_dir, filename)
             with open(local_filepath, 'r') as f:
                 self.file_client.put_text(f.read(), out_filepath)
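The copy step above now walks ``runner._log_dir`` instead of ``runner.work_dir``. A rough local-filesystem equivalent, using ``os.walk`` and ``shutil`` as stand-ins for mmengine's ``scandir``/``FileClient``, with the new default suffixes assumed:

    import os
    import os.path as osp
    import shutil

    def copy_logs(log_dir: str, out_dir: str,
                  out_suffix=('.json', '.log', '.py', 'yaml')) -> None:
        """Copy every file under ``log_dir`` whose name ends with ``out_suffix``."""
        for root, _, files in os.walk(log_dir):
            for name in files:
                if not name.endswith(out_suffix):
                    continue
                src = osp.join(root, name)
                dst = osp.join(out_dir, osp.relpath(src, log_dir))
                os.makedirs(osp.dirname(dst), exist_ok=True)
                shutil.copyfile(src, dst)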

View File

@@ -181,7 +181,7 @@ class LogProcessor:
         """Format log string after validation or testing epoch.

         Args:
-            runner (Runner): The runner of training phase.
+            runner (Runner): The runner of validation/testing phase.
             batch_idx (int): The index of the current batch in the current
                 loop.
             mode (str): Current mode of runner.
@ -200,10 +200,12 @@ class LogProcessor:
custom_cfg_copy = self._parse_windows_size(runner, batch_idx) custom_cfg_copy = self._parse_windows_size(runner, batch_idx)
# tag is used to write log information to different backends. # tag is used to write log information to different backends.
tag = self._collect_scalars(custom_cfg_copy, runner, mode) tag = self._collect_scalars(custom_cfg_copy, runner, mode)
# validation log string needs cur epoch/iteration and max # By epoch:
# epochs/iterations. test log string only needs length of test # Epoch(val) [10][1000/1000] ...
# dataloader. # Epoch(test) [1000/1000] ...
cur_iter = self._get_iter(runner, batch_idx) # By iteration:
# Iteration(val) [1000/1000] ...
# Iteration(test) [1000/1000] ...
if self.by_epoch: if self.by_epoch:
if mode == 'val': if mode == 'val':
cur_epoch = self._get_epoch(runner, mode) cur_epoch = self._get_epoch(runner, mode)
@@ -214,12 +216,9 @@ class LogProcessor:
                     f'Epoch({mode}) [{dataloader_len}/{dataloader_len}] ')
         else:
-            if mode == 'train':
-                log_str = (f'Iter({mode}) [{cur_iter}/'
-                           f'{runner.train_loop.max_iters}] ')
-            else:
-                log_str = (
-                    f'Iter({mode}) [{dataloader_len}/{dataloader_len}] ')
+            log_str = (f'Iter({mode}) [{dataloader_len}/{dataloader_len}] ')
+        # `time` and `data_time` will not be recorded in after epoch log
+        # message.
         log_items = []
         for name, val in tag.items():
             if name in ('time', 'data_time'):
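The new comments enumerate the after-epoch prefixes; a toy reproduction of that branch (the 1000-batch dataloader and epoch 10 are hypothetical values, matching the examples in the comments):

    def epoch_log_prefix(mode: str, by_epoch: bool, dataloader_len: int,
                         cur_epoch: int = 10) -> str:
        # Mirrors the branch above: only 'val' carries a current-epoch field.
        if by_epoch:
            if mode == 'val':
                return f'Epoch({mode}) [{cur_epoch}][{dataloader_len}/{dataloader_len}] '
            return f'Epoch({mode}) [{dataloader_len}/{dataloader_len}] '
        return f'Iter({mode}) [{dataloader_len}/{dataloader_len}] '

    print(epoch_log_prefix('val', True, 1000))    # Epoch(val) [10][1000/1000]
    print(epoch_log_prefix('test', False, 1000))  # Iter(test) [1000/1000]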
@@ -237,9 +236,9 @@ class LogProcessor:
         Args:
             custom_cfg (List[dict]): A copy of ``self.custom_cfg`` with int
                 ``window_size``.
-            runner (Runner): The runner of the training process.
-            mode (str): 'train' or 'val', which means the prefix attached by
-                runner.
+            runner (Runner): The runner of the training/testing/validation
+                process.
+            mode (str): Current mode of runner.

         Returns:
             dict: Statistical values of logs.
@@ -253,7 +252,7 @@ class LogProcessor:
         # according to mode.
         for prefix_key, log_buffer in history_scalars.items():
             if prefix_key.startswith(mode):
-                key = prefix_key.split('/')[-1]
+                key = prefix_key.partition('/')[-1]
                 mode_history_scalars[key] = log_buffer
         for key in mode_history_scalars:
             # Update the latest learning rate and smoothed time logs.
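This one-line change is the heart of the multi-level-tag fix from the commit message ("log train/a/b to a/b"): ``str.partition`` splits only at the first separator, so nested tags survive the prefix strip. A two-line demonstration:

    prefix_key = 'train/a/b'
    print(prefix_key.split('/')[-1])      # 'b'   -- old behavior, drops 'a'
    print(prefix_key.partition('/')[-1])  # 'a/b' -- new behavior, keeps sub-tags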
@@ -287,25 +286,23 @@ class LogProcessor:
                 ' is False.'

         def _check_repeated_log_name():
-            check_dict = dict()
             # The `log_name` of the same data_src should not be repeated.
             # If `log_name` is not specified, `data_src` will be overwritten.
             # But only allowed to be overwritten once.
+            check_set = set()
             for log_cfg in self.custom_cfg:
                 assert 'data_src' in log_cfg
                 data_src = log_cfg['data_src']
                 log_name = log_cfg.get('log_name', data_src)
-                check_dict.setdefault(data_src,
-                                      dict(log_names=set(), log_counts=0))
-                check_dict[data_src]['log_names'].add(log_name)
-                check_dict[data_src]['log_counts'] += 1
-                assert (len(
-                    check_dict[data_src]
-                    ['log_names']) == check_dict[data_src]['log_counts']), (
-                        f'If you want to statistic {data_src} with multiple '
-                        'statistics method, please check `log_name` is unique'
-                        f'and {data_src} will not be overwritten twice. See '
-                        f'more information in the docstring of `LogProcessor`')
+                assert log_name not in check_set, (
+                    f'Found duplicate {log_name} for {data_src}. Please check'
+                    'your `custom_cfg` for `log_processor`. You should '
+                    f'neither define duplicate `{log_name}` for {data_src} '
+                    f'nor do not define any {log_name} for multiple '
+                    f'{data_src}, See more information in the docstring of '
+                    'LogProcessor')
+                check_set.add(log_name)

         _check_repeated_log_name()
         _check_window_size()
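Extracted from the rewrite above, a self-contained version of the set-based duplicate check; the ``custom_cfg`` entries are made up for illustration, and the third one trips the assertion because ``log_name`` falls back to ``data_src`` twice:

    custom_cfg = [
        dict(data_src='loss', log_name='loss_mean', method_name='mean'),
        dict(data_src='loss'),  # log_name falls back to data_src -> 'loss'
        dict(data_src='loss'),  # second fallback to 'loss' -> duplicate
    ]

    check_set = set()
    try:
        for log_cfg in custom_cfg:
            data_src = log_cfg['data_src']
            log_name = log_cfg.get('log_name', data_src)
            assert log_name not in check_set, (
                f'Found duplicate {log_name} for {data_src}.')
            check_set.add(log_name)
    except AssertionError as e:
        print(e)  # Found duplicate loss for loss.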
@@ -314,7 +311,8 @@ class LogProcessor:
         """Parse window_size defined in custom_cfg to int value.

         Args:
-            runner (Runner): The runner of the training process.
+            runner (Runner): The runner of the training/testing/validation
+                process.
             batch_idx (int): The iteration index of current dataloader.
         """
         custom_cfg_copy = copy.deepcopy(self.custom_cfg)
@@ -337,7 +335,8 @@ class LogProcessor:
         for a given device.

         Args:
-            runner (Runner): The runner of the training process.
+            runner (Runner): The runner of the training/testing/validation
+                process.

         Returns:
             The maximum GPU memory occupied by tensors in megabytes for a given
@@ -352,11 +351,12 @@ class LogProcessor:
         return int(mem_mb.item())

     def _get_iter(self, runner, batch_idx: int = None) -> int:
-        """Get current training iteration step.
+        """Get current iteration index.

         Args:
-            runner (Runner): The runner of the training process.
-            batch_idx (int, optional): The interaction index of current
+            runner (Runner): The runner of the training/testing/validation
+                process.
+            batch_idx (int, optional): The iteration index of current
                 dataloader. Defaults to None.

         Returns:
@@ -372,8 +372,9 @@ class LogProcessor:
         """Get current epoch according to mode.

         Args:
-            runner (Runner): The runner of the training/validation process.
-            mode (str): Current mode of runner, "train" or "val".
+            runner (Runner): The runner of the training/testing/validation
+                process.
+            mode (str): Current mode of runner.

         Returns:
             int: The current epoch.
@@ -395,7 +396,7 @@ class LogProcessor:
         Args:
             runner (Runner): The runner of the training/validation/testing
                 process.
-            mode (str): Current mode of runner, "train", "val" or test.
+            mode (str): Current mode of runner.

         Returns:
             BaseLoop: Current loop of runner.

View File

@@ -64,7 +64,9 @@ class Runner:
     Args:
         model (:obj:`torch.nn.Module` or dict): The model to be run. It can be
             a dict used for build a model.
-        work_dir (str): The working directory to save checkpoints and logs.
+        work_dir (str): The working directory to save checkpoints. The logs
+            will be saved in the subdirectory of `work_dir` named
+            :attr:`timestamp`.
         train_dataloader (Dataloader or dict, optional): A dataloader object or
             a dict to build a dataloader. If ``None`` is given, it means
             skipping training steps. Defaults to None.
@@ -637,7 +639,7 @@ class Runner:
             MMLogger: A MMLogger object build from ``logger``.
         """
         if log_file is None:
-            log_file = osp.join(self._log_dir, f'{self._experiment_name}.log')
+            log_file = osp.join(self._log_dir, f'{self.timestamp}.log')
         log_cfg = dict(log_level=log_level, log_file=log_file, **kwargs)
         log_cfg.setdefault('name', self._experiment_name)
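Taken together with the LoggerHook changes above, the on-disk layout this commit settles on looks roughly like this (the timestamp value is illustrative, not taken from the diff):

    work_dir/
    └── 20220429_120000/          <- runner._log_dir, named after runner.timestamp
        ├── 20220429_120000.log   <- text log (log_file above)
        └── 20220429_120000.json  <- scalar log referenced by json_log_path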

View File

@@ -15,7 +15,7 @@ class TestLoggerHook:
         assert logger_hook.interval == 10
         assert logger_hook.ignore_last
         assert logger_hook.interval_exp_name == 1000
-        assert logger_hook.out_suffix == ('.log.json', '.log', '.py')
+        assert logger_hook.out_suffix == ('.json', '.log', '.py', 'yaml')
         assert logger_hook.keep_local
         assert logger_hook.file_client_args is None
         assert isinstance(logger_hook.file_client.client, HardDiskBackend)
@@ -29,36 +29,42 @@ class TestLoggerHook:
     def test_before_run(self):
         runner = MagicMock()
         runner.iter = 10
-        runner.timestamp = 'timestamp'
+        runner.timestamp = '20220429'
+        runner._log_dir = f'work_dir/{runner.timestamp}'
         runner.work_dir = 'work_dir'
         runner.logger = MagicMock()
         logger_hook = LoggerHook(out_dir='out_dir')
         logger_hook.before_run(runner)
         assert logger_hook.out_dir == osp.join('out_dir', 'work_dir')
-        assert logger_hook.json_log_path == osp.join('work_dir',
-                                                     'timestamp.log.json')
+        assert logger_hook.json_log_path == f'{runner.timestamp}.json'

     def test_after_run(self, tmp_path):
         # Test
+        timestamp = '20220429'
         out_dir = tmp_path / 'out_dir'
         out_dir.mkdir()
         work_dir = tmp_path / 'work_dir'
         work_dir.mkdir()
-        work_dir_json = work_dir / 'tmp.log.json'
+        log_dir = work_dir / timestamp
+        log_dir.mkdir()
+        log_dir_json = log_dir / 'tmp.log.json'
         runner = MagicMock()
-        runner.work_dir = work_dir
+        runner._log_dir = str(log_dir)
+        runner.timestamp = timestamp
+        runner.work_dir = str(work_dir)
         # Test without out_dir.
         logger_hook = LoggerHook()
         logger_hook.after_run(runner)
         # Test with out_dir and make sure json file has been moved to out_dir.
-        json_f = open(work_dir_json, 'w')
+        json_f = open(log_dir_json, 'w')
         json_f.close()
-        logger_hook = LoggerHook(out_dir=str(tmp_path), keep_local=False)
+        logger_hook = LoggerHook(out_dir=str(out_dir), keep_local=False)
         logger_hook.out_dir = str(out_dir)
+        logger_hook.before_run(runner)
         logger_hook.after_run(runner)
         # Verify that the file has been moved to `out_dir`.
-        assert not osp.exists(str(work_dir_json))
-        assert osp.exists(str(out_dir / 'tmp.log.json'))
+        assert not osp.exists(str(log_dir_json))
+        assert osp.exists(str(out_dir / 'work_dir' / 'tmp.log.json'))
     def test_after_train_iter(self):
         # Test LoggerHook by iter.
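Note the added ``logger_hook.before_run(runner)`` call in the hunk above: it is the step that resolves ``out_dir`` and ``json_log_path``, which the final assertion depends on. A sketch of the resolution the test relies on (the path strings mirror the test, not real directories):

    import os.path as osp

    out_dir = 'out_dir'
    work_dir = 'work_dir'
    # before_run appends the last level of work_dir to out_dir ...
    resolved_out_dir = osp.join(out_dir, osp.basename(work_dir))
    print(resolved_out_dir)  # out_dir/work_dir
    # ... which is why the assertion checks out_dir / 'work_dir' / 'tmp.log.json'.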
@@ -89,7 +95,7 @@ class TestLoggerHook:
         runner.log_processor.get_log_after_iter = MagicMock(
             return_value=(dict(), 'log_str'))
         logger_hook = LoggerHook(ignore_last=False)
-        runner.train_dataloader = [0] * 5
+        runner.train_loop.dataloader = [0] * 5
         logger_hook.after_train_iter(runner, batch_idx=4)
         runner.log_processor.get_log_after_iter.assert_called()
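One detail that makes the dataloader swap in this test cheap: ``MagicMock`` auto-creates attributes on access, so pointing ``runner.train_loop.dataloader`` at a plain list needs no extra setup. A two-line demonstration:

    from unittest.mock import MagicMock

    runner = MagicMock()
    runner.train_loop.dataloader = [0] * 5    # nested attribute created on the fly
    print(len(runner.train_loop.dataloader))  # 5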