[Fix] Fix some code and docstring in LogProcessor and LoggerHook (#213)

* fix logger hook

* fix to record multi-level logs, log train/a/b to a/b

* fix loggerhook json path

* fix unit test

* change runner.train_dataloader to runner.train_loop.dataloader

* clean debug code

* refine comments and docstring

* fix unit test
This commit is contained in:
Mashiro 2022-05-06 14:33:33 +08:00 committed by GitHub
parent 5c5c03e648
commit 859f4d1580
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 97 additions and 88 deletions

View File

@ -20,7 +20,7 @@ class LoggerHook(Hook):
``LoggerHook`` is used to record logs formatted by ``LogProcessor`` during
training/validation/testing phase. It is used to control following
behaviers:
behaviors:
- The frequency of logs update in terminal, local, tensorboad wandb.etc.
- The frequency of show experiment information in terminal.
@ -41,10 +41,10 @@ class LoggerHook(Hook):
of ``out_dir`` and the last level directory of
``runner.work_dir``. For example, if the input ``our_dir`` is
``./tmp`` and ``runner.work_dir`` is ``./work_dir/cur_exp``,
then the log will be saved in ``./tmp/cur_exp``. Deafule to None.
out_suffix (Tuple[str] or str): Those filenames ending with
``out_suffix`` will be copied to ``out_dir``. Defaults to
('.log.json', '.log', '.py').
then the log will be saved in ``./tmp/cur_exp``. Defaults to None.
out_suffix (Tuple[str] or str): Those files in ``runner._log_dir``
ending with ``out_suffix`` will be copied to ``out_dir``. Defaults
to ('json', '.log', '.py').
keep_local (bool): Whether to keep local logs in the local machine
when :attr:`out_dir` is specified. If False, the local log will be
removed. Defaults to True.
@ -53,7 +53,7 @@ class LoggerHook(Hook):
Defaults to None.
Examples:
>>> # A simplest LoggerHook config.
>>> # The simplest LoggerHook config.
>>> logger_hook_cfg = dict(interval=20)
"""
priority = 'BELOW_NORMAL'
@ -64,7 +64,8 @@ class LoggerHook(Hook):
ignore_last: bool = True,
interval_exp_name: int = 1000,
out_dir: Optional[Union[str, Path]] = None,
out_suffix: Union[Sequence[str], str] = ('.log.json', '.log', '.py'),
out_suffix: Union[Sequence[str],
str] = ('.json', '.log', '.py', 'yaml'),
keep_local: bool = True,
file_client_args: Optional[dict] = None,
):
@ -86,6 +87,7 @@ class LoggerHook(Hook):
self.keep_local = keep_local
self.file_client_args = file_client_args
self.json_log_path: Optional[str] = None
if self.out_dir is not None:
self.file_client = FileClient.infer_client(file_client_args,
self.out_dir)
@ -106,36 +108,32 @@ class LoggerHook(Hook):
(f'Text logs will be saved to {self.out_dir} by '
f'{self.file_client.name} after the training process.'))
self.json_log_path = osp.join(runner.work_dir,
f'{runner.timestamp}.log.json')
self.yaml_log_path = osp.join(runner.work_dir,
f'{runner.timestamp}.log.json')
self.json_log_path = f'{runner.timestamp}.json'
def after_train_iter(self,
runner,
batch_idx: int,
data_batch: DATA_BATCH = None,
outputs: Optional[dict] = None) -> None:
"""Record training logs after training iteration.
"""Record logs after training iteration.
Args:
runner (Runner): The runner of the training process.
batch_idx (int): The index of the current batch in the train loop.
data_batch (Sequence[dict], optional): Data from dataloader.
Defaults to None.
outputs (dict, optional): Outputs from model.
Defaults to None.
outputs (dict, optional): Outputs from model. Defaults to None.
"""
# Print experiment name every n iterations.
if self.every_n_iters(runner,
self.interval_exp_name) or (self.end_of_epoch(
runner.train_dataloader, batch_idx)):
runner.train_loop.dataloader, batch_idx)):
exp_info = f'Exp name: {runner.experiment_name}'
runner.logger.info(exp_info)
if self.every_n_inner_iters(batch_idx, self.interval):
tag, log_str = runner.log_processor.get_log_after_iter(
runner, batch_idx, 'train')
elif (self.end_of_epoch(runner.train_dataloader, batch_idx)
elif (self.end_of_epoch(runner.train_loop.dataloader, batch_idx)
and not self.ignore_last):
# `runner.max_iters` may not be divisible by `self.interval`. if
# `self.ignore_last==True`, the log of remaining iterations will
@ -146,8 +144,8 @@ class LoggerHook(Hook):
else:
return
runner.logger.info(log_str)
# TODO compatible with visualizer.
runner.visualizer.add_scalars(tag, step=runner.iter + 1)
runner.visualizer.add_scalars(
tag, step=runner.iter + 1, file_path=self.json_log_path)
def after_val_iter(
self,
@ -155,17 +153,18 @@ class LoggerHook(Hook):
batch_idx: int,
data_batch: DATA_BATCH = None,
outputs: Optional[Sequence[BaseDataElement]] = None) -> None:
"""Record validation logs after validation iteration.
"""Record logs after validation iteration.
Args:
runner (Runner): The runner of the training process.
batch_idx (int): The index of the current batch in the train loop.
data_batch (Sequence[Tuple[Any, BaseDataElement]], optional):
Data from dataloader. Defaults to None.
runner (Runner): The runner of the validation process.
batch_idx (int): The index of the current batch in the validation
loop.
data_batch (Sequence[dict], optional): Data from dataloader.
Defaults to None.
outputs (sequence, optional): Outputs from model. Defaults to None.
"""
if self.every_n_inner_iters(batch_idx, self.interval):
tag, log_str = runner.log_processor.get_log_after_iter(
_, log_str = runner.log_processor.get_log_after_iter(
runner, batch_idx, 'val')
runner.logger.info(log_str)
@ -175,39 +174,39 @@ class LoggerHook(Hook):
batch_idx: int,
data_batch: DATA_BATCH = None,
outputs: Optional[Sequence[BaseDataElement]] = None) -> None:
"""Record testing logs after iteration.
"""Record logs after testing iteration.
Args:
runner (Runner): The runner of the training process.
batch_idx (int): The index of the current batch in the train loop.
data_batch (Sequence[Tuple[Any, BaseDataElement]], optional):
Data from dataloader. Defaults to None.
runner (Runner): The runner of the testing process.
batch_idx (int): The index of the current batch in the test loop.
data_batch (Sequence[dict], optional): Data from dataloader.
Defaults to None.
outputs (sequence, optional): Outputs from model. Defaults to None.
"""
if self.every_n_inner_iters(batch_idx, self.interval):
tag, log_str = runner.log_processor.get_log_after_iter(
_, log_str = runner.log_processor.get_log_after_iter(
runner, batch_idx, 'test')
runner.logger.info(log_str)
def after_val_epoch(self, runner) -> None:
"""Record validation logs after validation epoch.
"""Record logs after validation epoch.
Args:
runner (Runner): The runner of the training process.
runner (Runner): The runner of the validation process.
"""
tag, log_str = runner.log_processor.get_log_after_epoch(
runner, len(runner.val_dataloader), 'val')
runner.logger.info(log_str)
# TODO compatible with visualizer.
runner.visualizer.add_scalars(tag, step=runner.iter + 1)
runner.visualizer.add_scalars(
tag, step=runner.iter, file_path=self.json_log_path)
def after_test_epoch(self, runner) -> None:
"""Record testing logs after test epoch.
"""Record logs after testing epoch.
Args:
runner (Runner): The runner of the training process.
runner (Runner): The runner of the testing process.
"""
tag, log_str = runner.log_processor.get_log_after_epoch(
_, log_str = runner.log_processor.get_log_after_epoch(
runner, len(runner.val_dataloader), 'test')
runner.logger.info(log_str)
@ -215,13 +214,14 @@ class LoggerHook(Hook):
"""Copy logs to ``self.out_dir`` if ``self.out_dir is not None``
Args:
runner (Runner): The runner of the training process.
runner (Runner): The runner of the training/testing/validation
process.
"""
# copy or upload logs to self.out_dir
if self.out_dir is None:
return
for filename in scandir(runner.work_dir, self.out_suffix, True):
local_filepath = osp.join(runner.work_dir, filename)
for filename in scandir(runner._log_dir, self.out_suffix, True):
local_filepath = osp.join(runner._log_dir, filename)
out_filepath = self.file_client.join_path(self.out_dir, filename)
with open(local_filepath, 'r') as f:
self.file_client.put_text(f.read(), out_filepath)

View File

@ -181,7 +181,7 @@ class LogProcessor:
"""Format log string after validation or testing epoch.
Args:
runner (Runner): The runner of training phase.
runner (Runner): The runner of validation/testing phase.
batch_idx (int): The index of the current batch in the current
loop.
mode (str): Current mode of runner.
@ -200,10 +200,12 @@ class LogProcessor:
custom_cfg_copy = self._parse_windows_size(runner, batch_idx)
# tag is used to write log information to different backends.
tag = self._collect_scalars(custom_cfg_copy, runner, mode)
# validation log string needs cur epoch/iteration and max
# epochs/iterations. test log string only needs length of test
# dataloader.
cur_iter = self._get_iter(runner, batch_idx)
# By epoch:
# Epoch(val) [10][1000/1000] ...
# Epoch(test) [1000/1000] ...
# By iteration:
# Iteration(val) [1000/1000] ...
# Iteration(test) [1000/1000] ...
if self.by_epoch:
if mode == 'val':
cur_epoch = self._get_epoch(runner, mode)
@ -214,12 +216,9 @@ class LogProcessor:
f'Epoch({mode}) [{dataloader_len}/{dataloader_len}] ')
else:
if mode == 'train':
log_str = (f'Iter({mode}) [{cur_iter}/'
f'{runner.train_loop.max_iters}] ')
else:
log_str = (
f'Iter({mode}) [{dataloader_len}/{dataloader_len}] ')
log_str = (f'Iter({mode}) [{dataloader_len}/{dataloader_len}] ')
# `time` and `data_time` will not be recorded in after epoch log
# message.
log_items = []
for name, val in tag.items():
if name in ('time', 'data_time'):
@ -237,9 +236,9 @@ class LogProcessor:
Args:
custom_cfg (List[dict]): A copy of ``self.custom_cfg`` with int
``window_size``.
runner (Runner): The runner of the training process.
mode (str): 'train' or 'val', which means the prefix attached by
runner.
runner (Runner): The runner of the training/testing/validation
process.
mode (str): Current mode of runner.
Returns:
dict: Statistical values of logs.
@ -253,7 +252,7 @@ class LogProcessor:
# according to mode.
for prefix_key, log_buffer in history_scalars.items():
if prefix_key.startswith(mode):
key = prefix_key.split('/')[-1]
key = prefix_key.partition('/')[-1]
mode_history_scalars[key] = log_buffer
for key in mode_history_scalars:
# Update the latest learning rate and smoothed time logs.
@ -287,25 +286,23 @@ class LogProcessor:
' is False.'
def _check_repeated_log_name():
check_dict = dict()
# The `log_name` of the same data_src should not be repeated.
# If `log_name` is not specified, `data_src` will be overwritten.
# But only allowed to be overwritten once.
check_set = set()
for log_cfg in self.custom_cfg:
assert 'data_src' in log_cfg
data_src = log_cfg['data_src']
log_name = log_cfg.get('log_name', data_src)
check_dict.setdefault(data_src,
dict(log_names=set(), log_counts=0))
check_dict[data_src]['log_names'].add(log_name)
check_dict[data_src]['log_counts'] += 1
assert (len(
check_dict[data_src]
['log_names']) == check_dict[data_src]['log_counts']), (
f'If you want to statistic {data_src} with multiple '
'statistics method, please check `log_name` is unique'
f'and {data_src} will not be overwritten twice. See '
f'more information in the docstring of `LogProcessor`')
assert log_name not in check_set, (
f'Found duplicate {log_name} for {data_src}. Please check'
'your `custom_cfg` for `log_processor`. You should '
f'neither define duplicate `{log_name}` for {data_src} '
f'nor do not define any {log_name} for multiple '
f'{data_src}, See more information in the docstring of '
'LogProcessor')
check_set.add(log_name)
_check_repeated_log_name()
_check_window_size()
@ -314,7 +311,8 @@ class LogProcessor:
"""Parse window_size defined in custom_cfg to int value.
Args:
runner (Runner): The runner of the training process.
runner (Runner): The runner of the training/testing/validation
process.
batch_idx (int): The iteration index of current dataloader.
"""
custom_cfg_copy = copy.deepcopy(self.custom_cfg)
@ -337,7 +335,8 @@ class LogProcessor:
for a given device.
Args:
runner (Runner): The runner of the training process.
runner (Runner): The runner of the training/testing/validation
process.
Returns:
The maximum GPU memory occupied by tensors in megabytes for a given
@ -352,11 +351,12 @@ class LogProcessor:
return int(mem_mb.item())
def _get_iter(self, runner, batch_idx: int = None) -> int:
"""Get current training iteration step.
"""Get current iteration index.
Args:
runner (Runner): The runner of the training process.
batch_idx (int, optional): The interaction index of current
runner (Runner): The runner of the training/testing/validation
process.
batch_idx (int, optional): The iteration index of current
dataloader. Defaults to None.
Returns:
@ -372,8 +372,9 @@ class LogProcessor:
"""Get current epoch according to mode.
Args:
runner (Runner): The runner of the training/validation process.
mode (str): Current mode of runner, "train" or "val".
runner (Runner): The runner of the training/testing/validation
process.
mode (str): Current mode of runner.
Returns:
int: The current epoch.
@ -395,7 +396,7 @@ class LogProcessor:
Args:
runner (Runner): The runner of the training/validation/testing
process.
mode (str): Current mode of runner, "train", "val" or test.
mode (str): Current mode of runner.
Returns:
BaseLoop: Current loop of runner.

View File

@ -64,7 +64,9 @@ class Runner:
Args:
model (:obj:`torch.nn.Module` or dict): The model to be run. It can be
a dict used for build a model.
work_dir (str): The working directory to save checkpoints and logs.
work_dir (str): The working directory to save checkpoints. The logs
will be saved in the subdirectory of `work_dir` named
:attr:`timestamp`.
train_dataloader (Dataloader or dict, optional): A dataloader object or
a dict to build a dataloader. If ``None`` is given, it means
skipping training steps. Defaults to None.
@ -637,7 +639,7 @@ class Runner:
MMLogger: A MMLogger object build from ``logger``.
"""
if log_file is None:
log_file = osp.join(self._log_dir, f'{self._experiment_name}.log')
log_file = osp.join(self._log_dir, f'{self.timestamp}.log')
log_cfg = dict(log_level=log_level, log_file=log_file, **kwargs)
log_cfg.setdefault('name', self._experiment_name)

View File

@ -15,7 +15,7 @@ class TestLoggerHook:
assert logger_hook.interval == 10
assert logger_hook.ignore_last
assert logger_hook.interval_exp_name == 1000
assert logger_hook.out_suffix == ('.log.json', '.log', '.py')
assert logger_hook.out_suffix == ('.json', '.log', '.py', 'yaml')
assert logger_hook.keep_local
assert logger_hook.file_client_args is None
assert isinstance(logger_hook.file_client.client, HardDiskBackend)
@ -29,36 +29,42 @@ class TestLoggerHook:
def test_before_run(self):
runner = MagicMock()
runner.iter = 10
runner.timestamp = 'timestamp'
runner.timestamp = '20220429'
runner._log_dir = f'work_dir/{runner.timestamp}'
runner.work_dir = 'work_dir'
runner.logger = MagicMock()
logger_hook = LoggerHook(out_dir='out_dir')
logger_hook.before_run(runner)
assert logger_hook.out_dir == osp.join('out_dir', 'work_dir')
assert logger_hook.json_log_path == osp.join('work_dir',
'timestamp.log.json')
assert logger_hook.json_log_path == f'{runner.timestamp}.json'
def test_after_run(self, tmp_path):
# Test
timestamp = '20220429'
out_dir = tmp_path / 'out_dir'
out_dir.mkdir()
work_dir = tmp_path / 'work_dir'
work_dir.mkdir()
work_dir_json = work_dir / 'tmp.log.json'
log_dir = work_dir / timestamp
log_dir.mkdir()
log_dir_json = log_dir / 'tmp.log.json'
runner = MagicMock()
runner.work_dir = work_dir
runner._log_dir = str(log_dir)
runner.timestamp = timestamp
runner.work_dir = str(work_dir)
# Test without out_dir.
logger_hook = LoggerHook()
logger_hook.after_run(runner)
# Test with out_dir and make sure json file has been moved to out_dir.
json_f = open(work_dir_json, 'w')
json_f = open(log_dir_json, 'w')
json_f.close()
logger_hook = LoggerHook(out_dir=str(tmp_path), keep_local=False)
logger_hook = LoggerHook(out_dir=str(out_dir), keep_local=False)
logger_hook.out_dir = str(out_dir)
logger_hook.before_run(runner)
logger_hook.after_run(runner)
# Verify that the file has been moved to `out_dir`.
assert not osp.exists(str(work_dir_json))
assert osp.exists(str(out_dir / 'tmp.log.json'))
assert not osp.exists(str(log_dir_json))
assert osp.exists(str(out_dir / 'work_dir' / 'tmp.log.json'))
def test_after_train_iter(self):
# Test LoggerHook by iter.
@ -89,7 +95,7 @@ class TestLoggerHook:
runner.log_processor.get_log_after_iter = MagicMock(
return_value=(dict(), 'log_str'))
logger_hook = LoggerHook(ignore_last=False)
runner.train_dataloader = [0] * 5
runner.train_loop.dataloader = [0] * 5
logger_hook.after_train_iter(runner, batch_idx=4)
runner.log_processor.get_log_after_iter.assert_called()