From 16589ce386ebe5a624e721446f2bb1b91216ff98 Mon Sep 17 00:00:00 2001
From: BayMax_BHL <34785944+BayMaxBHL@users.noreply.github.com>
Date: Tue, 27 Dec 2022 18:58:05 +0800
Subject: [PATCH] [Feature] Add ProfilerHook (#768)

* [Feature] Add profiler hook functionality

* Apply suggestions from code review

* Update mmengine/hooks/profiler_hook.py

Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com>
---
 docs/en/api/hooks.rst                  |   1 +
 docs/zh_cn/api/hooks.rst               |   1 +
 mmengine/hooks/__init__.py             |   3 +-
 mmengine/hooks/profiler_hook.py        | 232 +++++++++++++++++++++++++
 tests/test_hooks/test_profiler_hook.py | 204 ++++++++++++++++++++++
 5 files changed, 440 insertions(+), 1 deletion(-)
 create mode 100644 mmengine/hooks/profiler_hook.py
 create mode 100644 tests/test_hooks/test_profiler_hook.py

diff --git a/docs/en/api/hooks.rst b/docs/en/api/hooks.rst
index c061246b..6a7bde36 100644
--- a/docs/en/api/hooks.rst
+++ b/docs/en/api/hooks.rst
@@ -22,3 +22,4 @@ mmengine.hooks
    IterTimerHook
    SyncBuffersHook
    EmptyCacheHook
+   ProfilerHook
diff --git a/docs/zh_cn/api/hooks.rst b/docs/zh_cn/api/hooks.rst
index c061246b..6a7bde36 100644
--- a/docs/zh_cn/api/hooks.rst
+++ b/docs/zh_cn/api/hooks.rst
@@ -22,3 +22,4 @@ mmengine.hooks
    IterTimerHook
    SyncBuffersHook
    EmptyCacheHook
+   ProfilerHook
diff --git a/mmengine/hooks/__init__.py b/mmengine/hooks/__init__.py
index fe326332..d44bc0f5 100644
--- a/mmengine/hooks/__init__.py
+++ b/mmengine/hooks/__init__.py
@@ -7,6 +7,7 @@ from .iter_timer_hook import IterTimerHook
 from .logger_hook import LoggerHook
 from .naive_visualization_hook import NaiveVisualizationHook
 from .param_scheduler_hook import ParamSchedulerHook
+from .profiler_hook import ProfilerHook
 from .runtime_info_hook import RuntimeInfoHook
 from .sampler_seed_hook import DistSamplerSeedHook
 from .sync_buffer_hook import SyncBuffersHook
@@ -14,5 +15,5 @@ from .sync_buffer_hook import SyncBuffersHook
 __all__ = [
     'Hook', 'IterTimerHook', 'DistSamplerSeedHook', 'ParamSchedulerHook',
     'SyncBuffersHook', 'EmptyCacheHook', 'CheckpointHook', 'LoggerHook',
-    'NaiveVisualizationHook', 'EMAHook', 'RuntimeInfoHook'
+    'NaiveVisualizationHook', 'EMAHook', 'RuntimeInfoHook', 'ProfilerHook'
 ]
diff --git a/mmengine/hooks/profiler_hook.py b/mmengine/hooks/profiler_hook.py
new file mode 100644
index 00000000..a585e84d
--- /dev/null
+++ b/mmengine/hooks/profiler_hook.py
@@ -0,0 +1,232 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import warnings
+from typing import Callable, Optional, Union
+
+import torch
+
+from mmengine.dist import master_only
+from mmengine.hooks import Hook
+from mmengine.registry import HOOKS
+
+
+def check_kineto() -> bool:  # noqa
+    kineto_exist = False
+    try:
+        if torch.autograd.kineto_available():
+            kineto_exist = True
+    except AttributeError:
+        warnings.warn('Kineto is not available in this PyTorch build')
+    return kineto_exist
+
+
+@HOOKS.register_module()
+class ProfilerHook(Hook):
+    """A hook to analyze performance during training and inference.
+
+    PyTorch Profiler is a tool that allows the collection of performance
+    metrics during training. More details on Profiler can be found at the
+    `official docs <https://pytorch.org/docs/stable/profiler.html>`_.
+
+    Args:
+        by_epoch (bool): Profile performance by epoch or by iteration.
+            Defaults to True.
+        profile_times (int): The number of periods (epochs/iters) recorded
+            by the profiler. Defaults to 1. For example, ``profile_times=10``
+            and ``by_epoch=False`` indicate that the first 10 iterations
+            are recorded.
+        activity_with_cpu (bool): Whether to include CPU activities in the
+            analysis. Defaults to True.
+        activity_with_cuda (bool): Whether to include CUDA activities in the
+            analysis. Defaults to False.
+        schedule (dict, optional): Keyword arguments passed to
+            `torch.profiler.schedule
+            <https://pytorch.org/docs/stable/profiler.html#torch.profiler.schedule>`_.
+            Defaults to None, which means profiling without a preset
+            schedule.
+        on_trace_ready (callable, dict, optional): Either a handler or a
+            dict for building a handler. Defaults to None, which means
+            profiling without an ``on_trace_ready`` handler. A callable
+            needs to accept a ``torch.autograd.profiler.profile`` object as
+            its single argument. Two officially recommended ways are
+            provided, namely terminal display (``log_trace``) and
+            tensorboard display (``tb_trace``). The terminal display content
+            can be adjusted through ``EventList.table()`` from
+            ``torch.autograd.profiler_util``. If using tensorboard, traces
+            are saved to ``{work_dir}/tf_tracing_logs`` by default.
+        record_shapes (bool): Save information about operators' input
+            shapes. Defaults to False.
+        profile_memory (bool): Track tensor memory allocation/deallocation.
+            Defaults to False.
+        with_stack (bool): Record source information (file and line number)
+            for the ops. Defaults to False.
+        with_flops (bool): Use a formula to estimate the FLOPs of specific
+            operators (matrix multiplication and 2D convolution).
+            Defaults to False.
+        json_trace_path (str, optional): Exports the collected trace in
+            Chrome JSON format. Chrome can open the JSON file via
+            ``chrome://tracing``. Defaults to None, which means no JSON
+            file is stored.
+
+    Examples:
+        >>> # tensorboard trace
+        >>> trace_config = dict(type='tb_trace')
+        >>> profiler_hook_cfg = dict(on_trace_ready=trace_config)
+    """
+    priority = 'VERY_LOW'
+
+    def __init__(self,
+                 *,
+                 by_epoch: bool = True,
+                 profile_times: int = 1,
+                 activity_with_cpu: bool = True,
+                 activity_with_cuda: bool = False,
+                 schedule: Optional[dict] = None,
+                 on_trace_ready: Union[Callable, dict, None] = None,
+                 record_shapes: bool = False,
+                 profile_memory: bool = False,
+                 with_stack: bool = False,
+                 with_flops: bool = False,
+                 json_trace_path: Optional[str] = None) -> None:
+
+        try:
+            from torch import profiler
+        except ImportError:
+            raise ImportError('please upgrade torch to 1.8.1 or above')
+        if not check_kineto():
+            raise ImportError('Due to Kineto support issues, please '
+                              'upgrade PyTorch to 1.8.1 or above '
+                              '(Windows users: 1.9.1 or above)')
+
+        assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean.'
+        self.by_epoch = by_epoch
+
+        if profile_times < 1:
+            raise ValueError('profile_times should be greater than 0, '
+                             f'but got {profile_times}')
+        if by_epoch and profile_times > 1:
+            raise ValueError(
+                f'Profiler will profile the first {profile_times} epochs.\n'
+                'Since profiler will slow down the training, it is '
+                'recommended to train 1 epoch with ProfilerHook and adjust '
+                'your setting according to the profiler summary.\n'
+                'During normal training (epoch > 1), '
+                'you may disable the ProfilerHook.')
+        self.profile_times = profile_times
+
+        assert isinstance(activity_with_cpu, bool), \
+            '``activity_with_cpu`` should be a boolean.'
+        assert isinstance(activity_with_cuda, bool), \
+            '``activity_with_cuda`` should be a boolean.'
+        self.activities = []
+        if activity_with_cpu:
+            self.activities.append(profiler.ProfilerActivity.CPU)
+        if activity_with_cuda:
+            self.activities.append(profiler.ProfilerActivity.CUDA)
+
+        if schedule is not None:
+            assert isinstance(schedule, dict), '``schedule`` should be a dict.'
+            self.schedule = profiler.schedule(**schedule)
+        else:
+            self.schedule = None
+
+        self.on_trace_ready = on_trace_ready
+        self.record_shapes = record_shapes
+        self.profile_memory = profile_memory
+        self.with_stack = with_stack
+        self.with_flops = with_flops
+
+        self.json_trace_path = json_trace_path
+
+    @master_only
+    def before_run(self, runner):
+        """Initialize the profiler.
+
+        The ``runner`` argument is used to further validate
+        ``profile_times``.
+ """ + max_times = runner.max_epochs if self.by_epoch else runner.max_iters + if max_times < self.profile_times: + raise ValueError( + f'``profile_times`` should not be greater than {max_times}') + + on_trace_ready = self._parse_trace_config(runner) + + self.profiler = torch.profiler.profile( # noqa + activities=self.activities, + schedule=self.schedule, + on_trace_ready=on_trace_ready, + record_shapes=self.record_shapes, + profile_memory=self.profile_memory, + with_stack=self.with_stack, + with_flops=self.with_flops) + + self.profiler.__enter__() + runner.logger.info('profiler is profiling...') + + def _parse_trace_config(self, runner): + """Used to parse the parameter 'on_trace_ready'.""" + if self.on_trace_ready is None: + _on_trace_ready = None + elif callable(self.on_trace_ready): + _on_trace_ready = self.on_trace_ready + elif isinstance(self.on_trace_ready, dict): + trace_cfg = self.on_trace_ready.copy() + trace_type = trace_cfg.pop('type') + + # Build a log printing handle + if trace_type == 'log_trace': + + def _log_handler(_profile): + print(_profile.key_averages().table(**trace_cfg)) + + _on_trace_ready = _log_handler + + elif trace_type == 'tb_trace': # tensorboard_trace handler + try: + import torch_tb_profiler # noqa: F401 + except ImportError: + raise ImportError( + 'please run ``pip install torch-tb-profiler``') + + if 'dir_name' not in trace_cfg: + trace_cfg['dir_name'] = osp.join(runner.log_dir, + 'tf_tracing_logs') + elif not osp.isabs(trace_cfg['dir_name']): + trace_cfg['dir_name'] = osp.join(runner.log_dir, + trace_cfg['dir_name']) + runner.logger.info('trace_files of ProfilerHook will be ' + f'saved to {trace_cfg["dir_name"]}.') + + if self.json_trace_path is not None: + runner.logger.warn( + 'When using tensorboard_trace, it is recommended to ' + 'save json files by setting ``worker_name`` instead of' + ' setting ``json_trace_path``') + _on_trace_ready = torch.profiler.tensorboard_trace_handler( + **trace_cfg) + else: + raise ValueError('trace_type should be "log_trace" or ' + f'"tb_trace", but got {trace_type}') + else: + raise ValueError( + '``on_trace_ready`` should be a handler, or dict, or None, ' + f'but got {self.on_trace_ready}') + return _on_trace_ready + + @master_only + def after_train_epoch(self, runner): + """Determine if the content is exported.""" + if self.by_epoch and runner.epoch == self.profile_times - 1: + self._export_chrome_trace(runner) + + @master_only + def after_train_iter(self, runner, batch_idx, data_batch, outputs): + """Update the content according to the schedule, and determine if the + content is exported.""" + if self.schedule is None: + self.profiler.step() + if not self.by_epoch and runner.iter == self.profile_times - 1: + self._export_chrome_trace(runner) + + def _export_chrome_trace(self, runner): + """Exporting content.""" + runner.logger.info('profiler may take a few minutes...') + self.profiler.__exit__(None, None, None) + if self.json_trace_path is not None: + self.profiler.export_chrome_trace(self.json_trace_path) diff --git a/tests/test_hooks/test_profiler_hook.py b/tests/test_hooks/test_profiler_hook.py new file mode 100644 index 00000000..90af2300 --- /dev/null +++ b/tests/test_hooks/test_profiler_hook.py @@ -0,0 +1,204 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+
+import os.path as ops
+import unittest
+from unittest.mock import MagicMock
+
+import torch
+
+import mmengine.hooks
+from mmengine.hooks import ProfilerHook
+from mmengine.logging import MMLogger
+from mmengine.testing import RunnerTestCase
+from mmengine.utils import is_installed
+
+
+@unittest.skipIf(
+    not mmengine.hooks.profiler_hook.check_kineto(),
+    reason='Due to Kineto support issues, please upgrade PyTorch to '
+    '1.8.1 or above (Windows users: 1.9.1 or above)')
+class TestProfilerHook(RunnerTestCase):
+
+    def test_init(self):
+        # Test the profile_times argument
+        ProfilerHook(by_epoch=False, profile_times=1)
+        with self.assertRaises(ValueError):
+            ProfilerHook(profile_times=0)
+        with self.assertRaises(ValueError):
+            ProfilerHook(by_epoch=True, profile_times=2)
+
+        # Test the schedule argument
+        ProfilerHook(schedule=dict(wait=1, warmup=1, active=3, repeat=1))
+        with self.assertRaises(TypeError):
+            ProfilerHook(schedule=dict())
+
+    def test_parse_trace_config(self):
+        # Test the on_trace_ready argument
+        runner = MagicMock()
+        hook = ProfilerHook(on_trace_ready=None)
+
+        hook.on_trace_ready = None
+        hook._parse_trace_config(runner)
+
+        def deal_profile(_profile):
+            pass
+
+        hook.on_trace_ready = deal_profile
+        hook._parse_trace_config(runner)
+
+        with self.assertRaises(ValueError):
+            hook.on_trace_ready = dict(type='unknown')
+            hook._parse_trace_config(runner)
+
+        hook.on_trace_ready = dict(
+            type='log_trace', sort_by='self_cpu_time_total', row_limit=10)
+        hook._parse_trace_config(runner)
+
+    @unittest.skipIf(
+        not is_installed('torch-tb-profiler'),
+        reason='requires torch-tb-profiler')
+    def test_parse_trace_config_tensorboard(self):
+        # Test the on_trace_ready argument
+        runner = MagicMock()
+        runner.log_dir = self.temp_dir.name
+        runner.logger = MMLogger.get_instance('test_profiler')
+        hook = ProfilerHook(on_trace_ready=None)
+
+        hook.on_trace_ready = dict(type='tb_trace')
+        hook._parse_trace_config(runner)
+
+        hook.on_trace_ready['dir_name'] = 'tb'
+        hook._parse_trace_config(runner)
+
+        hook.on_trace_ready['dir_name'] = ops.join(self.temp_dir.name, 'tb')
+        hook._parse_trace_config(runner)
+
+        # with self.assertWarns(DeprecationWarning):
+        hook = ProfilerHook(
+            on_trace_ready=dict(type='tb_trace'),
+            json_trace_path=ops.join(self.temp_dir.name, 'demo.json'))
+        hook._parse_trace_config(runner)
+
+        self.epoch_based_cfg['custom_hooks'] = [
+            dict(
+                type='ProfilerHook',
+                on_trace_ready=dict(
+                    type='tb_trace',
+                    dir_name=ops.join(self.temp_dir.name, 'tb')))
+        ]
+        runner = self.build_runner(self.epoch_based_cfg)
+        runner.train()
+
+    def test_before_run(self):
+        runner = MagicMock()
+        runner.max_epochs = 1000
+        runner.max_iters = 10000
+        runner.logger = MMLogger.get_instance('test_profiler')
+
+        hook = ProfilerHook()
+        hook.before_run(runner)
+        hook.profiler.__exit__(None, None, None)
+
+        with self.assertRaises(ValueError):
+            hook = ProfilerHook(by_epoch=False, profile_times=10001)
+            hook.before_run(runner)
+            hook.profiler.__exit__(None, None, None)
+
+        with self.assertRaises(ValueError):
+            hook = ProfilerHook(by_epoch=True, profile_times=1001)
+            hook.before_run(runner)
+            hook.profiler.__exit__(None, None, None)
+
+    def test_export_chrome_trace(self):
+        runner = MagicMock()
+        runner.max_epochs = 1000
+        runner.logger = MMLogger.get_instance('test_profiler')
+
+        hook = ProfilerHook(
+            json_trace_path=ops.join(self.temp_dir.name, 'demo.json'))
+        hook.before_run(runner)
+        hook._export_chrome_trace(runner)
+
+    def test_after_train_epoch(self):
+        runner = MagicMock()
+        runner.max_epochs = 1000
+        runner.logger = MMLogger.get_instance('test_profiler')
+
+        runner.epoch = 0
+
+        hook = ProfilerHook()
+        hook.before_run(runner)
+        hook.profiler.__exit__(None, None, None)
+
+        hook.profiler = MagicMock()
+        hook.after_train_epoch(runner)
+        hook.profiler.__exit__.assert_called_once()
+
+    def test_after_train_iter(self):
+        runner = MagicMock()
+        runner.max_iters = 10000
+        runner.logger = MMLogger.get_instance('test_profiler')
+
+        runner.iter = 9
+
+        hook = ProfilerHook(by_epoch=False, profile_times=10, schedule=None)
+        hook.before_run(runner)
+        hook.profiler.__exit__(None, None, None)
+
+        hook.profiler = MagicMock()
+        hook.after_train_iter(runner, 1, 1, 1)
+        hook.profiler.__exit__.assert_called_once()
+        hook.profiler.step.assert_called_once()
+
+        hook = ProfilerHook(
+            by_epoch=False,
+            schedule=dict(wait=1, warmup=1, active=3, repeat=1))
+        hook.before_run(runner)
+        hook.profiler.__exit__(None, None, None)
+
+        hook.profiler = MagicMock()
+        hook.after_train_iter(runner, 1, 1, 1)
+        hook.profiler.step.assert_not_called()
+
+    def test_with_runner(self):
+        self.epoch_based_cfg['custom_hooks'] = [
+            dict(
+                type='ProfilerHook',
+                activity_with_cpu=False,
+                activity_with_cuda=False)
+        ]
+        runner = self.build_runner(self.epoch_based_cfg)
+        runner.train()
+
+        json_path = ops.join(self.temp_dir.name, 'demo.json')
+        self.epoch_based_cfg['custom_hooks'] = [
+            dict(type='ProfilerHook', json_trace_path=json_path)
+        ]
+        runner = self.build_runner(self.epoch_based_cfg)
+        runner.train()
+        self.assertTrue(
+            ops.exists(json_path), 'json trace file was not generated!')
+
+        self.epoch_based_cfg['custom_hooks'] = [
+            dict(
+                type='ProfilerHook',
+                on_trace_ready=dict(
+                    type='log_trace',
+                    sort_by='self_cpu_time_total',
+                    row_limit=10))
+        ]
+        runner = self.build_runner(self.epoch_based_cfg)
+        runner.train()
+
+        with self.assertRaises(ValueError):
+            self.epoch_based_cfg['custom_hooks'] = [
+                dict(type='ProfilerHook', on_trace_ready=0)
+            ]
+            runner = self.build_runner(self.epoch_based_cfg)
+            runner.train()
+
+        if torch.cuda.is_available():
+            self.epoch_based_cfg['custom_hooks'] = [
+                dict(type='ProfilerHook', activity_with_cuda=True)
+            ]
+            runner = self.build_runner(self.epoch_based_cfg)
+            runner.train()
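
Reviewer note: a minimal usage sketch of the new hook, mirroring the dict
forms exercised by the tests above. The surrounding config fragment and the
'tb_logs' directory name are illustrative assumptions, not part of this patch.

    # Hypothetical excerpt from a training config; values are illustrative.
    custom_hooks = [
        # Print an aggregated operator table to the terminal; the extra
        # keys are forwarded to EventList.table().
        dict(
            type='ProfilerHook',
            by_epoch=True,
            profile_times=1,  # profile only the first epoch
            on_trace_ready=dict(
                type='log_trace',
                sort_by='self_cpu_time_total',
                row_limit=10)),
    ]

    # Alternatively, write traces for TensorBoard (requires
    # torch-tb-profiler); a relative dir_name is resolved against
    # runner.log_dir, an absolute one is used as-is.
    # custom_hooks = [
    #     dict(
    #         type='ProfilerHook',
    #         on_trace_ready=dict(type='tb_trace', dir_name='tb_logs')),
    # ]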
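
Reviewer note: the 'log_trace' branch of _parse_trace_config is roughly
equivalent to the standalone torch.profiler usage sketched below, assuming a
CPU-only run. Without a schedule, on_trace_ready fires when profiling stops,
which is also what _export_chrome_trace relies on when it calls
self.profiler.__exit__.

    import torch
    from torch import profiler


    def log_handler(prof):
        # Aggregate per-operator statistics and print them, as the hook's
        # _log_handler does with the remaining trace_cfg kwargs.
        print(prof.key_averages().table(
            sort_by='self_cpu_time_total', row_limit=10))


    # Profiling stops at the end of the with-block, triggering log_handler.
    with profiler.profile(
            activities=[profiler.ProfilerActivity.CPU],
            on_trace_ready=log_handler):
        x = torch.randn(32, 32)
        for _ in range(5):
            x = x @ x  # a little work for the profiler to record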