[Feature] Add NPUProfilerHook to profile performance in Ascend device (#925)

* Feature NPUProfilerHook

* Feature NPUProfilerHook
pull/946/head
luomaoling 2023-02-21 17:20:40 +08:00 committed by GitHub
parent e16dacf7e3
commit 3dc2be05d5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 179 additions and 3 deletions

View File

@ -23,4 +23,5 @@ mmengine.hooks
SyncBuffersHook
EmptyCacheHook
ProfilerHook
NPUProfilerHook
PrepareTTAHook

View File

@ -23,4 +23,5 @@ mmengine.hooks
SyncBuffersHook
EmptyCacheHook
ProfilerHook
NPUProfilerHook
PrepareTTAHook

View File

@ -7,7 +7,7 @@ from .iter_timer_hook import IterTimerHook
from .logger_hook import LoggerHook
from .naive_visualization_hook import NaiveVisualizationHook
from .param_scheduler_hook import ParamSchedulerHook
from .profiler_hook import ProfilerHook
from .profiler_hook import NPUProfilerHook, ProfilerHook
from .runtime_info_hook import RuntimeInfoHook
from .sampler_seed_hook import DistSamplerSeedHook
from .sync_buffer_hook import SyncBuffersHook
@ -17,5 +17,5 @@ __all__ = [
'Hook', 'IterTimerHook', 'DistSamplerSeedHook', 'ParamSchedulerHook',
'SyncBuffersHook', 'EmptyCacheHook', 'CheckpointHook', 'LoggerHook',
'NaiveVisualizationHook', 'EMAHook', 'RuntimeInfoHook', 'ProfilerHook',
'PrepareTTAHook'
'NPUProfilerHook', 'PrepareTTAHook'
]

View File

@ -1,5 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os
import os.path as osp
import sys
import warnings
from typing import Callable, Optional, Union
@ -230,3 +232,103 @@ class ProfilerHook(Hook):
self.profiler.__exit__(None, None, None)
if self.json_trace_path is not None:
self.profiler.export_chrome_trace(self.json_trace_path)
@HOOKS.register_module()
class NPUProfilerHook(Hook):
    """NPUProfiler to analyze performance during training.

    NPU Profiling is used to count the device execution time of all
    operators. The ``torch_npu.npu.profile`` interface is used to complete
    the profiling data collection at each stage of the project, and the data
    is analyzed by the msprof tool and the data can be dumped to further
    manually analyze the key performance bottlenecks. For more details on
    the ``torch_npu.npu.profile`` interface, please visit
    https://gitee.com/ascend/pytorch/blob/master/torch_npu/npu/profiler.py#profile

    Args:
        begin (int): Number of start iterations for profiling. Defaults to 0.
        end (int): Number of end iterations for profiling. Defaults to 1.
        result_path (str): The path to save the profiling results file.
            Defaults to 'cann_profiling'.
        exit_after_profiling (bool): Whether to exit the program after
            profiling. Defaults to True.
        use_e2e_profiler (bool): Turn on E2E profiling, E2E profiling combines
            performance data at the Pytorch level and the NPU level to analyze
            the bottlenecks of model performance end-to-end, and cannot show
            detailed content, and only as an auxiliary analysis.
            Defaults to False.
        ge_profiling_to_std_out (bool): Turn on GE profiling, GE uses to
            collect the profiling data of the host side scheduling of the
            Ascend device. Defaults to False.

    Examples:
        >>> cfg = ...
        >>> custom_hooks = [dict(type='NPUProfilerHook', end=2)]
        >>> cfg.merge_from_dict({'custom_hooks': custom_hooks})
        >>> runner = Runner.from_cfg(cfg)
        >>> runner.train()
    """
    # Run after all other hooks so profiling brackets the real work.
    priority = 'VERY_LOW'

    def __init__(self,
                 *,
                 begin: int = 0,
                 end: int = 1,
                 result_path: str = 'cann_profiling',
                 exit_after_profiling: bool = True,
                 use_e2e_profiler: bool = False,
                 ge_profiling_to_std_out: bool = False):
        try:
            import torch_npu
        except ImportError as e:
            # Chain the original error so the missing-package cause is visible.
            raise ImportError('Failed to import torch_npu module') from e

        if begin >= end:
            # Note the trailing space inside the first fragment: implicit
            # string concatenation does not insert one.
            raise ValueError(
                'The iteration to start profiling should not be greater '
                'than or equal to profile end')

        self.begin = begin
        self.end = end
        self.result_path = result_path
        self.exit_after_profiling = exit_after_profiling

        if ge_profiling_to_std_out:
            os.environ['GE_PROFILING_TO_STD_OUT'] = '1'

        # exist_ok=True already makes this safe if the directory exists, so
        # no separate existence check is needed (and it avoids a race).
        os.makedirs(self.result_path, exist_ok=True)

        self.profiler = torch_npu.npu.profile(
            self.result_path, use_e2e_profiler=use_e2e_profiler)

    @master_only
    def before_run(self, runner):
        """Validate the profiling window against the runner's schedule."""
        if self.end > runner.max_iters:
            raise ValueError(
                'The profiling end iteration should not be greater '
                'than the max iteration')

    @master_only
    def before_train_iter(self, runner, batch_idx, data_batch=None):
        """Enter the NPU profiler context at the configured start iter."""
        if runner.iter == self.begin:
            self.profiler.__enter__()
            runner.logger.info('NPUProfiler starts profiling...')

    @master_only
    def after_train_iter(self,
                         runner,
                         batch_idx,
                         data_batch=None,
                         outputs=None):
        """Stop profiling at the configured end iter and optionally exit."""
        if runner.iter == self.end - 1:
            runner.logger.info('profiler may take a few minutes to'
                               ' save the profiling result.')
            self.profiler.__exit__(None, None, None)
            if self.exit_after_profiling:
                # Profiling runs are usually throwaway; stop training here.
                sys.exit()

View File

@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os
import os.path as ops
import unittest
from unittest.mock import MagicMock
@ -7,7 +8,8 @@ from unittest.mock import MagicMock
import torch
import mmengine.hooks
from mmengine.hooks import ProfilerHook
from mmengine.device import is_npu_available
from mmengine.hooks import NPUProfilerHook, ProfilerHook
from mmengine.logging import MMLogger
from mmengine.testing import RunnerTestCase
from mmengine.utils import is_installed
@ -202,3 +204,73 @@ class TestProfilerHook(RunnerTestCase):
]
runner = self.build_runner(self.epoch_based_cfg)
runner.train()
@unittest.skipIf(
    not is_npu_available(), reason='Ascend PyTorch and npu devices not exist')
class TestNPUProfilerHook(RunnerTestCase):
    """Tests for ``NPUProfilerHook``; only runs on Ascend NPU devices."""

    def test_init(self):
        path = ops.join(self.temp_dir.name, 'test/cann_profiling')
        # A well-formed window constructs without error.
        NPUProfilerHook(result_path=path)
        # begin must be strictly smaller than end.
        with self.assertRaises(ValueError):
            NPUProfilerHook(begin=1, end=0, result_path=path)

    def test_before_run(self):
        path = ops.join(self.temp_dir.name, 'test/cann_profiling')
        mock_runner = MagicMock()
        mock_runner.max_iters = 1
        mock_runner.logger = MMLogger.get_instance('test_npu_profiler')
        # Window fits inside max_iters: no error.
        NPUProfilerHook(result_path=path).before_run(mock_runner)
        # Window extends past max_iters: rejected.
        with self.assertRaises(ValueError):
            NPUProfilerHook(
                begin=0, end=10, result_path=path).before_run(mock_runner)

    def test_after_train_iter(self):
        path = ops.join(self.temp_dir.name, 'test/cann_profiling')
        mock_runner = MagicMock()
        mock_runner.max_iters = 10000
        mock_runner.logger = MMLogger.get_instance('test_npu_profiler')
        mock_runner.iter = 0
        hook = NPUProfilerHook(begin=0, end=10, result_path=path)
        hook.before_run(mock_runner)
        # Replace the real profiler so no device work happens.
        hook.profiler = MagicMock()
        hook.after_train_iter(mock_runner, 1)

    def test_with_runner(self):
        path = ops.join(self.temp_dir.name, 'test/cann_profiling')

        # First run: explicit begin, keep the process alive afterwards.
        hook_cfg = dict(
            type='NPUProfilerHook',
            begin=0,
            result_path=path,
            exit_after_profiling=False)
        self.epoch_based_cfg['custom_hooks'] = [hook_cfg]
        self.build_runner(self.epoch_based_cfg).train()

        # Second run: GE profiling routed to stdout.
        hook_cfg = dict(
            type='NPUProfilerHook',
            result_path=path,
            ge_profiling_to_std_out=True,
            exit_after_profiling=False)
        self.epoch_based_cfg['custom_hooks'] = [hook_cfg]
        self.build_runner(self.epoch_based_cfg).train()

        self.assertTrue(
            ops.exists(path), 'profiler result path is not generated!')
        self.assertTrue(
            os.getenv('GE_PROFILING_TO_STD_OUT', '0') == '1',
            'GE PROFILING failed to start!')