Enable profiler, for both static and dynamic training. (#729)

* Enable profiler for static training. * Polish the initialize of ProfilerOptions. * Enable profiler for dynamic mode.
2025-06-03 21:55:06 +08:00 · 2021-05-21 10:31:43 +08:00 · 2021-05-21 10:31:43 +08:00 · 0d832a2539
commit 0d832a2539
parent 15168d25f3
5 changed files with 138 additions and 5 deletions
--- a/ppcls/utils/profiler.py
+++ b/ppcls/utils/profiler.py
@ -0,0 +1,111 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import paddle
+
+# A global counter of the calls made to add_profiler_step() with profiling
+# enabled. It is compared with batch_range to select the steps to trace.
+_profiler_step_id = 0
+
+# A global variable to avoid parsing from string every time.
+_profiler_options = None
+
+
+class ProfilerOptions(object):
+    '''
+    Use a string to initialize a ProfilerOptions.
+    The string should be in the format: "key1=value1;key2=value;key3=value3".
+    For example:
+      "profile_path=model.profile"
+      "batch_range=[50, 60]; profile_path=model.profile"
+      "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"
+
+    Whitespace around keys and values is ignored. Empty segments (an empty
+    string or a trailing ';') are skipped. Only the first '=' of a segment
+    separates the key from the value, so a value may itself contain '='.
+    Unknown keys are silently ignored.
+
+    ProfilerOptions supports following key-value pair:
+      batch_range      - an integer list, e.g. [100, 110]. It is only
+                         accepted when it holds at least two integers with
+                         0 <= first < second; otherwise the current value
+                         is kept. Default is [10, 20].
+      state            - a string, the optional values are 'CPU', 'GPU' or
+                         'All'. Default is 'All'.
+      sorted_key       - a string, the optional values are 'calls', 'total',
+                         'max', 'min' or 'ave'. Default is 'total'.
+      tracer_option    - a string, the optional values are 'Default',
+                         'OpDetail', 'AllOpDetail'. Default is 'Default'.
+      profile_path     - a string, the path to save the serialized profile
+                         data, which can be used to generate a timeline.
+                         Default is '/tmp/profile'.
+      exit_on_finished - a boolean; "yes", "true", "t" and "1" (in any
+                         case) mean True, anything else means False.
+                         Default is True.
+    '''
+
+    def __init__(self, options_str):
+        '''
+        Set every option to its default, then apply options_str.
+
+        Args:
+          options_str - the "key=value;..." string described on the class.
+
+        Raises:
+          ValueError - if a segment has no '=' or a batch_range item is
+                       not an integer.
+        '''
+        assert isinstance(options_str, str)
+
+        self._options = {
+            'batch_range': [10, 20],
+            'state': 'All',
+            'sorted_key': 'total',
+            'tracer_option': 'Default',
+            'profile_path': '/tmp/profile',
+            'exit_on_finished': True
+        }
+        self._parse_from_string(options_str)
+
+    def _parse_from_string(self, options_str):
+        '''
+        Overwrite the current options with the pairs found in options_str.
+        '''
+        for kv in options_str.split(';'):
+            kv = kv.strip()
+            if not kv:
+                # An empty string or a trailing ';' is not an error.
+                continue
+            # Split on the first '=' only, so values may contain '='.
+            key, sep, value = kv.partition('=')
+            if not sep:
+                raise ValueError(
+                    "Invalid profiler option '%s', expected 'key=value'." %
+                    kv)
+            # Strip instead of deleting every space, so that a profile_path
+            # containing spaces is kept intact.
+            key = key.strip()
+            value = value.strip()
+            if key == 'batch_range':
+                value_list = value.replace('[', '').replace(']', '').split(',')
+                value_list = list(map(int, value_list))
+                # An invalid range is ignored and the current one is kept.
+                if (len(value_list) >= 2 and value_list[0] >= 0 and
+                        value_list[1] > value_list[0]):
+                    self._options[key] = value_list
+            elif key == 'exit_on_finished':
+                self._options[key] = value.lower() in ("yes", "true", "t", "1")
+            elif key in ('state', 'sorted_key', 'tracer_option',
+                         'profile_path'):
+                self._options[key] = value
+
+    def __getitem__(self, name):
+        '''
+        Return the value of option `name`.
+
+        Raises:
+          ValueError - if `name` is not a supported option.
+        '''
+        if name not in self._options:
+            raise ValueError(
+                "ProfilerOptions does not have an option named %s." % name)
+        return self._options[name]
+
+
+def add_profiler_step(options_str=None):
+    '''
+    Count one profiler step and start or stop PaddlePaddle's profiler when
+    the step counter reaches the configured batch range.
+
+    The step counter is a module-level variable that is independent of the
+    training loop; every call made with a non-None options_str counts as
+    one step. The options are parsed on the first such call only and then
+    cached in a module-level variable, so option strings passed on later
+    calls are not parsed again.
+
+    The profiler is started when the counter equals batch_range[0] and
+    stopped when it equals batch_range[1]. After stopping, the process
+    exits through sys.exit(0) if exit_on_finished is set.
+
+    Args:
+      options_str - a string to initialize the ProfilerOptions.
+                    Default is None, and the profiler is disabled.
+    '''
+    global _profiler_step_id
+    global _profiler_options
+
+    # Profiling is disabled: do not parse anything or count the step.
+    if options_str is None:
+        return
+
+    if _profiler_options is None:
+        _profiler_options = ProfilerOptions(options_str)
+
+    opts = _profiler_options
+    current_step = _profiler_step_id
+
+    if current_step == opts['batch_range'][0]:
+        paddle.utils.profiler.start_profiler(opts['state'],
+                                             opts['tracer_option'])
+    elif current_step == opts['batch_range'][1]:
+        paddle.utils.profiler.stop_profiler(opts['sorted_key'],
+                                            opts['profile_path'])
+        if opts['exit_on_finished']:
+            # Exits before the counter below is advanced.
+            sys.exit(0)
+
+    _profiler_step_id = current_step + 1
--- a/tools/program.py
+++ b/tools/program.py
@ -36,6 +36,7 @@ from ppcls.modeling.loss import JSDivLoss
 from ppcls.modeling.loss import GoogLeNetLoss
 from ppcls.utils.misc import AverageMeter
 from ppcls.utils import logger
+from ppcls.utils import profiler
 from ppcls.utils import multi_hot_encode
 from ppcls.utils import hamming_distance
 from ppcls.utils import accuracy_score
@ -304,7 +305,8 @@ def run(dataloader,
        lr_scheduler=None,
        epoch=0,
        mode='train',
-        vdl_writer=None):
+        vdl_writer=None,
+        profiler_options=None):
    """
    Feed data to the model and fetch the measures and loss

@ -359,6 +361,8 @@ def run(dataloader,
            metric_list["batch_time"].reset()
            metric_list["reader_time"].reset()

+        profiler.add_profiler_step(profiler_options)
+
        metric_list['reader_time'].update(time.time() - tic)
        batch_size = len(batch[0])
        feeds = create_feeds(batch, use_mix, classes_num, multilabel)
--- a/tools/static/program.py
+++ b/tools/static/program.py
@ -33,7 +33,7 @@ from ppcls.modeling.loss import MixCELoss
 from ppcls.modeling.loss import JSDivLoss
 from ppcls.modeling.loss import GoogLeNetLoss
 from ppcls.utils.misc import AverageMeter
-from ppcls.utils import logger
+from ppcls.utils import logger, profiler

 from paddle.distributed import fleet
 from paddle.distributed.fleet import DistributedStrategy
@ -465,7 +465,8 @@ def run(dataloader,
        mode='train',
        config=None,
        vdl_writer=None,
-        lr_scheduler=None):
+        lr_scheduler=None,
+        profiler_options=None):
    """
    Feed data to the model and fetch the measures and loss

@ -525,6 +526,8 @@ def run(dataloader,

        metric_list['reader_time'].update(time.time() - tic)

+        profiler.add_profiler_step(profiler_options)
+
        if use_dali:
            batch_size = batch[0]["feed_image"].shape()[0]
            feed_dict = batch[0]
--- a/tools/static/train.py
+++ b/tools/static/train.py
@ -48,6 +48,13 @@ def parse_args():
        type=str,
        default=None,
        help='VisualDL logging directory for image.')
+    parser.add_argument(
+        '-p',
+        '--profiler_options',
+        type=str,
+        default=None,
+        help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
+    )
    parser.add_argument(
        '-o',
        '--override',
@ -157,7 +164,7 @@ def main(args):
        # 1. train with train dataset
        program.run(train_dataloader, exe, compiled_train_prog, train_feeds,
                    train_fetchs, epoch_id, 'train', config, vdl_writer,
-                    lr_scheduler)
+                    lr_scheduler, args.profiler_options)
        if paddle.distributed.get_rank() == 0:
            # 2. validate with validate dataset
            if config.validate and epoch_id % config.valid_interval == 0:
--- a/tools/train.py
+++ b/tools/train.py
@ -40,6 +40,13 @@ def parse_args():
        type=str,
        default='configs/ResNet/ResNet50.yaml',
        help='config file path')
+    parser.add_argument(
+        '-p',
+        '--profiler_options',
+        type=str,
+        default=None,
+        help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
+    )
    parser.add_argument(
        '-o',
        '--override',
@ -116,7 +123,8 @@ def main(args):
            net.train()
            # 1. train with train dataset
            program.run(train_dataloader, config, dp_net, optimizer,
-                        lr_scheduler, epoch_id, 'train', vdl_writer)
+                        lr_scheduler, epoch_id, 'train', vdl_writer,
+                        args.profiler_options)

            # 2. validate with validate dataset
            if config.validate and epoch_id % config.valid_interval == 0: