Enable profiler, for both static and dynamic training. (#729)

* Enable profiler for static training.

* Polish the initialize of ProfilerOptions.

* Enable profiler for dynamic mode.
pull/739/head
Yiqun Liu 2021-05-21 10:31:43 +08:00 committed by GitHub
parent 15168d25f3
commit 0d832a2539
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 138 additions and 5 deletions

View File

@ -0,0 +1,111 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import paddle
# A global variable to record the number of calling times for profiler
# functions. It is used to specify the tracing range of training steps.
_profiler_step_id = 0
# A global variable to avoid parsing from string every time.
_profiler_options = None
class ProfilerOptions(object):
'''
Use a string to initialize a ProfilerOptions.
The string should be in the format: "key1=value1;key2=value;key3=value3".
For example:
"profile_path=model.profile"
"batch_range=[50, 60]; profile_path=model.profile"
"batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"
ProfilerOptions supports following key-value pair:
batch_range - a integer list, e.g. [100, 110].
state - a string, the optional values are 'CPU', 'GPU' or 'All'.
sorted_key - a string, the optional values are 'calls', 'total',
'max', 'min' or 'ave.
tracer_option - a string, the optional values are 'Default', 'OpDetail',
'AllOpDetail'.
profile_path - a string, the path to save the serialized profile data,
which can be used to generate a timeline.
exit_on_finished - a boolean.
'''
def __init__(self, options_str):
assert isinstance(options_str, str)
self._options = {
'batch_range': [10, 20],
'state': 'All',
'sorted_key': 'total',
'tracer_option': 'Default',
'profile_path': '/tmp/profile',
'exit_on_finished': True
}
self._parse_from_string(options_str)
def _parse_from_string(self, options_str):
for kv in options_str.replace(' ', '').split(';'):
key, value = kv.split('=')
if key == 'batch_range':
value_list = value.replace('[', '').replace(']', '').split(',')
value_list = list(map(int, value_list))
if len(value_list) >= 2 and value_list[0] >= 0 and value_list[
1] > value_list[0]:
self._options[key] = value_list
elif key == 'exit_on_finished':
self._options[key] = value.lower() in ("yes", "true", "t", "1")
elif key in [
'state', 'sorted_key', 'tracer_option', 'profile_path'
]:
self._options[key] = value
def __getitem__(self, name):
if self._options.get(name, None) is None:
raise ValueError(
"ProfilerOptions does not have an option named %s." % name)
return self._options[name]
def add_profiler_step(options_str=None):
'''
Enable the operator-level timing using PaddlePaddle's profiler.
The profiler uses a independent variable to count the profiler steps.
One call of this function is treated as a profiler step.
Args:
profiler_options - a string to initialize the ProfilerOptions.
Default is None, and the profiler is disabled.
'''
if options_str is None:
return
global _profiler_step_id
global _profiler_options
if _profiler_options is None:
_profiler_options = ProfilerOptions(options_str)
if _profiler_step_id == _profiler_options['batch_range'][0]:
paddle.utils.profiler.start_profiler(
_profiler_options['state'], _profiler_options['tracer_option'])
elif _profiler_step_id == _profiler_options['batch_range'][1]:
paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'],
_profiler_options['profile_path'])
if _profiler_options['exit_on_finished']:
sys.exit(0)
_profiler_step_id += 1

View File

@ -36,6 +36,7 @@ from ppcls.modeling.loss import JSDivLoss
from ppcls.modeling.loss import GoogLeNetLoss
from ppcls.utils.misc import AverageMeter
from ppcls.utils import logger
from ppcls.utils import profiler
from ppcls.utils import multi_hot_encode
from ppcls.utils import hamming_distance
from ppcls.utils import accuracy_score
@ -304,7 +305,8 @@ def run(dataloader,
lr_scheduler=None,
epoch=0,
mode='train',
vdl_writer=None):
vdl_writer=None,
profiler_options=None):
"""
Feed data to the model and fetch the measures and loss
@ -359,6 +361,8 @@ def run(dataloader,
metric_list["batch_time"].reset()
metric_list["reader_time"].reset()
profiler.add_profiler_step(profiler_options)
metric_list['reader_time'].update(time.time() - tic)
batch_size = len(batch[0])
feeds = create_feeds(batch, use_mix, classes_num, multilabel)

View File

@ -33,7 +33,7 @@ from ppcls.modeling.loss import MixCELoss
from ppcls.modeling.loss import JSDivLoss
from ppcls.modeling.loss import GoogLeNetLoss
from ppcls.utils.misc import AverageMeter
from ppcls.utils import logger
from ppcls.utils import logger, profiler
from paddle.distributed import fleet
from paddle.distributed.fleet import DistributedStrategy
@ -465,7 +465,8 @@ def run(dataloader,
mode='train',
config=None,
vdl_writer=None,
lr_scheduler=None):
lr_scheduler=None,
profiler_options=None):
"""
Feed data to the model and fetch the measures and loss
@ -525,6 +526,8 @@ def run(dataloader,
metric_list['reader_time'].update(time.time() - tic)
profiler.add_profiler_step(profiler_options)
if use_dali:
batch_size = batch[0]["feed_image"].shape()[0]
feed_dict = batch[0]

View File

@ -48,6 +48,13 @@ def parse_args():
type=str,
default=None,
help='VisualDL logging directory for image.')
parser.add_argument(
'-p',
'--profiler_options',
type=str,
default=None,
help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
)
parser.add_argument(
'-o',
'--override',
@ -157,7 +164,7 @@ def main(args):
# 1. train with train dataset
program.run(train_dataloader, exe, compiled_train_prog, train_feeds,
train_fetchs, epoch_id, 'train', config, vdl_writer,
lr_scheduler)
lr_scheduler, args.profiler_options)
if paddle.distributed.get_rank() == 0:
# 2. validate with validate dataset
if config.validate and epoch_id % config.valid_interval == 0:

View File

@ -40,6 +40,13 @@ def parse_args():
type=str,
default='configs/ResNet/ResNet50.yaml',
help='config file path')
parser.add_argument(
'-p',
'--profiler_options',
type=str,
default=None,
help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
)
parser.add_argument(
'-o',
'--override',
@ -116,7 +123,8 @@ def main(args):
net.train()
# 1. train with train dataset
program.run(train_dataloader, config, dp_net, optimizer,
lr_scheduler, epoch_id, 'train', vdl_writer)
lr_scheduler, epoch_id, 'train', vdl_writer,
args.profiler_options)
# 2. validate with validate dataset
if config.validate and epoch_id % config.valid_interval == 0: