Enable profiler, for both static and dynamic training. (#729)
* Enable profiler for static training. * Polish the initialization of ProfilerOptions. * Enable profiler for dynamic mode.
pull/739/head
parent
15168d25f3
commit
0d832a2539
|
@ -0,0 +1,111 @@
|
|||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
import paddle
|
||||
|
||||
# A global variable to record the number of calling times for profiler
# functions. It is used to specify the tracing range of training steps.
_profiler_step_id = 0

# A global variable to avoid parsing from string every time.
_profiler_options = None
|
||||
|
||||
|
||||
class ProfilerOptions(object):
    '''
    Use a string to initialize a ProfilerOptions.
    The string should be in the format: "key1=value1;key2=value2;key3=value3".
    For example:
      "profile_path=model.profile"
      "batch_range=[50, 60]; profile_path=model.profile"
      "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"

    ProfilerOptions supports following key-value pair:
      batch_range      - an integer list, e.g. [100, 110].
      state            - a string, the optional values are 'CPU', 'GPU' or 'All'.
      sorted_key       - a string, the optional values are 'calls', 'total',
                         'max', 'min' or 'ave'.
      tracer_option    - a string, the optional values are 'Default', 'OpDetail',
                         'AllOpDetail'.
      profile_path     - a string, the path to save the serialized profile data,
                         which can be used to generate a timeline.
      exit_on_finished - a boolean.
    '''

    def __init__(self, options_str):
        assert isinstance(options_str, str)

        # Defaults are used for every key absent from options_str.
        self._options = {
            'batch_range': [10, 20],
            'state': 'All',
            'sorted_key': 'total',
            'tracer_option': 'Default',
            'profile_path': '/tmp/profile',
            'exit_on_finished': True
        }
        self._parse_from_string(options_str)

    def _parse_from_string(self, options_str):
        '''Parse "key=value" pairs separated by ';' and override defaults.

        Unknown keys and invalid batch_range values are silently ignored,
        keeping the defaults.
        '''
        # Whitespace is insignificant: strip it before splitting on ';'.
        for kv in options_str.replace(' ', '').split(';'):
            # Tolerate empty segments, e.g. a trailing ';' in the string,
            # which would otherwise raise on unpacking kv.split('=').
            if not kv:
                continue
            key, value = kv.split('=')
            if key == 'batch_range':
                value_list = value.replace('[', '').replace(']', '').split(',')
                value_list = list(map(int, value_list))
                # Only accept a valid tracing range: 0 <= start < end.
                if len(value_list) >= 2 and value_list[0] >= 0 and value_list[
                        1] > value_list[0]:
                    self._options[key] = value_list
            elif key == 'exit_on_finished':
                self._options[key] = value.lower() in ("yes", "true", "t", "1")
            elif key in [
                    'state', 'sorted_key', 'tracer_option', 'profile_path'
            ]:
                self._options[key] = value

    def __getitem__(self, name):
        # Raise a descriptive error for unknown option names instead of
        # returning None.
        if self._options.get(name, None) is None:
            raise ValueError(
                "ProfilerOptions does not have an option named %s." % name)
        return self._options[name]
|
||||
|
||||
|
||||
def add_profiler_step(options_str=None):
    '''
    Enable the operator-level timing using PaddlePaddle's profiler.
    The profiler uses an independent variable to count the profiler steps.
    One call of this function is treated as a profiler step.

    Args:
        options_str - a string to initialize the ProfilerOptions.
                      Default is None, and the profiler is disabled.
    '''
    # Disabled: do nothing (and do not advance the step counter).
    if options_str is None:
        return

    global _profiler_step_id
    global _profiler_options

    # Parse the option string only once; subsequent calls are assumed to
    # pass the same string.
    if _profiler_options is None:
        _profiler_options = ProfilerOptions(options_str)

    # Start tracing at the first step of batch_range, stop (and optionally
    # exit the process) at the last step.
    if _profiler_step_id == _profiler_options['batch_range'][0]:
        paddle.utils.profiler.start_profiler(
            _profiler_options['state'], _profiler_options['tracer_option'])
    elif _profiler_step_id == _profiler_options['batch_range'][1]:
        paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'],
                                            _profiler_options['profile_path'])
        if _profiler_options['exit_on_finished']:
            sys.exit(0)

    _profiler_step_id += 1
|
|
@ -36,6 +36,7 @@ from ppcls.modeling.loss import JSDivLoss
|
|||
from ppcls.modeling.loss import GoogLeNetLoss
|
||||
from ppcls.utils.misc import AverageMeter
|
||||
from ppcls.utils import logger
|
||||
from ppcls.utils import profiler
|
||||
from ppcls.utils import multi_hot_encode
|
||||
from ppcls.utils import hamming_distance
|
||||
from ppcls.utils import accuracy_score
|
||||
|
@ -304,7 +305,8 @@ def run(dataloader,
|
|||
lr_scheduler=None,
|
||||
epoch=0,
|
||||
mode='train',
|
||||
vdl_writer=None):
|
||||
vdl_writer=None,
|
||||
profiler_options=None):
|
||||
"""
|
||||
Feed data to the model and fetch the measures and loss
|
||||
|
||||
|
@ -359,6 +361,8 @@ def run(dataloader,
|
|||
metric_list["batch_time"].reset()
|
||||
metric_list["reader_time"].reset()
|
||||
|
||||
profiler.add_profiler_step(profiler_options)
|
||||
|
||||
metric_list['reader_time'].update(time.time() - tic)
|
||||
batch_size = len(batch[0])
|
||||
feeds = create_feeds(batch, use_mix, classes_num, multilabel)
|
||||
|
|
|
@ -33,7 +33,7 @@ from ppcls.modeling.loss import MixCELoss
|
|||
from ppcls.modeling.loss import JSDivLoss
|
||||
from ppcls.modeling.loss import GoogLeNetLoss
|
||||
from ppcls.utils.misc import AverageMeter
|
||||
from ppcls.utils import logger
|
||||
from ppcls.utils import logger, profiler
|
||||
|
||||
from paddle.distributed import fleet
|
||||
from paddle.distributed.fleet import DistributedStrategy
|
||||
|
@ -465,7 +465,8 @@ def run(dataloader,
|
|||
mode='train',
|
||||
config=None,
|
||||
vdl_writer=None,
|
||||
lr_scheduler=None):
|
||||
lr_scheduler=None,
|
||||
profiler_options=None):
|
||||
"""
|
||||
Feed data to the model and fetch the measures and loss
|
||||
|
||||
|
@ -525,6 +526,8 @@ def run(dataloader,
|
|||
|
||||
metric_list['reader_time'].update(time.time() - tic)
|
||||
|
||||
profiler.add_profiler_step(profiler_options)
|
||||
|
||||
if use_dali:
|
||||
batch_size = batch[0]["feed_image"].shape()[0]
|
||||
feed_dict = batch[0]
|
||||
|
|
|
@ -48,6 +48,13 @@ def parse_args():
|
|||
type=str,
|
||||
default=None,
|
||||
help='VisualDL logging directory for image.')
|
||||
parser.add_argument(
|
||||
'-p',
|
||||
'--profiler_options',
|
||||
type=str,
|
||||
default=None,
|
||||
help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-o',
|
||||
'--override',
|
||||
|
@ -157,7 +164,7 @@ def main(args):
|
|||
# 1. train with train dataset
|
||||
program.run(train_dataloader, exe, compiled_train_prog, train_feeds,
|
||||
train_fetchs, epoch_id, 'train', config, vdl_writer,
|
||||
lr_scheduler)
|
||||
lr_scheduler, args.profiler_options)
|
||||
if paddle.distributed.get_rank() == 0:
|
||||
# 2. validate with validate dataset
|
||||
if config.validate and epoch_id % config.valid_interval == 0:
|
||||
|
|
|
@ -40,6 +40,13 @@ def parse_args():
|
|||
type=str,
|
||||
default='configs/ResNet/ResNet50.yaml',
|
||||
help='config file path')
|
||||
parser.add_argument(
|
||||
'-p',
|
||||
'--profiler_options',
|
||||
type=str,
|
||||
default=None,
|
||||
help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-o',
|
||||
'--override',
|
||||
|
@ -116,7 +123,8 @@ def main(args):
|
|||
net.train()
|
||||
# 1. train with train dataset
|
||||
program.run(train_dataloader, config, dp_net, optimizer,
|
||||
lr_scheduler, epoch_id, 'train', vdl_writer)
|
||||
lr_scheduler, epoch_id, 'train', vdl_writer,
|
||||
args.profiler_options)
|
||||
|
||||
# 2. validate with validate dataset
|
||||
if config.validate and epoch_id % config.valid_interval == 0:
|
||||
|
|
Loading…
Reference in New Issue