add static graph ResNet50 for benchmark
parent
fb986162de
commit
33e8e8348d
ppcls/static
test_tipc/static
|
@ -406,7 +406,7 @@ def run(dataloader,
|
|||
if "time" in key else str(metric_dict[key].value)
|
||||
for key in metric_dict
|
||||
])
|
||||
ips_info = " ips: {:.5f} images/sec.".format(
|
||||
ips_info = " ips: {:.5f} samples/sec.".format(
|
||||
batch_size / metric_dict["batch_time"].avg)
|
||||
fetchs_str += ips_info
|
||||
|
||||
|
@ -433,8 +433,8 @@ def run(dataloader,
|
|||
|
||||
end_str = ' '.join([str(m.mean) for m in metric_dict.values()] +
|
||||
[metric_dict["batch_time"].total])
|
||||
ips_info = "ips: {:.5f} images/sec.".format(batch_size /
|
||||
metric_dict["batch_time"].avg)
|
||||
ips_info = "ips: {:.5f} samples/sec.".format(batch_size /
|
||||
metric_dict["batch_time"].avg)
|
||||
if mode == 'eval':
|
||||
logger.info("END {:s} {:s} {:s}".format(mode, end_str, ips_info))
|
||||
else:
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
# PaddleClas 下静态图benchmark模型执行说明
|
||||
|
||||
静态图benchmark测试脚本说明
|
||||
|
||||
# 目录说明
|
||||
# Docker 运行环境
|
||||
docker image: registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda11.2-cudnn8-gcc82
|
||||
paddle = 2.2.2
|
||||
python = 3.7
|
||||
# 运行benchmark测试步骤
|
||||
|
||||
```shell
|
||||
git clone https://github.com/PaddlePaddle/PaddleClas.git
|
||||
cd PaddleClas
|
||||
```
|
||||
|
||||
# 准备数据
|
||||
|
||||
```shell
|
||||
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
||||
```
|
||||
|
||||
# 运行模型
|
||||
## 单卡(自动运行打开Profiling)
|
||||
|
||||
```shell
|
||||
export CUDA_VISIBLE_DEVICES=0
|
||||
bash test_tipc/static/${model_item}/N1C1/${shell_name}.sh
|
||||
```
|
||||
|
||||
## 多卡
|
||||
|
||||
```shell
|
||||
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
|
||||
bash test_tipc/static/${model_item}/N1C8/${shell_name}.sh
|
||||
```
|
|
@ -0,0 +1,13 @@
|
|||
model_item=ResNet50
|
||||
bs_item=64
|
||||
fp_item=fp16
|
||||
run_process_type=SingleP
|
||||
run_mode=DP
|
||||
device_num=N1C1
|
||||
max_epochs=1
|
||||
num_workers=4
|
||||
|
||||
# get data
|
||||
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
||||
# run
|
||||
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
|
@ -0,0 +1,17 @@
|
|||
model_item=ResNet50
|
||||
bs_item=64
|
||||
fp_item=fp32
|
||||
run_process_type=SingleP
|
||||
run_mode=DP
|
||||
device_num=N1C1
|
||||
max_epochs=1
|
||||
num_workers=4
|
||||
|
||||
# get data
|
||||
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
||||
# run
|
||||
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
||||
# run profiling
|
||||
sleep 10;
|
||||
export PROFILING=true
|
||||
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
|
@ -0,0 +1,13 @@
|
|||
model_item=ResNet50
|
||||
bs_item=64
|
||||
fp_item=fp16
|
||||
run_process_type=MultiP
|
||||
run_mode=DP
|
||||
device_num=N1C8
|
||||
max_epochs=1
|
||||
num_workers=4
|
||||
|
||||
# get data
|
||||
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
||||
# run
|
||||
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
|
@ -0,0 +1,13 @@
|
|||
model_item=ResNet50
|
||||
bs_item=64
|
||||
fp_item=fp32
|
||||
run_process_type=MultiP
|
||||
run_mode=DP
|
||||
device_num=N1C8
|
||||
max_epochs=1
|
||||
num_workers=4
|
||||
|
||||
# get data
|
||||
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
||||
# run
|
||||
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
|
@ -0,0 +1,11 @@
|
|||
#!/bin/bash
|
||||
|
||||
pip install -r requirements.txt
|
||||
cd dataset
|
||||
rm -rf ILSVRC2012
|
||||
wget -nc https://paddle-imagenet-models-name.bj.bcebos.com/data/ImageNet1k/ILSVRC2012_val.tar
|
||||
tar xf ILSVRC2012_val.tar
|
||||
ln -s ILSVRC2012_val ILSVRC2012
|
||||
cd ILSVRC2012
|
||||
ln -s val_list.txt train_list.txt
|
||||
cd ../../
|
|
@ -0,0 +1,75 @@
|
|||
#!/usr/bin/env bash
|
||||
# Test training benchmark for a model.
|
||||
# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num}
|
||||
function _set_params(){
|
||||
model_item=${1:-"model_item"} # (必选) 模型 item
|
||||
base_batch_size=${2:-"2"} # (必选) 如果是静态图单进程,则表示每张卡上的BS,需在训练时*卡数
|
||||
fp_item=${3:-"fp32"} # (必选) fp32|fp16
|
||||
run_process_type=${4:-"SingleP"} # (必选) 单进程 SingleP|多进程 MultiP
|
||||
run_mode=${5:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1
|
||||
device_num=${6:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡)
|
||||
profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递
|
||||
model_repo="PaddleClas" # (必选) 模型套件的名字
|
||||
speed_unit="samples/sec" # (必选)速度指标单位
|
||||
skip_steps=10 # (必选)解析日志,跳过模型前几个性能不稳定的step
|
||||
keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字
|
||||
convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:"
|
||||
max_epochs=${7:-"1"} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数
|
||||
num_workers=${8:-"4"} # (可选)
|
||||
# 以下为通用执行命令,无特殊可不用修改
|
||||
model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐
|
||||
device=${CUDA_VISIBLE_DEVICES//,/ }
|
||||
arr=(${device})
|
||||
num_gpu_devices=${#arr[*]}
|
||||
run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # (必填) TRAIN_LOG_DIR benchmark框架设置该参数为全局变量
|
||||
profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (必填) PROFILING_LOG_DIR benchmark框架设置该参数为全局变量
|
||||
speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
|
||||
|
||||
train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
|
||||
profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
|
||||
speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
|
||||
}
|
||||
function _train(){
|
||||
batch_size=${base_batch_size} # 如果模型跑多卡单进程时,请在_train函数中计算出多卡需要的bs
|
||||
echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"
|
||||
|
||||
if [ ${fp_item} = "fp32" ]; then
|
||||
config_file="-c ppcls/configs/ImageNet/ResNet/ResNet50.yaml"
|
||||
else
|
||||
config_file="-c ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml"
|
||||
fi
|
||||
if [ ${profiling} = "false" ]; then
|
||||
profiling_config=""
|
||||
log_file=${train_log_file}
|
||||
else
|
||||
profiling_config="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
|
||||
log_file=${profiling_log_file}
|
||||
fi
|
||||
|
||||
train_cmd="${config_file} -o DataLoader.Train.sampler.batch_size=${base_batch_size} -o Global.epochs=${max_epochs} -o DataLoader.Train.loader.num_workers=${num_workers} ${profiling_config}"
|
||||
# 以下为通用执行命令,无特殊可不用修改
|
||||
case ${run_process_type} in
|
||||
SingleP)
|
||||
train_cmd="python ppcls/static/train.py ${train_cmd}";;
|
||||
MultiP)
|
||||
train_cmd="python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 ppcls/static/train.py ${train_cmd}";;
|
||||
*) echo "choose run_process_type(SingleP or MultiP)"; exit 1;
|
||||
esac
|
||||
echo "train_cmd: ${train_cmd} log_file: ${log_file}"
|
||||
timeout 5m ${train_cmd} > ${log_file} 2>&1
|
||||
if [ $? -ne 0 ];then
|
||||
echo -e "${model_name}, FAIL"
|
||||
else
|
||||
echo -e "${model_name}, SUCCESS"
|
||||
fi
|
||||
kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
|
||||
if [ ${run_process_type} = "MultiP" -a -d mylog ]; then
|
||||
rm ${log_file}
|
||||
cp mylog/workerlog.0 ${log_file}
|
||||
fi
|
||||
cd ../
|
||||
}
|
||||
# source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开
|
||||
_set_params $@
|
||||
_train # 如果只产出训练log,不解析,可取消注释
|
||||
# _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开
|
Loading…
Reference in New Issue