mirror of
https://github.com/PaddlePaddle/PaddleClas.git
synced 2025-06-03 21:55:06 +08:00
commit
eee3ca70fb
@ -142,7 +142,6 @@ else
|
|||||||
batch_size=${params_list[1]}
|
batch_size=${params_list[1]}
|
||||||
batch_size=`echo ${batch_size} | tr -cd "[0-9]" `
|
batch_size=`echo ${batch_size} | tr -cd "[0-9]" `
|
||||||
precision=${params_list[2]}
|
precision=${params_list[2]}
|
||||||
# run_process_type=${params_list[3]}
|
|
||||||
run_mode=${params_list[3]}
|
run_mode=${params_list[3]}
|
||||||
device_num=${params_list[4]}
|
device_num=${params_list[4]}
|
||||||
IFS=";"
|
IFS=";"
|
||||||
@ -167,10 +166,9 @@ for batch_size in ${batch_size_list[*]}; do
|
|||||||
gpu_id=$(set_gpu_id $device_num)
|
gpu_id=$(set_gpu_id $device_num)
|
||||||
|
|
||||||
if [ ${#gpu_id} -le 1 ];then
|
if [ ${#gpu_id} -le 1 ];then
|
||||||
run_process_type="SingleP"
|
|
||||||
log_path="$SAVE_LOG/profiling_log"
|
log_path="$SAVE_LOG/profiling_log"
|
||||||
mkdir -p $log_path
|
mkdir -p $log_path
|
||||||
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_profiling"
|
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_profiling"
|
||||||
func_sed_params "$FILENAME" "${line_gpuid}" "0" # sed used gpu_id
|
func_sed_params "$FILENAME" "${line_gpuid}" "0" # sed used gpu_id
|
||||||
# set profile_option params
|
# set profile_option params
|
||||||
tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"`
|
tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"`
|
||||||
@ -186,8 +184,8 @@ for batch_size in ${batch_size_list[*]}; do
|
|||||||
speed_log_path="$SAVE_LOG/index"
|
speed_log_path="$SAVE_LOG/index"
|
||||||
mkdir -p $log_path
|
mkdir -p $log_path
|
||||||
mkdir -p $speed_log_path
|
mkdir -p $speed_log_path
|
||||||
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log"
|
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_log"
|
||||||
speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed"
|
speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_speed"
|
||||||
func_sed_params "$FILENAME" "${line_profile}" "null" # sed profile_id as null
|
func_sed_params "$FILENAME" "${line_profile}" "null" # sed profile_id as null
|
||||||
cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
|
cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
|
||||||
echo $cmd
|
echo $cmd
|
||||||
@ -198,13 +196,12 @@ for batch_size in ${batch_size_list[*]}; do
|
|||||||
eval "cat ${log_path}/${log_name}"
|
eval "cat ${log_path}/${log_name}"
|
||||||
|
|
||||||
# parser log
|
# parser log
|
||||||
_model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}"
|
_model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
|
||||||
cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
|
cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
|
||||||
--speed_log_file '${speed_log_path}/${speed_log_name}' \
|
--speed_log_file '${speed_log_path}/${speed_log_name}' \
|
||||||
--model_name ${_model_name} \
|
--model_name ${_model_name} \
|
||||||
--base_batch_size ${batch_size} \
|
--base_batch_size ${batch_size} \
|
||||||
--run_mode ${run_mode} \
|
--run_mode ${run_mode} \
|
||||||
--run_process_type ${run_process_type} \
|
|
||||||
--fp_item ${precision} \
|
--fp_item ${precision} \
|
||||||
--keyword ips: \
|
--keyword ips: \
|
||||||
--skip_steps 2 \
|
--skip_steps 2 \
|
||||||
@ -218,13 +215,12 @@ for batch_size in ${batch_size_list[*]}; do
|
|||||||
else
|
else
|
||||||
IFS=";"
|
IFS=";"
|
||||||
unset_env=`unset CUDA_VISIBLE_DEVICES`
|
unset_env=`unset CUDA_VISIBLE_DEVICES`
|
||||||
run_process_type="MultiP"
|
|
||||||
log_path="$SAVE_LOG/train_log"
|
log_path="$SAVE_LOG/train_log"
|
||||||
speed_log_path="$SAVE_LOG/index"
|
speed_log_path="$SAVE_LOG/index"
|
||||||
mkdir -p $log_path
|
mkdir -p $log_path
|
||||||
mkdir -p $speed_log_path
|
mkdir -p $speed_log_path
|
||||||
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log"
|
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_log"
|
||||||
speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed"
|
speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_speed"
|
||||||
func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id" # sed used gpu_id
|
func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id" # sed used gpu_id
|
||||||
func_sed_params "$FILENAME" "${line_profile}" "null" # sed --profile_option as null
|
func_sed_params "$FILENAME" "${line_profile}" "null" # sed --profile_option as null
|
||||||
cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
|
cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
|
||||||
@ -235,14 +231,13 @@ for batch_size in ${batch_size_list[*]}; do
|
|||||||
export model_run_time=$((${job_et}-${job_bt}))
|
export model_run_time=$((${job_et}-${job_bt}))
|
||||||
eval "cat ${log_path}/${log_name}"
|
eval "cat ${log_path}/${log_name}"
|
||||||
# parser log
|
# parser log
|
||||||
_model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}"
|
_model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
|
||||||
|
|
||||||
cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
|
cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
|
||||||
--speed_log_file '${speed_log_path}/${speed_log_name}' \
|
--speed_log_file '${speed_log_path}/${speed_log_name}' \
|
||||||
--model_name ${_model_name} \
|
--model_name ${_model_name} \
|
||||||
--base_batch_size ${batch_size} \
|
--base_batch_size ${batch_size} \
|
||||||
--run_mode ${run_mode} \
|
--run_mode ${run_mode} \
|
||||||
--run_process_type ${run_process_type} \
|
|
||||||
--fp_item ${precision} \
|
--fp_item ${precision} \
|
||||||
--keyword ips: \
|
--keyword ips: \
|
||||||
--skip_steps 2 \
|
--skip_steps 2 \
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
model_item=ResNet50
|
model_item=ResNet50
|
||||||
bs_item=256
|
bs_item=256
|
||||||
fp_item=fp16
|
fp_item=fp16
|
||||||
run_process_type=SingleP
|
|
||||||
run_mode=DP
|
run_mode=DP
|
||||||
device_num=N1C1
|
device_num=N1C1
|
||||||
max_epochs=1
|
max_epochs=1
|
||||||
@ -10,4 +9,4 @@ num_workers=8
|
|||||||
# get data
|
# get data
|
||||||
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
||||||
# run
|
# run
|
||||||
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
@ -1,7 +1,6 @@
|
|||||||
model_item=ResNet50
|
model_item=ResNet50
|
||||||
bs_item=256
|
bs_item=256
|
||||||
fp_item=fp32
|
fp_item=fp32
|
||||||
run_process_type=SingleP
|
|
||||||
run_mode=DP
|
run_mode=DP
|
||||||
device_num=N1C1
|
device_num=N1C1
|
||||||
max_epochs=1
|
max_epochs=1
|
||||||
@ -10,8 +9,8 @@ num_workers=8
|
|||||||
# get data
|
# get data
|
||||||
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
||||||
# run
|
# run
|
||||||
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
||||||
# run profiling
|
# run profiling
|
||||||
sleep 10;
|
sleep 10;
|
||||||
export PROFILING=true
|
export PROFILING=true
|
||||||
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
@ -1,7 +1,6 @@
|
|||||||
model_item=ResNet50
|
model_item=ResNet50
|
||||||
bs_item=64
|
bs_item=64
|
||||||
fp_item=fp16
|
fp_item=fp16
|
||||||
run_process_type=SingleP
|
|
||||||
run_mode=DP
|
run_mode=DP
|
||||||
device_num=N1C1
|
device_num=N1C1
|
||||||
max_epochs=1
|
max_epochs=1
|
||||||
@ -10,4 +9,4 @@ num_workers=8
|
|||||||
# get data
|
# get data
|
||||||
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
||||||
# run
|
# run
|
||||||
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
@ -1,7 +1,6 @@
|
|||||||
model_item=ResNet50
|
model_item=ResNet50
|
||||||
bs_item=64
|
bs_item=64
|
||||||
fp_item=fp32
|
fp_item=fp32
|
||||||
run_process_type=SingleP
|
|
||||||
run_mode=DP
|
run_mode=DP
|
||||||
device_num=N1C1
|
device_num=N1C1
|
||||||
max_epochs=1
|
max_epochs=1
|
||||||
@ -10,8 +9,8 @@ num_workers=8
|
|||||||
# get data
|
# get data
|
||||||
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
||||||
# run
|
# run
|
||||||
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
||||||
# run profiling
|
# run profiling
|
||||||
sleep 10;
|
sleep 10;
|
||||||
export PROFILING=true
|
export PROFILING=true
|
||||||
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
@ -1,7 +1,6 @@
|
|||||||
model_item=ResNet50
|
model_item=ResNet50
|
||||||
bs_item=256
|
bs_item=256
|
||||||
fp_item=fp16
|
fp_item=fp16
|
||||||
run_process_type=MultiP
|
|
||||||
run_mode=DP
|
run_mode=DP
|
||||||
device_num=N1C8
|
device_num=N1C8
|
||||||
max_epochs=1
|
max_epochs=1
|
||||||
@ -10,4 +9,4 @@ num_workers=8
|
|||||||
# get data
|
# get data
|
||||||
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
||||||
# run
|
# run
|
||||||
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
@ -1,7 +1,6 @@
|
|||||||
model_item=ResNet50
|
model_item=ResNet50
|
||||||
bs_item=256
|
bs_item=256
|
||||||
fp_item=fp32
|
fp_item=fp32
|
||||||
run_process_type=MultiP
|
|
||||||
run_mode=DP
|
run_mode=DP
|
||||||
device_num=N1C8
|
device_num=N1C8
|
||||||
max_epochs=1
|
max_epochs=1
|
||||||
@ -10,4 +9,4 @@ num_workers=8
|
|||||||
# get data
|
# get data
|
||||||
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
||||||
# run
|
# run
|
||||||
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
@ -1,7 +1,6 @@
|
|||||||
model_item=ResNet50
|
model_item=ResNet50
|
||||||
bs_item=64
|
bs_item=64
|
||||||
fp_item=fp16
|
fp_item=fp16
|
||||||
run_process_type=MultiP
|
|
||||||
run_mode=DP
|
run_mode=DP
|
||||||
device_num=N1C8
|
device_num=N1C8
|
||||||
max_epochs=1
|
max_epochs=1
|
||||||
@ -10,4 +9,4 @@ num_workers=8
|
|||||||
# get data
|
# get data
|
||||||
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
||||||
# run
|
# run
|
||||||
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
@ -1,7 +1,6 @@
|
|||||||
model_item=ResNet50
|
model_item=ResNet50
|
||||||
bs_item=64
|
bs_item=64
|
||||||
fp_item=fp32
|
fp_item=fp32
|
||||||
run_process_type=MultiP
|
|
||||||
run_mode=DP
|
run_mode=DP
|
||||||
device_num=N1C8
|
device_num=N1C8
|
||||||
max_epochs=1
|
max_epochs=1
|
||||||
@ -10,4 +9,4 @@ num_workers=8
|
|||||||
# get data
|
# get data
|
||||||
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
bash test_tipc/static/${model_item}/benchmark_common/prepare.sh
|
||||||
# run
|
# run
|
||||||
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1;
|
@ -1,23 +1,22 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
# Test training benchmark for a model.
|
# Test training benchmark for a model.
|
||||||
# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num}
|
# Usage:bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num}
|
||||||
function _set_params(){
|
function _set_params(){
|
||||||
model_item=${1:-"model_item"} # (必选) 模型 item
|
model_item=${1:-"model_item"} # (必选) 模型 item
|
||||||
base_batch_size=${2:-"2"} # (必选) 如果是静态图单进程,则表示每张卡上的BS,需在训练时*卡数
|
base_batch_size=${2:-"2"} # (必选) 如果是静态图单进程,则表示每张卡上的BS,需在训练时*卡数
|
||||||
fp_item=${3:-"fp32"} # (必选) fp32|fp16
|
fp_item=${3:-"fp32"} # (必选) fp32|fp16
|
||||||
run_process_type=${4:-"SingleP"} # (必选) 单进程 SingleP|多进程 MultiP
|
run_mode=${4:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1
|
||||||
run_mode=${5:-"DP"} # (必选) MP模型并行|DP数据并行|PP流水线并行|混合并行DP1-MP1-PP1|DP1-MP4-PP1
|
device_num=${5:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡)
|
||||||
device_num=${6:-"N1C1"} # (必选) 使用的卡数量,N1C1|N1C8|N4C32 (4机32卡)
|
|
||||||
profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递
|
profiling=${PROFILING:-"false"} # (必选) Profiling 开关,默认关闭,通过全局变量传递
|
||||||
model_repo="PaddleClas" # (必选) 模型套件的名字
|
model_repo="PaddleClas" # (必选) 模型套件的名字
|
||||||
speed_unit="samples/sec" # (必选)速度指标单位
|
speed_unit="samples/sec" # (必选)速度指标单位
|
||||||
skip_steps=10 # (必选)解析日志,跳过模型前几个性能不稳定的step
|
skip_steps=10 # (必选)解析日志,跳过模型前几个性能不稳定的step
|
||||||
keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字
|
keyword="ips:" # (必选)解析日志,筛选出性能数据所在行的关键字
|
||||||
convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:"
|
convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:"
|
||||||
max_epochs=${7:-"1"} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数
|
max_epochs=${6:-"1"} # (可选)需保证模型执行时间在5分钟内,需要修改代码提前中断的直接提PR 合入套件;或使用max_epoch参数
|
||||||
num_workers=${8:-"4"} # (可选)
|
num_workers=${7:-"4"} # (可选)
|
||||||
# 以下为通用执行命令,无特殊可不用修改
|
# 以下为通用执行命令,无特殊可不用修改
|
||||||
model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_process_type}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐
|
model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode} # (必填) 且格式不要改动,与竞品名称对齐
|
||||||
device=${CUDA_VISIBLE_DEVICES//,/ }
|
device=${CUDA_VISIBLE_DEVICES//,/ }
|
||||||
arr=(${device})
|
arr=(${device})
|
||||||
num_gpu_devices=${#arr[*]}
|
num_gpu_devices=${#arr[*]}
|
||||||
@ -48,13 +47,19 @@ function _train(){
|
|||||||
|
|
||||||
train_cmd="${config_file} -o DataLoader.Train.sampler.batch_size=${base_batch_size} -o Global.epochs=${max_epochs} -o DataLoader.Train.loader.num_workers=${num_workers} ${profiling_config} -o Global.eval_during_train=False"
|
train_cmd="${config_file} -o DataLoader.Train.sampler.batch_size=${base_batch_size} -o Global.epochs=${max_epochs} -o DataLoader.Train.loader.num_workers=${num_workers} ${profiling_config} -o Global.eval_during_train=False"
|
||||||
# 以下为通用执行命令,无特殊可不用修改
|
# 以下为通用执行命令,无特殊可不用修改
|
||||||
case ${run_process_type} in
|
case ${run_mode} in
|
||||||
SingleP)
|
DP) if [[ ${device_num} = "N1C1" ]];then
|
||||||
train_cmd="python ppcls/static/train.py ${train_cmd}";;
|
echo "run ${run_mode} ${device_num}"
|
||||||
MultiP)
|
train_cmd="python ppcls/static/train.py ${train_cmd}"
|
||||||
train_cmd="python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 ppcls/static/train.py ${train_cmd}";;
|
else
|
||||||
*) echo "choose run_process_type(SingleP or MultiP)"; exit 1;
|
rm -rf ./mylog
|
||||||
|
train_cmd="python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 ppcls/static/train.py ${train_cmd}"
|
||||||
|
fi
|
||||||
|
;;
|
||||||
|
DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" ;;
|
||||||
|
*) echo "choose run_mode "; exit 1;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
echo "train_cmd: ${train_cmd} log_file: ${log_file}"
|
echo "train_cmd: ${train_cmd} log_file: ${log_file}"
|
||||||
timeout 5m ${train_cmd} > ${log_file} 2>&1
|
timeout 5m ${train_cmd} > ${log_file} 2>&1
|
||||||
if [ $? -ne 0 ];then
|
if [ $? -ne 0 ];then
|
||||||
@ -63,7 +68,7 @@ function _train(){
|
|||||||
echo -e "${model_name}, SUCCESS"
|
echo -e "${model_name}, SUCCESS"
|
||||||
fi
|
fi
|
||||||
# kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
|
# kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
|
||||||
if [ ${run_process_type} = "MultiP" -a -d mylog ]; then
|
if [ ${device_num} != "N1C1" -a -d mylog ]; then
|
||||||
rm ${log_file}
|
rm ${log_file}
|
||||||
cp mylog/workerlog.0 ${log_file}
|
cp mylog/workerlog.0 ${log_file}
|
||||||
fi
|
fi
|
||||||
|
Loading…
x
Reference in New Issue
Block a user