diff --git a/test_tipc/benchmark_train.sh b/test_tipc/benchmark_train.sh index 4baddc40e..b80e7257c 100644 --- a/test_tipc/benchmark_train.sh +++ b/test_tipc/benchmark_train.sh @@ -69,8 +69,8 @@ PARAMS=$3 IFS=$'\n' # parser params from train_benchmark.txt sed -i 's/ -o DataLoader.Train.sampler.shuffle=False//g' $FILENAME -sed -i 's/ -o DataLoader.Train.loader.num_workers=0//g' $FILENAME -sed -i 's/-o DataLoader.Train.loader.use_shared_memory=False/-o Global.eval_during_train=False/g' $FILENAME +sed -i 's/ -o DataLoader.Train.loader.num_workers=0/ -o Global.print_batch_step=1/g' $FILENAME +sed -i 's/-o DataLoader.Train.loader.use_shared_memory=False/ -o Global.eval_during_train=False/g' $FILENAME dataline=`cat $FILENAME` # parser params IFS=$'\n' @@ -117,10 +117,14 @@ line_profile=13 line_eval_py=24 line_export_py=30 line_norm_train=16 +line_pact_train=17 +line_fgpm_train=18 func_sed_params "$FILENAME" "${line_eval_py}" "null" func_sed_params "$FILENAME" "${line_export_py}" "null" func_sed_params "$FILENAME" "${line_python}" "$python" +func_sed_params "$FILENAME" "${line_pact_train}" "null" +func_sed_params "$FILENAME" "${line_fgpm_train}" "null" # if params if [ ! -n "$PARAMS" ] ;then diff --git a/test_tipc/config/ShuffleNet/ShuffleNetV2_x1_0_train_infer_python.txt b/test_tipc/config/ShuffleNet/ShuffleNetV2_x1_0_train_infer_python.txt index f6e93afae..e19eef5e4 100644 --- a/test_tipc/config/ShuffleNet/ShuffleNetV2_x1_0_train_infer_python.txt +++ b/test_tipc/config/ShuffleNet/ShuffleNetV2_x1_0_train_infer_python.txt @@ -53,8 +53,8 @@ null:null ===========================train_benchmark_params========================== batch_size:256|1536 fp_items:fp32 -epoch:1 +epoch:2 --profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 ===========================infer_benchmark_params========================== -random_infer_input:[{float32,[3,224,224]}] \ No newline at end of file +random_infer_input:[{float32,[3,224,224]}] diff --git a/test_tipc/config/Twins/alt_gvt_base_train_infer_python.txt b/test_tipc/config/Twins/alt_gvt_base_train_infer_python.txt index 6dde91b4a..ef4b566c4 100644 --- a/test_tipc/config/Twins/alt_gvt_base_train_infer_python.txt +++ b/test_tipc/config/Twins/alt_gvt_base_train_infer_python.txt @@ -51,10 +51,10 @@ inference:python/predict_cls.py -c configs/inference_cls.yaml null:null null:null ===========================train_benchmark_params========================== -batch_size:64|176 +batch_size:64|144 fp_items:fp32 epoch:1 --profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 ===========================infer_benchmark_params========================== -random_infer_input:[{float32,[3,224,224]}] \ No newline at end of file +random_infer_input:[{float32,[3,224,224]}] diff --git a/test_tipc/static/ResNet50/N1C1/ResNet50_bs256_fp16_SingleP_DP.sh b/test_tipc/static/ResNet50/N1C1/ResNet50_bs256_fp16_SingleP_DP.sh new file mode 100644 index 000000000..b338608ba --- /dev/null +++ b/test_tipc/static/ResNet50/N1C1/ResNet50_bs256_fp16_SingleP_DP.sh @@ -0,0 +1,13 @@ +model_item=ResNet50 +bs_item=256 +fp_item=fp16 +run_process_type=SingleP +run_mode=DP +device_num=N1C1 +max_epochs=1 +num_workers=8 + +# get data +bash test_tipc/static/${model_item}/benchmark_common/prepare.sh +# run +bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; diff --git a/test_tipc/static/ResNet50/N1C1/ResNet50_bs256_fp32_SingleP_DP.sh b/test_tipc/static/ResNet50/N1C1/ResNet50_bs256_fp32_SingleP_DP.sh new file mode 100644 index 000000000..6bb64ef7e --- /dev/null +++ b/test_tipc/static/ResNet50/N1C1/ResNet50_bs256_fp32_SingleP_DP.sh @@ -0,0 +1,17 @@ +model_item=ResNet50 +bs_item=256 +fp_item=fp32 +run_process_type=SingleP +run_mode=DP +device_num=N1C1 +max_epochs=1 +num_workers=8 + +# get data +bash test_tipc/static/${model_item}/benchmark_common/prepare.sh +# run +bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; +# run profiling +sleep 10; +export PROFILING=true +bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; diff --git a/test_tipc/static/ResNet50/N1C1/ResNet50_bs64_fp16_SingleP_DP.sh b/test_tipc/static/ResNet50/N1C1/ResNet50_bs64_fp16_SingleP_DP.sh index 5189fc16b..c6d96b1f8 100644 --- a/test_tipc/static/ResNet50/N1C1/ResNet50_bs64_fp16_SingleP_DP.sh +++ b/test_tipc/static/ResNet50/N1C1/ResNet50_bs64_fp16_SingleP_DP.sh @@ -5,7 +5,7 @@ run_process_type=SingleP run_mode=DP device_num=N1C1 max_epochs=1 -num_workers=4 +num_workers=8 # get data bash test_tipc/static/${model_item}/benchmark_common/prepare.sh diff --git a/test_tipc/static/ResNet50/N1C1/ResNet50_bs64_fp32_SingleP_DP.sh b/test_tipc/static/ResNet50/N1C1/ResNet50_bs64_fp32_SingleP_DP.sh index a52d1be97..52ca473a4 100644 --- a/test_tipc/static/ResNet50/N1C1/ResNet50_bs64_fp32_SingleP_DP.sh +++ b/test_tipc/static/ResNet50/N1C1/ResNet50_bs64_fp32_SingleP_DP.sh @@ -5,7 +5,7 @@ run_process_type=SingleP run_mode=DP device_num=N1C1 max_epochs=1 -num_workers=4 +num_workers=8 # get data bash test_tipc/static/${model_item}/benchmark_common/prepare.sh diff --git a/test_tipc/static/ResNet50/N1C8/ResNet50_bs256_fp16_MultiP_DP.sh b/test_tipc/static/ResNet50/N1C8/ResNet50_bs256_fp16_MultiP_DP.sh new file mode 100644 index 000000000..492764df5 --- /dev/null +++ b/test_tipc/static/ResNet50/N1C8/ResNet50_bs256_fp16_MultiP_DP.sh @@ -0,0 +1,13 @@ +model_item=ResNet50 +bs_item=256 +fp_item=fp16 +run_process_type=MultiP +run_mode=DP +device_num=N1C8 +max_epochs=1 +num_workers=8 + +# get data +bash test_tipc/static/${model_item}/benchmark_common/prepare.sh +# run +bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; diff --git a/test_tipc/static/ResNet50/N1C8/ResNet50_bs256_fp32_MultiP_DP.sh b/test_tipc/static/ResNet50/N1C8/ResNet50_bs256_fp32_MultiP_DP.sh new file mode 100644 index 000000000..f4988432d --- /dev/null +++ b/test_tipc/static/ResNet50/N1C8/ResNet50_bs256_fp32_MultiP_DP.sh @@ -0,0 +1,13 @@ +model_item=ResNet50 +bs_item=256 +fp_item=fp32 +run_process_type=MultiP +run_mode=DP +device_num=N1C8 +max_epochs=1 +num_workers=8 + +# get data +bash test_tipc/static/${model_item}/benchmark_common/prepare.sh +# run +bash test_tipc/static/${model_item}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_process_type} ${run_mode} ${device_num} ${max_epochs} ${num_workers} 2>&1; diff --git a/test_tipc/static/ResNet50/N1C8/ResNet50_bs64_fp16_MultiP_DP.sh b/test_tipc/static/ResNet50/N1C8/ResNet50_bs64_fp16_MultiP_DP.sh index 0fe988a36..e42ae5068 100644 --- a/test_tipc/static/ResNet50/N1C8/ResNet50_bs64_fp16_MultiP_DP.sh +++ b/test_tipc/static/ResNet50/N1C8/ResNet50_bs64_fp16_MultiP_DP.sh @@ -5,7 +5,7 @@ run_process_type=MultiP run_mode=DP device_num=N1C8 max_epochs=1 -num_workers=4 +num_workers=8 # get data bash test_tipc/static/${model_item}/benchmark_common/prepare.sh diff --git a/test_tipc/static/ResNet50/N1C8/ResNet50_bs64_fp32_MultiP_DP.sh b/test_tipc/static/ResNet50/N1C8/ResNet50_bs64_fp32_MultiP_DP.sh index 2eca5d4ff..e1e1d76e3 100644 --- a/test_tipc/static/ResNet50/N1C8/ResNet50_bs64_fp32_MultiP_DP.sh +++ b/test_tipc/static/ResNet50/N1C8/ResNet50_bs64_fp32_MultiP_DP.sh @@ -5,7 +5,7 @@ run_process_type=MultiP run_mode=DP device_num=N1C8 max_epochs=1 -num_workers=4 +num_workers=8 # get data bash test_tipc/static/${model_item}/benchmark_common/prepare.sh diff --git a/test_tipc/static/ResNet50/benchmark_common/run_benchmark.sh b/test_tipc/static/ResNet50/benchmark_common/run_benchmark.sh index df95ab506..2c3e4e1e3 100644 --- a/test_tipc/static/ResNet50/benchmark_common/run_benchmark.sh +++ b/test_tipc/static/ResNet50/benchmark_common/run_benchmark.sh @@ -46,7 +46,7 @@ function _train(){ log_file=${profiling_log_file} fi - train_cmd="${config_file} -o DataLoader.Train.sampler.batch_size=${base_batch_size} -o Global.epochs=${max_epochs} -o DataLoader.Train.loader.num_workers=${num_workers} ${profiling_config}" + train_cmd="${config_file} -o DataLoader.Train.sampler.batch_size=${base_batch_size} -o Global.epochs=${max_epochs} -o DataLoader.Train.loader.num_workers=${num_workers} ${profiling_config} -o Global.eval_during_train=False" # 以下为通用执行命令,无特殊可不用修改 case ${run_process_type} in SingleP) @@ -69,7 +69,19 @@ function _train(){ fi cd ../ } + +function _set_env(){ + #开启gc + export FLAGS_eager_delete_tensor_gb=0.0 + export FLAGS_fraction_of_gpu_memory_to_use=0.98 + #### + export FLAGS_cudnn_exhaustive_search=1 + export FLAGS_conv_workspace_size_limit=4000 #MB +} + + source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ # _train # 如果只产出训练log,不解析,可取消注释 +_set_env _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只产出训练log可以注掉本行,提交时需打开