diff --git a/test_tipc/README.md b/test_tipc/README.md index 9fd89cd79..e7765a861 100644 --- a/test_tipc/README.md +++ b/test_tipc/README.md @@ -112,3 +112,4 @@ bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/MobileNetV3/Mo - [test_lite_arm_cpu_cpp 使用](docs/test_lite_arm_cpu_cpp.md): 测试基于Paddle-Lite的ARM CPU端c++预测部署功能. - [test_paddle2onnx 使用](docs/test_paddle2onnx.md):测试Paddle2ONNX的模型转化功能,并验证正确性。 - [test_serving_infer_python 使用](docs/test_serving_infer_python.md):测试python serving功能。 +- [test_train_fleet_inference_python 使用](./docs/test_train_fleet_inference_python.md):测试基于Python的多机多卡训练与推理等基本功能。 diff --git a/test_tipc/config/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_train_fleet_infer_python.txt b/test_tipc/config/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_train_fleet_infer_python.txt new file mode 100644 index 000000000..eb67f0b96 --- /dev/null +++ b/test_tipc/config/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_train_fleet_infer_python.txt @@ -0,0 +1,54 @@ +===========================train_params=========================== +model_name:GeneralRecognition_PPLCNet_x2_5 +python:python3.7 +gpu_list:192.168.0.1,192.168.0.2;0,1 +-o Global.device:gpu +-o Global.auto_cast:null +-o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120 +-o Global.output_dir:./output/ +-o DataLoader.Train.sampler.batch_size:8 +-o Global.pretrained_model:null +train_model_name:latest +train_infer_img_dir:./dataset/ILSVRC2012/val +null:null +## +trainer:norm_train +norm_train:tools/train.py -c ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:tools/eval.py -c 
ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml +null:null +## +===========================infer_params========================== +-o Global.save_inference_dir:./inference +-o Global.pretrained_model: +norm_export:tools/export_model.py -c ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml +quant_export:null +fpgm_export:null +distill_export:null +kl_quant:null +export2:null +pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/general_PPLCNet_x2_5_pretrained_v1.0.pdparams +infer_model:../inference/ +infer_export:True +infer_quant:Fasle +inference:python/predict_rec.py -c configs/inference_rec.yaml +-o Global.use_gpu:True|False +-o Global.enable_mkldnn:True|False +-o Global.cpu_num_threads:1|6 +-o Global.batch_size:1|16 +-o Global.use_tensorrt:True|False +-o Global.use_fp16:True|False +-o Global.rec_inference_model_dir:../inference +-o Global.infer_imgs:../dataset/Aliproduct/demo_test/ +-o Global.save_log_path:null +-o Global.benchmark:True +null:null +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,224,224]}] \ No newline at end of file diff --git a/test_tipc/config/PPHGNet/PPHGNet_small_fleet_train_infer_python.txt b/test_tipc/config/PPHGNet/PPHGNet_small_fleet_train_infer_python.txt new file mode 100644 index 000000000..4118dca88 --- /dev/null +++ b/test_tipc/config/PPHGNet/PPHGNet_small_fleet_train_infer_python.txt @@ -0,0 +1,53 @@ +===========================train_params=========================== +model_name:PPHGNet_small +python:python3.7 +gpu_list:192.168.0.1,192.168.0.2;0,1 +-o Global.device:gpu +-o Global.auto_cast:null +-o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120 +-o Global.output_dir:./output/ +-o DataLoader.Train.sampler.batch_size:8 +-o Global.pretrained_model:null +train_model_name:latest +train_infer_img_dir:./dataset/ILSVRC2012/val +null:null +## +trainer:norm_train 
+norm_train:tools/train.py -c ppcls/configs/ImageNet/PPHGNet/PPHGNet_small.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:tools/eval.py -c ppcls/configs/ImageNet/PPHGNet/PPHGNet_small.yaml +null:null +## +===========================infer_params========================== +-o Global.save_inference_dir:./inference +-o Global.pretrained_model: +norm_export:tools/export_model.py -c ppcls/configs/ImageNet/PPHGNet/PPHGNet_small.yaml +quant_export:null +fpgm_export:null +distill_export:null +kl_quant:null +export2:null +pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNet_small_pretrained.pdparams +infer_model:../inference/ +infer_export:True +infer_quant:Fasle +inference:python/predict_cls.py -c configs/inference_cls.yaml -o PreProcess.transform_ops.0.ResizeImage.resize_short=236 +-o Global.use_gpu:True|False +-o Global.enable_mkldnn:True|False +-o Global.cpu_num_threads:1|6 +-o Global.batch_size:1|16 +-o Global.use_tensorrt:True|False +-o Global.use_fp16:True|False +-o Global.inference_model_dir:../inference +-o Global.infer_imgs:../dataset/ILSVRC2012/val +-o Global.save_log_path:null +-o Global.benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,224,224]}] diff --git a/test_tipc/config/PPLCNet/PPLCNet_x1_0_train_fleet_infer_python.txt b/test_tipc/config/PPLCNet/PPLCNet_x1_0_train_fleet_infer_python.txt new file mode 100644 index 000000000..191e36c6b --- /dev/null +++ b/test_tipc/config/PPLCNet/PPLCNet_x1_0_train_fleet_infer_python.txt @@ -0,0 +1,53 @@ +===========================train_params=========================== +model_name:PPLCNet_x1_0 +python:python3.7 
+gpu_list:192.168.0.1,192.168.0.2;0,1 +-o Global.device:gpu +-o Global.auto_cast:null +-o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120 +-o Global.output_dir:./output/ +-o DataLoader.Train.sampler.batch_size:8 +-o Global.pretrained_model:null +train_model_name:latest +train_infer_img_dir:./dataset/ILSVRC2012/val +null:null +## +trainer:norm_train +norm_train:tools/train.py -c ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:tools/eval.py -c ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml +null:null +## +===========================infer_params========================== +-o Global.save_inference_dir:./inference +-o Global.pretrained_model: +norm_export:tools/export_model.py -c ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml +quant_export:null +fpgm_export:null +distill_export:null +kl_quant:null +export2:null +pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x1_0_pretrained.pdparams +infer_model:../inference/ +infer_export:True +infer_quant:Fasle +inference:python/predict_cls.py -c configs/inference_cls.yaml +-o Global.use_gpu:True|False +-o Global.enable_mkldnn:True|False +-o Global.cpu_num_threads:1|6 +-o Global.batch_size:1|16 +-o Global.use_tensorrt:True|False +-o Global.use_fp16:True|False +-o Global.inference_model_dir:../inference +-o Global.infer_imgs:../dataset/ILSVRC2012/val +-o Global.save_log_path:null +-o Global.benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,224,224]}] \ No newline at end of file diff --git a/test_tipc/config/PPLCNetV2/PPLCNetV2_base_fleet_train_infer_python.txt 
b/test_tipc/config/PPLCNetV2/PPLCNetV2_base_fleet_train_infer_python.txt new file mode 100644 index 000000000..f115d3903 --- /dev/null +++ b/test_tipc/config/PPLCNetV2/PPLCNetV2_base_fleet_train_infer_python.txt @@ -0,0 +1,53 @@ +===========================train_params=========================== +model_name:PPLCNetV2_base +python:python3.7 +gpu_list:192.168.0.1,192.168.0.2;0,1 +-o Global.device:gpu +-o Global.auto_cast:null +-o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120 +-o Global.output_dir:./output/ +-o DataLoader.Train.sampler.first_bs:8 +-o Global.pretrained_model:null +train_model_name:latest +train_infer_img_dir:./dataset/ILSVRC2012/val +null:null +## +trainer:norm_train +norm_train:tools/train.py -c ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_base.yaml -o Global.seed=1234 -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:tools/eval.py -c ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_base.yaml +null:null +## +===========================infer_params========================== +-o Global.save_inference_dir:./inference +-o Global.pretrained_model: +norm_export:tools/export_model.py -c ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_base.yaml +quant_export:null +fpgm_export:null +distill_export:null +kl_quant:null +export2:null +pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNetV2_base_pretrained.pdparams +infer_model:../inference/ +infer_export:True +infer_quant:Fasle +inference:python/predict_cls.py -c configs/inference_cls.yaml +-o Global.use_gpu:True|False +-o Global.enable_mkldnn:True|False +-o Global.cpu_num_threads:1|6 +-o Global.batch_size:1|16 +-o Global.use_tensorrt:True|False +-o Global.use_fp16:True|False +-o Global.inference_model_dir:../inference +-o Global.infer_imgs:../dataset/ILSVRC2012/val 
+-o Global.save_log_path:null +-o Global.benchmark:True +null:null +===========================infer_benchmark_params========================== +random_infer_input:[{float32,[3,224,224]}] diff --git a/test_tipc/docs/test_train_fleet_inference_python.md b/test_tipc/docs/test_train_fleet_inference_python.md new file mode 100644 index 000000000..d1d743065 --- /dev/null +++ b/test_tipc/docs/test_train_fleet_inference_python.md @@ -0,0 +1,121 @@ +# Linux GPU/CPU 多机多卡训练推理测试 + +Linux GPU/CPU 多机多卡训练推理测试的主程序为`test_train_inference_python.sh`,可以测试基于Python的多机多卡模型训练、评估、推理等基本功能。 + +## 1. 测试结论汇总 + +- 训练相关: + + | 算法名称 | 模型名称 | 多机多卡 | + | :-------: | :-----------------: | :--------: | + | PPLCNet | PPLCNet_x1_0 | 分布式训练 | + | PPLCNetV2 | PPLCNetV2_base | 分布式训练 | + | PPHGNet | PPHGNet_small | 分布式训练 | + | PP-ShiTu | PPShiTu_general_rec | 分布式训练 | + + +- 推理相关: + + | 算法名称 | 模型名称 | device_CPU | device_GPU | batchsize | + | :-------: | :-----------------: | :--------: | :--------: | :-------: | + | PPLCNet | PPLCNet_x1_0 | 支持 | 支持 | 1 | + | PPLCNetV2 | PPLCNetV2_base | 支持 | 支持 | 1 | + | PPHGNet | PPHGNet_small | 支持 | 支持 | 1 | + | PP-ShiTu | PPShiTu_general_rec | 支持 | 支持 | 1 | + + +## 2. 
测试流程 + +请参考[文档](./install.md)的内容配置TIPC的运行环境。 + +**下面以 PPLCNet_x1_0 模型为例,介绍测试流程** + +### 2.1 功能测试 + +#### 2.1.1 修改配置文件 + +首先,修改配置文件`test_tipc/config/PPLCNet/PPLCNet_x1_0_train_fleet_infer_python.txt`中的`gpu_list`设置:假设两台机器的`ip`地址分别为`192.168.0.1`和`192.168.0.2`,则对应的配置文件`gpu_list`字段需要修改为`gpu_list:192.168.0.1,192.168.0.2;0,1`。 + +**`ip`地址查看命令为`ifconfig`,在`inet addr:`字段后的即为ip地址**。 + + +#### 2.1.2 准备数据 + +运行`prepare.sh`准备数据和模型,数据准备命令如下所示。 + +```shell +bash test_tipc/prepare.sh test_tipc/config/PPLCNet/PPLCNet_x1_0_train_fleet_infer_python.txt lite_train_lite_infer +``` + +**注意:** 由于是多机训练,这里需要在所有节点上都运行一次上述命令来准备数据。 + +#### 2.1.3 修改起始端口开始测试 + +在多机的节点上使用下面的命令设置分布式的起始端口(否则后面运行的时候会由于无法找到运行端口而hang住),一般建议设置在`10000~20000`之间。 + +```shell +export FLAGS_START_PORT=17000 +``` +**注意:** 上述修改起始端口命令同样需要在所有节点上都执行一次。 + +接下来就可以开始执行测试,命令如下所示。 +```shell +bash test_tipc/test_train_inference_python.sh test_tipc/config/PPLCNet/PPLCNet_x1_0_train_fleet_infer_python.txt +``` + +**注意:** 由于是多机训练,这里需要在所有的节点上均运行上述命令进行测试。 + + +#### 2.1.4 输出结果 + +输出结果保存在`test_tipc/output/PPLCNet_x1_0/results_python.log`,内容如下,以`Run successfully`开头表示测试命令正常,否则为测试失败。 + +```bash +Run successfully with command - python3.7 -m paddle.distributed.launch --ips=192.168.0.1,192.168.0.2 --gpus=0,1 tools/train.py -c ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml -o Global.seed=1234 -o DataL +oader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False -o Global.device=gpu -o Global.output_dir=./test_tipc/output/PPLCNet_x1_0/norm_train_gpus_0, +1_autocast_null_nodes_2 -o Global.epochs=2 -o DataLoader.Train.sampler.batch_size=8 ! +... +...
+Run successfully with command - python3.7 python/predict_cls.py -c configs/inference_cls.yaml -o Global.use_gpu=False -o Global.enable_mkldnn=True -o Global.cpu_num_threads=1 -o Global.inference_model_dir=.././t +est_tipc/output/PPLCNet_x1_0/norm_train_gpus_0,1_autocast_null_nodes_2 -o Global.batch_size=16 -o Global.infer_imgs=../dataset/ILSVRC2012/val -o Global.benchmark=True > .././test_tipc/output/PPLCNet_x1_0/infer_cpu_us +emkldnn_True_threads_1_batchsize_16.log 2>&1 ! +``` + +在配置文件中默认设置`-o Global.benchmark:True`表示开启benchmark选项,此时可以得到测试的详细数据,包含运行环境信息(系统版本、CUDA版本、CUDNN版本、驱动版本),Paddle版本信息,参数设置信息(运行设备、线程数、是否开启内存优化等),模型信息(模型名称、精度),数据信息(batchsize、是否为动态shape等),性能信息(CPU,GPU的占用、运行耗时、预处理耗时、推理耗时、后处理耗时),内容如下所示: + +```log +[2022/06/07 17:01:41] root INFO: ---------------------- Env info ---------------------- +[2022/06/07 17:01:41] root INFO: OS_version: CentOS 6.10 +[2022/06/07 17:01:41] root INFO: CUDA_version: 10.1.243 +[2022/06/07 17:01:41] root INFO: CUDNN_version: None.None.None +[2022/06/07 17:01:41] root INFO: drivier_version: 460.32.03 +[2022/06/07 17:01:41] root INFO: ---------------------- Paddle info ---------------------- +[2022/06/07 17:01:41] root INFO: paddle_version: 2.3.0-rc0 +[2022/06/07 17:01:41] root INFO: paddle_version: 2.3.0-rc0 +[2022/06/07 17:01:41] root INFO: paddle_commit: 5d4980c052583fec022812d9c29460aff7cdc18b +[2022/06/07 17:01:41] root INFO: log_api_version: 1.0 +[2022/06/07 17:01:41] root INFO: ----------------------- Conf info ----------------------- +[2022/06/07 17:01:41] root INFO: runtime_device: cpu +[2022/06/07 17:01:41] root INFO: ir_optim: True +[2022/06/07 17:01:41] root INFO: enable_memory_optim: True +[2022/06/07 17:01:41] root INFO: enable_tensorrt: False +[2022/06/07 17:01:41] root INFO: enable_mkldnn: False +[2022/06/07 17:01:41] root INFO: cpu_math_library_num_threads: 6 +[2022/06/07 17:01:41] root INFO: ----------------------- Model info ---------------------- +[2022/06/07 17:01:41] root INFO: model_name: cls +[2022/06/07 
17:01:41] root INFO: precision: fp32 +[2022/06/07 17:01:41] root INFO: ----------------------- Data info ----------------------- +[2022/06/07 17:01:41] root INFO: batch_size: 16 +[2022/06/07 17:01:41] root INFO: input_shape: [3, 224, 224] +[2022/06/07 17:01:41] root INFO: data_num: 3 +[2022/06/07 17:01:41] root INFO: ----------------------- Perf info ----------------------- +[2022/06/07 17:01:41] root INFO: cpu_rss(MB): 726.5586, gpu_rss(MB): None, gpu_util: None% +[2022/06/07 17:01:41] root INFO: total time spent(s): 0.3527 +[2022/06/07 17:01:41] root INFO: preprocess_time(ms): 33.2723, inference_time(ms): 317.9824, postprocess_time(ms): 1.4579 +``` + +该信息可以在运行log中查看,log位置在`test_tipc/output/PPLCNet_x1_0/infer_gpu_usetrt_True_precision_True_batchsize_1.log`。 + +如果运行失败,也会在终端中输出运行失败的日志信息以及对应的运行命令。可以基于该命令,分析运行失败的原因。 + +**注意:** 由于分布式训练时,仅在`trainer_id=0`所在的节点中保存模型,因此其他的节点中在运行模型导出与推理时会因为找不到保存的模型而报错,为正常现象。 diff --git a/test_tipc/prepare.sh b/test_tipc/prepare.sh index 6553a99a8..d6c90af52 100644 --- a/test_tipc/prepare.sh +++ b/test_tipc/prepare.sh @@ -43,7 +43,7 @@ function func_get_url_file_name() { model_name=$(func_parser_value "${lines[1]}") -if [ ${MODE} = "cpp_infer" ]; then +if [[ ${MODE} = "cpp_infer" ]]; then if [ -d "./deploy/cpp/opencv-3.4.7/opencv3/" ] && [ $(md5sum ./deploy/cpp/opencv-3.4.7.tar.gz | awk -F ' ' '{print $1}') = "faa2b5950f8bee3f03118e600c74746a" ]; then echo "################### build opencv skipped ###################" else @@ -151,7 +151,7 @@ if [[ $FILENAME == *use_dali* ]]; then ${python_name} -m pip install --extra-index-url https://developer.download.nvidia.com/compute/redist/nightly --upgrade nvidia-dali-nightly-cuda102 fi -if [ ${MODE} = "lite_train_lite_infer" ] || [ ${MODE} = "lite_train_whole_infer" ]; then +if [[ ${MODE} = "lite_train_lite_infer" ]] || [[ ${MODE} = "lite_train_whole_infer" ]]; then # pretrain lite train data cd dataset rm -rf ILSVRC2012 @@ -163,7 +163,7 @@ if [ ${MODE} = "lite_train_lite_infer" ] || [ ${MODE} = 
"lite_train_whole_infer" mv val.txt val_list.txt cp -r train/* val/ cd ../../ -elif [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then +elif [[ ${MODE} = "whole_infer" ]] || [[ ${MODE} = "klquant_whole_infer" ]]; then # download data cd dataset rm -rf ILSVRC2012 @@ -185,7 +185,7 @@ elif [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then eval $cmd fi -elif [ ${MODE} = "whole_train_whole_infer" ]; then +elif [[ ${MODE} = "whole_train_whole_infer" ]]; then cd dataset rm -rf ILSVRC2012 wget -nc https://paddle-imagenet-models-name.bj.bcebos.com/data/whole_chain/whole_chain_CIFAR100.tar @@ -197,7 +197,7 @@ elif [ ${MODE} = "whole_train_whole_infer" ]; then cd ../../ fi -if [ ${MODE} = "serving_infer" ]; then +if [[ ${MODE} = "serving_infer" ]]; then # prepare serving env python_name=$(func_parser_value "${lines[2]}") ${python_name} -m pip install install paddle-serving-server-gpu==0.7.0.post102 @@ -225,7 +225,7 @@ if [ ${MODE} = "serving_infer" ]; then unset https_proxy fi -if [ ${MODE} = "paddle2onnx_infer" ]; then +if [[ ${MODE} = "paddle2onnx_infer" ]]; then # prepare paddle2onnx env python_name=$(func_parser_value "${lines[2]}") inference_model_url=$(func_parser_value "${lines[10]}") @@ -241,7 +241,7 @@ if [ ${MODE} = "paddle2onnx_infer" ]; then cd ../../ fi -if [ ${MODE} = "benchmark_train" ]; then +if [[ ${MODE} = "benchmark_train" ]]; then pip install -r requirements.txt cd dataset rm -rf ILSVRC2012 diff --git a/test_tipc/test_train_inference_python.sh b/test_tipc/test_train_inference_python.sh index a567ef3c6..2f2b3b65a 100644 --- a/test_tipc/test_train_inference_python.sh +++ b/test_tipc/test_train_inference_python.sh @@ -60,12 +60,12 @@ kl_quant_cmd_value=$(func_parser_value "${lines[33]}") export_key2=$(func_parser_key "${lines[34]}") export_value2=$(func_parser_value "${lines[34]}") -# parser inference model +# parser inference model infer_model_dir_list=$(func_parser_value "${lines[36]}") 
infer_export_flag=$(func_parser_value "${lines[37]}") infer_is_quant=$(func_parser_value "${lines[38]}") -# parser inference +# parser inference inference_py=$(func_parser_value "${lines[39]}") use_gpu_key=$(func_parser_key "${lines[40]}") use_gpu_list=$(func_parser_value "${lines[40]}") @@ -90,7 +90,7 @@ infer_value1=$(func_parser_value "${lines[50]}") if [ ! $epoch_num ]; then epoch_num=2 fi -if [ $MODE = 'benchmark_train' ]; then +if [[ $MODE = 'benchmark_train' ]]; then epoch_num=1 fi @@ -106,7 +106,7 @@ function func_inference(){ _log_path=$4 _img_dir=$5 _flag_quant=$6 - # inference + # inference for use_gpu in ${use_gpu_list[*]}; do if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then for use_mkldnn in ${use_mkldnn_list[*]}; do @@ -161,7 +161,7 @@ function func_inference(){ done } -if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then +if [[ ${MODE} = "whole_infer" ]] || [[ ${MODE} = "klquant_whole_infer" ]]; then IFS="|" infer_export_flag=(${infer_export_flag}) if [ ${infer_export_flag} != "null" ] && [ ${infer_export_flag} != "False" ]; then @@ -171,7 +171,7 @@ if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then fi fi -if [ ${MODE} = "whole_infer" ]; then +if [[ ${MODE} = "whole_infer" ]]; then GPUID=$3 if [ ${#GPUID} -le 0 ];then env=" " @@ -191,7 +191,7 @@ if [ ${MODE} = "whole_infer" ]; then done cd .. 
-elif [ ${MODE} = "klquant_whole_infer" ]; then +elif [[ ${MODE} = "klquant_whole_infer" ]]; then # for kl_quant if [ ${kl_quant_cmd_value} != "null" ] && [ ${kl_quant_cmd_value} != "False" ]; then echo "kl_quant" @@ -234,7 +234,7 @@ else env=" " fi for autocast in ${autocast_list[*]}; do - for trainer in ${trainer_list[*]}; do + for trainer in ${trainer_list[*]}; do flag_quant=False if [ ${trainer} = ${pact_key} ]; then run_train=${pact_trainer} @@ -263,14 +263,16 @@ else if [ ${run_train} = "null" ]; then continue fi - + set_autocast=$(func_set_params "${autocast_key}" "${autocast}") set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}") set_pretrain=$(func_set_params "${pretrain_model_key}" "${pretrain_model_value}") set_batchsize=$(func_set_params "${train_batch_key}" "${train_batch_value}") set_train_params1=$(func_set_params "${train_param_key1}" "${train_param_value1}") set_use_gpu=$(func_set_params "${train_use_gpu_key}" "${train_use_gpu_value}") - if [ ${#ips} -le 26 ];then + if [ ${#ips} -le 15 ];then + # if length of ips > 15, then it is seen as multi-machine + # 15 is the max length of a single ip (255.255.255.255); NOTE(review): a 15-char two-node list such as 0.0.0.0,0.0.0.0 is still treated as single-machine save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}" nodes=1 else @@ -280,7 +282,7 @@ else nodes=${#ips_array[@]} save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}" fi - + # load pretrain from norm training if current trainer is pact or fpgm trainer # if [ ${trainer} = ${pact_key} ] || [ ${trainer} = ${fpgm_key} ]; then # set_pretrain="${load_norm_train_model}" @@ -289,7 +291,7 @@ else set_save_model=$(func_set_params "${save_model_key}" "${save_log}") if [ ${#gpu} -le 2 ];then # train with cpu or single gpu cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} " - elif [ ${#ips} -le 26 ];then # train with multi-gpu + elif [ ${#ips} -le 15 ];then # train with multi-gpu cmd="${python} -m
paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1}" else # train with multi-machine cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}" @@ -301,26 +303,26 @@ else eval $cmd status_check $? "${cmd}" "${status_log}" sleep 5 - + if [[ $FILENAME == *GeneralRecognition* ]]; then set_eval_pretrain=$(func_set_params "${pretrain_model_key}" "${save_log}/RecModel/${train_model_name}") else set_eval_pretrain=$(func_set_params "${pretrain_model_key}" "${save_log}/${model_name}/${train_model_name}") fi - # save norm trained models to set pretrain for pact training and fpgm training + # save norm trained models to set pretrain for pact training and fpgm training if [ ${trainer} = ${trainer_norm} ]; then load_norm_train_model=${set_eval_pretrain} fi - # run eval + # run eval if [ ${eval_py} != "null" ]; then set_eval_params1=$(func_set_params "${eval_key1}" "${eval_value1}") - eval_cmd="${python} ${eval_py} ${set_eval_pretrain} ${set_use_gpu} ${set_eval_params1}" + eval_cmd="${python} ${eval_py} ${set_eval_pretrain} ${set_use_gpu} ${set_eval_params1}" eval $eval_cmd status_check $? "${eval_cmd}" "${status_log}" sleep 5 fi # run export model - if [ ${run_export} != "null" ]; then + if [ ${run_export} != "null" ]; then # run export model save_infer_path="${save_log}" if [[ $FILENAME == *GeneralRecognition* ]]; then @@ -341,7 +343,7 @@ else cd .. fi eval "unset CUDA_VISIBLE_DEVICES" - done # done with: for trainer in ${trainer_list[*]}; do - done # done with: for autocast in ${autocast_list[*]}; do + done # done with: for trainer in ${trainer_list[*]}; do + done # done with: for autocast in ${autocast_list[*]}; do done # done with: for gpu in ${gpu_list[*]}; do fi # end if [ ${MODE} = "infer" ]; then