add train_fleet_infer chain
parent
787f91b615
commit
e8ced77fdb
|
@ -112,3 +112,4 @@ bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/MobileNetV3/Mo
|
|||
- [test_lite_arm_cpu_cpp 使用](docs/test_lite_arm_cpu_cpp.md): 测试基于Paddle-Lite的ARM CPU端c++预测部署功能.
|
||||
- [test_paddle2onnx 使用](docs/test_paddle2onnx.md):测试Paddle2ONNX的模型转化功能,并验证正确性。
|
||||
- [test_serving_infer_python 使用](docs/test_serving_infer_python.md):测试python serving功能。
|
||||
- [test_train_fleet_inference_python 使用](./docs/test_train_fleet_inference_python.md):测试基于Python的多机多卡训练与推理等基本功能。
|
||||
|
|
|
@ -0,0 +1,54 @@
|
|||
===========================train_params===========================
|
||||
model_name:GeneralRecognition_PPLCNet_x2_5
|
||||
python:python3.7
|
||||
gpu_list:192.168.0.1,192.168.0.2;0,1
|
||||
-o Global.device:gpu
|
||||
-o Global.auto_cast:null
|
||||
-o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120
|
||||
-o Global.output_dir:./output/
|
||||
-o DataLoader.Train.sampler.batch_size:8
|
||||
-o Global.pretrained_model:null
|
||||
train_model_name:latest
|
||||
train_infer_img_dir:./dataset/ILSVRC2012/val
|
||||
null:null
|
||||
##
|
||||
trainer:norm_train
|
||||
norm_train:tools/train.py -c ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False
|
||||
pact_train:null
|
||||
fpgm_train:null
|
||||
distill_train:null
|
||||
null:null
|
||||
null:null
|
||||
##
|
||||
===========================eval_params===========================
|
||||
eval:tools/eval.py -c ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml
|
||||
null:null
|
||||
##
|
||||
===========================infer_params==========================
|
||||
-o Global.save_inference_dir:./inference
|
||||
-o Global.pretrained_model:
|
||||
norm_export:tools/export_model.py -c ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml
|
||||
quant_export:null
|
||||
fpgm_export:null
|
||||
distill_export:null
|
||||
kl_quant:null
|
||||
export2:null
|
||||
pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/general_PPLCNet_x2_5_pretrained_v1.0.pdparams
|
||||
infer_model:../inference/
|
||||
infer_export:True
|
||||
infer_quant:False
|
||||
inference:python/predict_rec.py -c configs/inference_rec.yaml
|
||||
-o Global.use_gpu:True|False
|
||||
-o Global.enable_mkldnn:True|False
|
||||
-o Global.cpu_num_threads:1|6
|
||||
-o Global.batch_size:1|16
|
||||
-o Global.use_tensorrt:True|False
|
||||
-o Global.use_fp16:True|False
|
||||
-o Global.rec_inference_model_dir:../inference
|
||||
-o Global.infer_imgs:../dataset/Aliproduct/demo_test/
|
||||
-o Global.save_log_path:null
|
||||
-o Global.benchmark:True
|
||||
null:null
|
||||
null:null
|
||||
===========================infer_benchmark_params==========================
|
||||
random_infer_input:[{float32,[3,224,224]}]
|
|
@ -0,0 +1,53 @@
|
|||
===========================train_params===========================
|
||||
model_name:PPHGNet_small
|
||||
python:python3.7
|
||||
gpu_list:192.168.0.1,192.168.0.2;0,1
|
||||
-o Global.device:gpu
|
||||
-o Global.auto_cast:null
|
||||
-o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120
|
||||
-o Global.output_dir:./output/
|
||||
-o DataLoader.Train.sampler.batch_size:8
|
||||
-o Global.pretrained_model:null
|
||||
train_model_name:latest
|
||||
train_infer_img_dir:./dataset/ILSVRC2012/val
|
||||
null:null
|
||||
##
|
||||
trainer:norm_train
|
||||
norm_train:tools/train.py -c ppcls/configs/ImageNet/PPHGNet/PPHGNet_small.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False
|
||||
pact_train:null
|
||||
fpgm_train:null
|
||||
distill_train:null
|
||||
null:null
|
||||
null:null
|
||||
##
|
||||
===========================eval_params===========================
|
||||
eval:tools/eval.py -c ppcls/configs/ImageNet/PPHGNet/PPHGNet_small.yaml
|
||||
null:null
|
||||
##
|
||||
===========================infer_params==========================
|
||||
-o Global.save_inference_dir:./inference
|
||||
-o Global.pretrained_model:
|
||||
norm_export:tools/export_model.py -c ppcls/configs/ImageNet/PPHGNet/PPHGNet_small.yaml
|
||||
quant_export:null
|
||||
fpgm_export:null
|
||||
distill_export:null
|
||||
kl_quant:null
|
||||
export2:null
|
||||
pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNet_small_pretrained.pdparams
|
||||
infer_model:../inference/
|
||||
infer_export:True
|
||||
infer_quant:False
|
||||
inference:python/predict_cls.py -c configs/inference_cls.yaml -o PreProcess.transform_ops.0.ResizeImage.resize_short=236
|
||||
-o Global.use_gpu:True|False
|
||||
-o Global.enable_mkldnn:True|False
|
||||
-o Global.cpu_num_threads:1|6
|
||||
-o Global.batch_size:1|16
|
||||
-o Global.use_tensorrt:True|False
|
||||
-o Global.use_fp16:True|False
|
||||
-o Global.inference_model_dir:../inference
|
||||
-o Global.infer_imgs:../dataset/ILSVRC2012/val
|
||||
-o Global.save_log_path:null
|
||||
-o Global.benchmark:True
|
||||
null:null
|
||||
===========================infer_benchmark_params==========================
|
||||
random_infer_input:[{float32,[3,224,224]}]
|
|
@ -0,0 +1,53 @@
|
|||
===========================train_params===========================
|
||||
model_name:PPLCNet_x1_0
|
||||
python:python3.7
|
||||
gpu_list:192.168.0.1,192.168.0.2;0,1
|
||||
-o Global.device:gpu
|
||||
-o Global.auto_cast:null
|
||||
-o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120
|
||||
-o Global.output_dir:./output/
|
||||
-o DataLoader.Train.sampler.batch_size:8
|
||||
-o Global.pretrained_model:null
|
||||
train_model_name:latest
|
||||
train_infer_img_dir:./dataset/ILSVRC2012/val
|
||||
null:null
|
||||
##
|
||||
trainer:norm_train
|
||||
norm_train:tools/train.py -c ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False
|
||||
pact_train:null
|
||||
fpgm_train:null
|
||||
distill_train:null
|
||||
null:null
|
||||
null:null
|
||||
##
|
||||
===========================eval_params===========================
|
||||
eval:tools/eval.py -c ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml
|
||||
null:null
|
||||
##
|
||||
===========================infer_params==========================
|
||||
-o Global.save_inference_dir:./inference
|
||||
-o Global.pretrained_model:
|
||||
norm_export:tools/export_model.py -c ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml
|
||||
quant_export:null
|
||||
fpgm_export:null
|
||||
distill_export:null
|
||||
kl_quant:null
|
||||
export2:null
|
||||
pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x1_0_pretrained.pdparams
|
||||
infer_model:../inference/
|
||||
infer_export:True
|
||||
infer_quant:False
|
||||
inference:python/predict_cls.py -c configs/inference_cls.yaml
|
||||
-o Global.use_gpu:True|False
|
||||
-o Global.enable_mkldnn:True|False
|
||||
-o Global.cpu_num_threads:1|6
|
||||
-o Global.batch_size:1|16
|
||||
-o Global.use_tensorrt:True|False
|
||||
-o Global.use_fp16:True|False
|
||||
-o Global.inference_model_dir:../inference
|
||||
-o Global.infer_imgs:../dataset/ILSVRC2012/val
|
||||
-o Global.save_log_path:null
|
||||
-o Global.benchmark:True
|
||||
null:null
|
||||
===========================infer_benchmark_params==========================
|
||||
random_infer_input:[{float32,[3,224,224]}]
|
|
@ -0,0 +1,53 @@
|
|||
===========================train_params===========================
|
||||
model_name:PPLCNetV2_base
|
||||
python:python3.7
|
||||
gpu_list:192.168.0.1,192.168.0.2;0,1
|
||||
-o Global.device:gpu
|
||||
-o Global.auto_cast:null
|
||||
-o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120
|
||||
-o Global.output_dir:./output/
|
||||
-o DataLoader.Train.sampler.first_bs:8
|
||||
-o Global.pretrained_model:null
|
||||
train_model_name:latest
|
||||
train_infer_img_dir:./dataset/ILSVRC2012/val
|
||||
null:null
|
||||
##
|
||||
trainer:norm_train
|
||||
norm_train:tools/train.py -c ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_base.yaml -o Global.seed=1234 -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False
|
||||
pact_train:null
|
||||
fpgm_train:null
|
||||
distill_train:null
|
||||
null:null
|
||||
null:null
|
||||
##
|
||||
===========================eval_params===========================
|
||||
eval:tools/eval.py -c ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_base.yaml
|
||||
null:null
|
||||
##
|
||||
===========================infer_params==========================
|
||||
-o Global.save_inference_dir:./inference
|
||||
-o Global.pretrained_model:
|
||||
norm_export:tools/export_model.py -c ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_base.yaml
|
||||
quant_export:null
|
||||
fpgm_export:null
|
||||
distill_export:null
|
||||
kl_quant:null
|
||||
export2:null
|
||||
pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNetV2_base_pretrained.pdparams
|
||||
infer_model:../inference/
|
||||
infer_export:True
|
||||
infer_quant:False
|
||||
inference:python/predict_cls.py -c configs/inference_cls.yaml
|
||||
-o Global.use_gpu:True|False
|
||||
-o Global.enable_mkldnn:True|False
|
||||
-o Global.cpu_num_threads:1|6
|
||||
-o Global.batch_size:1|16
|
||||
-o Global.use_tensorrt:True|False
|
||||
-o Global.use_fp16:True|False
|
||||
-o Global.inference_model_dir:../inference
|
||||
-o Global.infer_imgs:../dataset/ILSVRC2012/val
|
||||
-o Global.save_log_path:null
|
||||
-o Global.benchmark:True
|
||||
null:null
|
||||
===========================infer_benchmark_params==========================
|
||||
random_infer_input:[{float32,[3,224,224]}]
|
|
@ -0,0 +1,121 @@
|
|||
# Linux GPU/CPU 多机多卡训练推理测试
|
||||
|
||||
Linux GPU/CPU 多机多卡训练推理测试的主程序为`test_train_inference_python.sh`,可以测试基于Python的多机多卡模型训练、评估、推理等基本功能。
|
||||
|
||||
## 1. 测试结论汇总
|
||||
|
||||
- 训练相关:
|
||||
|
||||
| 算法名称 | 模型名称 | 多机多卡 |
|
||||
| :-------: | :-----------------: | :--------: |
|
||||
| PPLCNet | PPLCNet_x1_0 | 分布式训练 |
|
||||
| PPLCNetV2 | PPLCNetV2_base | 分布式训练 |
|
||||
| PPHGNet | PPHGNet_small | 分布式训练 |
|
||||
| PP-ShiTu | PPShiTu_general_rec | 分布式训练 |
|
||||
|
||||
|
||||
- 推理相关:
|
||||
|
||||
| 算法名称 | 模型名称 | device_CPU | device_GPU | batchsize |
|
||||
| :-------: | :-----------------: | :--------: | :--------: | :-------: |
|
||||
| PPLCNet | PPLCNet_x1_0 | 支持 | 支持 | 1 |
|
||||
| PPLCNetV2 | PPLCNetV2_base | 支持 | 支持 | 1 |
|
||||
| PPHGNet | PPHGNet_small | 支持 | 支持 | 1 |
|
||||
| PP-ShiTu | PPShiTu_general_rec | 支持 | 支持 | 1 |
|
||||
|
||||
|
||||
## 2. 测试流程
|
||||
|
||||
运行环境配置请参考[文档](./install.md)的内容配置TIPC的运行环境。
|
||||
|
||||
**下面以 PPLCNet_x1_0 模型为例,介绍测试流程**
|
||||
|
||||
### 2.1 功能测试
|
||||
|
||||
#### 2.1.1 修改配置文件
|
||||
|
||||
首先,修改配置文件`test_tipc/configs/PPLCNet/PPLCNet_x1_0_train_fleet_infer_python.txt`中的`gpu_list`设置:假设两台机器的`ip`地址分别为`192.168.0.1`和`192.168.0.2`,则对应的配置文件`gpu_list`字段需要修改为`gpu_list:192.168.0.1,192.168.0.2;0,1`。
|
||||
|
||||
**`ip`地址查看命令为`ifconfig`,在`inet addr:`字段后的即为ip地址**。
|
||||
|
||||
|
||||
#### 2.1.2 准备数据
|
||||
|
||||
运行`prepare.sh`准备数据和模型,数据准备命令如下所示。
|
||||
|
||||
```shell
|
||||
bash test_tipc/prepare.sh test_tipc/configs/PPLCNet/PPLCNet_x1_0_train_fleet_infer_python.txt lite_train_lite_infer
|
||||
```
|
||||
|
||||
**注意:** 由于是多机训练,这里需要在所有节点上都运行一次上述命令来准备数据。
|
||||
|
||||
#### 2.1.3 修改起始端口开始测试
|
||||
|
||||
在多机的节点上使用下面的命令设置分布式的起始端口(否则后面运行的时候会由于无法找到运行端口而hang住),一般建议设置在`10000~20000`之间。
|
||||
|
||||
```shell
|
||||
export FLAGS_START_PORT=17000
|
||||
```
|
||||
**注意:** 上述修改起始端口命令同样需要在所有节点上都执行一次。
|
||||
|
||||
接下来就可以开始执行测试,命令如下所示。
|
||||
```shell
|
||||
bash test_tipc/test_train_inference_python.sh test_tipc/configs/PPLCNet/PPLCNet_x1_0_train_fleet_infer_python.txt
|
||||
```
|
||||
|
||||
**注意:** 由于是多机训练,这里需要在所有的节点上均运行上述命令进行测试。
|
||||
|
||||
|
||||
#### 2.1.4 输出结果
|
||||
|
||||
输出结果保存在`test_tipc/output/PPLCNet_x1_0/results_python.log`,内容如下,以`Run successfully`开头表示测试命令正常,否则为测试失败。
|
||||
|
||||
```bash
|
||||
Run successfully with command - python3.7 -m paddle.distributed.launch --ips=192.168.0.1,192.168.0.2 --gpus=0,1 tools/train.py -c ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml -o Global.seed=1234 -o DataL
|
||||
oader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False -o Global.device=gpu -o Global.output_dir=./test_tipc/output/PPLCNet_x1_0/norm_train_gpus_0,
|
||||
1_autocast_null_nodes_2 -o Global.epochs=2 -o DataLoader.Train.sampler.batch_size=8 !
|
||||
...
|
||||
...
|
||||
Run successfully with command - python3.7 python/predict_cls.py -c configs/inference_cls.yaml -o Global.use_gpu=False -o Global.enable_mkldnn=True -o Global.cpu_num_threads=1 -o Global.inference_model_dir=.././t
|
||||
est_tipc/output/PPLCNet_x1_0/norm_train_gpus_0,1_autocast_null_nodes_2 -o Global.batch_size=16 -o Global.infer_imgs=../dataset/ILSVRC2012/val -o Global.benchmark=True > .././test_tipc/output/PPLCNet_x1_0/infer_cpu_us
|
||||
emkldnn_True_threads_1_batchsize_16.log 2>&1 !
|
||||
```
|
||||
|
||||
在配置文件中默认设置`-o Global.benchmark:True`表示开启benchmark选项,此时可以得到测试的详细数据,包含运行环境信息(系统版本、CUDA版本、CUDNN版本、驱动版本),Paddle版本信息,参数设置信息(运行设备、线程数、是否开启内存优化等),模型信息(模型名称、精度),数据信息(batchsize、是否为动态shape等),性能信息(CPU,GPU的占用、运行耗时、预处理耗时、推理耗时、后处理耗时),内容如下所示:
|
||||
|
||||
```log
|
||||
[2022/06/07 17:01:41] root INFO: ---------------------- Env info ----------------------
|
||||
[2022/06/07 17:01:41] root INFO: OS_version: CentOS 6.10
|
||||
[2022/06/07 17:01:41] root INFO: CUDA_version: 10.1.243
|
||||
[2022/06/07 17:01:41] root INFO: CUDNN_version: None.None.None
|
||||
[2022/06/07 17:01:41] root INFO: drivier_version: 460.32.03
|
||||
[2022/06/07 17:01:41] root INFO: ---------------------- Paddle info ----------------------
|
||||
[2022/06/07 17:01:41] root INFO: paddle_version: 2.3.0-rc0
|
||||
[2022/06/07 17:01:41] root INFO: paddle_version: 2.3.0-rc0
|
||||
[2022/06/07 17:01:41] root INFO: paddle_commit: 5d4980c052583fec022812d9c29460aff7cdc18b
|
||||
[2022/06/07 17:01:41] root INFO: log_api_version: 1.0
|
||||
[2022/06/07 17:01:41] root INFO: ----------------------- Conf info -----------------------
|
||||
[2022/06/07 17:01:41] root INFO: runtime_device: cpu
|
||||
[2022/06/07 17:01:41] root INFO: ir_optim: True
|
||||
[2022/06/07 17:01:41] root INFO: enable_memory_optim: True
|
||||
[2022/06/07 17:01:41] root INFO: enable_tensorrt: False
|
||||
[2022/06/07 17:01:41] root INFO: enable_mkldnn: False
|
||||
[2022/06/07 17:01:41] root INFO: cpu_math_library_num_threads: 6
|
||||
[2022/06/07 17:01:41] root INFO: ----------------------- Model info ----------------------
|
||||
[2022/06/07 17:01:41] root INFO: model_name: cls
|
||||
[2022/06/07 17:01:41] root INFO: precision: fp32
|
||||
[2022/06/07 17:01:41] root INFO: ----------------------- Data info -----------------------
|
||||
[2022/06/07 17:01:41] root INFO: batch_size: 16
|
||||
[2022/06/07 17:01:41] root INFO: input_shape: [3, 224, 224]
|
||||
[2022/06/07 17:01:41] root INFO: data_num: 3
|
||||
[2022/06/07 17:01:41] root INFO: ----------------------- Perf info -----------------------
|
||||
[2022/06/07 17:01:41] root INFO: cpu_rss(MB): 726.5586, gpu_rss(MB): None, gpu_util: None%
|
||||
[2022/06/07 17:01:41] root INFO: total time spent(s): 0.3527
|
||||
[2022/06/07 17:01:41] root INFO: preprocess_time(ms): 33.2723, inference_time(ms): 317.9824, postprocess_time(ms): 1.4579
|
||||
```
|
||||
|
||||
该信息可以在运行log中查看,例如上述CPU推理的log位置在`test_tipc/output/PPLCNet_x1_0/infer_cpu_usemkldnn_False_threads_6_batchsize_16.log`。
|
||||
|
||||
如果运行失败,也会在终端中输出运行失败的日志信息以及对应的运行命令。可以基于该命令,分析运行失败的原因。
|
||||
|
||||
**注意:** 由于分布式训练时,仅在`trainer_id=0`所在的节点中保存模型,因此其他的节点中在运行模型导出与推理时会因为找不到保存的模型而报错,为正常现象。
|
|
@ -43,7 +43,7 @@ function func_get_url_file_name() {
|
|||
|
||||
model_name=$(func_parser_value "${lines[1]}")
|
||||
|
||||
if [ ${MODE} = "cpp_infer" ]; then
|
||||
if [[ ${MODE} = "cpp_infer" ]]; then
|
||||
if [ -d "./deploy/cpp/opencv-3.4.7/opencv3/" ] && [ $(md5sum ./deploy/cpp/opencv-3.4.7.tar.gz | awk -F ' ' '{print $1}') = "faa2b5950f8bee3f03118e600c74746a" ]; then
|
||||
echo "################### build opencv skipped ###################"
|
||||
else
|
||||
|
@ -151,7 +151,7 @@ if [[ $FILENAME == *use_dali* ]]; then
|
|||
${python_name} -m pip install --extra-index-url https://developer.download.nvidia.com/compute/redist/nightly --upgrade nvidia-dali-nightly-cuda102
|
||||
fi
|
||||
|
||||
if [ ${MODE} = "lite_train_lite_infer" ] || [ ${MODE} = "lite_train_whole_infer" ]; then
|
||||
if [[ ${MODE} = "lite_train_lite_infer" ]] || [[ ${MODE} = "lite_train_whole_infer" ]]; then
|
||||
# pretrain lite train data
|
||||
cd dataset
|
||||
rm -rf ILSVRC2012
|
||||
|
@ -163,7 +163,7 @@ if [ ${MODE} = "lite_train_lite_infer" ] || [ ${MODE} = "lite_train_whole_infer"
|
|||
mv val.txt val_list.txt
|
||||
cp -r train/* val/
|
||||
cd ../../
|
||||
elif [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then
|
||||
elif [[ ${MODE} = "whole_infer" ]] || [[ ${MODE} = "klquant_whole_infer" ]]; then
|
||||
# download data
|
||||
cd dataset
|
||||
rm -rf ILSVRC2012
|
||||
|
@ -185,7 +185,7 @@ elif [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then
|
|||
eval $cmd
|
||||
fi
|
||||
|
||||
elif [ ${MODE} = "whole_train_whole_infer" ]; then
|
||||
elif [[ ${MODE} = "whole_train_whole_infer" ]]; then
|
||||
cd dataset
|
||||
rm -rf ILSVRC2012
|
||||
wget -nc https://paddle-imagenet-models-name.bj.bcebos.com/data/whole_chain/whole_chain_CIFAR100.tar
|
||||
|
@ -197,7 +197,7 @@ elif [ ${MODE} = "whole_train_whole_infer" ]; then
|
|||
cd ../../
|
||||
fi
|
||||
|
||||
if [ ${MODE} = "serving_infer" ]; then
|
||||
if [[ ${MODE} = "serving_infer" ]]; then
|
||||
# prepare serving env
|
||||
python_name=$(func_parser_value "${lines[2]}")
|
||||
${python_name} -m pip install install paddle-serving-server-gpu==0.7.0.post102
|
||||
|
@ -225,7 +225,7 @@ if [ ${MODE} = "serving_infer" ]; then
|
|||
unset https_proxy
|
||||
fi
|
||||
|
||||
if [ ${MODE} = "paddle2onnx_infer" ]; then
|
||||
if [[ ${MODE} = "paddle2onnx_infer" ]]; then
|
||||
# prepare paddle2onnx env
|
||||
python_name=$(func_parser_value "${lines[2]}")
|
||||
inference_model_url=$(func_parser_value "${lines[10]}")
|
||||
|
@ -241,7 +241,7 @@ if [ ${MODE} = "paddle2onnx_infer" ]; then
|
|||
cd ../../
|
||||
fi
|
||||
|
||||
if [ ${MODE} = "benchmark_train" ]; then
|
||||
if [[ ${MODE} = "benchmark_train" ]]; then
|
||||
pip install -r requirements.txt
|
||||
cd dataset
|
||||
rm -rf ILSVRC2012
|
||||
|
|
|
@ -60,12 +60,12 @@ kl_quant_cmd_value=$(func_parser_value "${lines[33]}")
|
|||
export_key2=$(func_parser_key "${lines[34]}")
|
||||
export_value2=$(func_parser_value "${lines[34]}")
|
||||
|
||||
# parser inference model
|
||||
# parser inference model
|
||||
infer_model_dir_list=$(func_parser_value "${lines[36]}")
|
||||
infer_export_flag=$(func_parser_value "${lines[37]}")
|
||||
infer_is_quant=$(func_parser_value "${lines[38]}")
|
||||
|
||||
# parser inference
|
||||
# parser inference
|
||||
inference_py=$(func_parser_value "${lines[39]}")
|
||||
use_gpu_key=$(func_parser_key "${lines[40]}")
|
||||
use_gpu_list=$(func_parser_value "${lines[40]}")
|
||||
|
@ -90,7 +90,7 @@ infer_value1=$(func_parser_value "${lines[50]}")
|
|||
if [ ! $epoch_num ]; then
|
||||
epoch_num=2
|
||||
fi
|
||||
if [ $MODE = 'benchmark_train' ]; then
|
||||
if [[ $MODE = 'benchmark_train' ]]; then
|
||||
epoch_num=1
|
||||
fi
|
||||
|
||||
|
@ -106,7 +106,7 @@ function func_inference(){
|
|||
_log_path=$4
|
||||
_img_dir=$5
|
||||
_flag_quant=$6
|
||||
# inference
|
||||
# inference
|
||||
for use_gpu in ${use_gpu_list[*]}; do
|
||||
if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then
|
||||
for use_mkldnn in ${use_mkldnn_list[*]}; do
|
||||
|
@ -161,7 +161,7 @@ function func_inference(){
|
|||
done
|
||||
}
|
||||
|
||||
if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then
|
||||
if [[ ${MODE} = "whole_infer" ]] || [[ ${MODE} = "klquant_whole_infer" ]]; then
|
||||
IFS="|"
|
||||
infer_export_flag=(${infer_export_flag})
|
||||
if [ ${infer_export_flag} != "null" ] && [ ${infer_export_flag} != "False" ]; then
|
||||
|
@ -171,7 +171,7 @@ if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then
|
|||
fi
|
||||
fi
|
||||
|
||||
if [ ${MODE} = "whole_infer" ]; then
|
||||
if [[ ${MODE} = "whole_infer" ]]; then
|
||||
GPUID=$3
|
||||
if [ ${#GPUID} -le 0 ];then
|
||||
env=" "
|
||||
|
@ -191,7 +191,7 @@ if [ ${MODE} = "whole_infer" ]; then
|
|||
done
|
||||
cd ..
|
||||
|
||||
elif [ ${MODE} = "klquant_whole_infer" ]; then
|
||||
elif [[ ${MODE} = "klquant_whole_infer" ]]; then
|
||||
# for kl_quant
|
||||
if [ ${kl_quant_cmd_value} != "null" ] && [ ${kl_quant_cmd_value} != "False" ]; then
|
||||
echo "kl_quant"
|
||||
|
@ -234,7 +234,7 @@ else
|
|||
env=" "
|
||||
fi
|
||||
for autocast in ${autocast_list[*]}; do
|
||||
for trainer in ${trainer_list[*]}; do
|
||||
for trainer in ${trainer_list[*]}; do
|
||||
flag_quant=False
|
||||
if [ ${trainer} = ${pact_key} ]; then
|
||||
run_train=${pact_trainer}
|
||||
|
@ -263,14 +263,16 @@ else
|
|||
if [ ${run_train} = "null" ]; then
|
||||
continue
|
||||
fi
|
||||
|
||||
|
||||
set_autocast=$(func_set_params "${autocast_key}" "${autocast}")
|
||||
set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}")
|
||||
set_pretrain=$(func_set_params "${pretrain_model_key}" "${pretrain_model_value}")
|
||||
set_batchsize=$(func_set_params "${train_batch_key}" "${train_batch_value}")
|
||||
set_train_params1=$(func_set_params "${train_param_key1}" "${train_param_value1}")
|
||||
set_use_gpu=$(func_set_params "${train_use_gpu_key}" "${train_use_gpu_value}")
|
||||
if [ ${#ips} -le 26 ];then
|
||||
if [ ${#ips} -le 15 ];then
|
||||
# if length of ips >= 15, then it is seen as multi-machine
|
||||
# 15 is the min length of ips info for multi-machine: 0.0.0.0,0.0.0.0
|
||||
save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}"
|
||||
nodes=1
|
||||
else
|
||||
|
@ -280,7 +282,7 @@ else
|
|||
nodes=${#ips_array[@]}
|
||||
save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}"
|
||||
fi
|
||||
|
||||
|
||||
# load pretrain from norm training if current trainer is pact or fpgm trainer
|
||||
# if [ ${trainer} = ${pact_key} ] || [ ${trainer} = ${fpgm_key} ]; then
|
||||
# set_pretrain="${load_norm_train_model}"
|
||||
|
@ -289,7 +291,7 @@ else
|
|||
set_save_model=$(func_set_params "${save_model_key}" "${save_log}")
|
||||
if [ ${#gpu} -le 2 ];then # train with cpu or single gpu
|
||||
cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} "
|
||||
elif [ ${#ips} -le 26 ];then # train with multi-gpu
|
||||
elif [ ${#ips} -le 15 ];then # train with multi-gpu
|
||||
cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1}"
|
||||
else # train with multi-machine
|
||||
cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}"
|
||||
|
@ -301,26 +303,26 @@ else
|
|||
eval $cmd
|
||||
status_check $? "${cmd}" "${status_log}"
|
||||
sleep 5
|
||||
|
||||
|
||||
if [[ $FILENAME == *GeneralRecognition* ]]; then
|
||||
set_eval_pretrain=$(func_set_params "${pretrain_model_key}" "${save_log}/RecModel/${train_model_name}")
|
||||
else
|
||||
set_eval_pretrain=$(func_set_params "${pretrain_model_key}" "${save_log}/${model_name}/${train_model_name}")
|
||||
fi
|
||||
# save norm trained models to set pretrain for pact training and fpgm training
|
||||
# save norm trained models to set pretrain for pact training and fpgm training
|
||||
if [ ${trainer} = ${trainer_norm} ]; then
|
||||
load_norm_train_model=${set_eval_pretrain}
|
||||
fi
|
||||
# run eval
|
||||
# run eval
|
||||
if [ ${eval_py} != "null" ]; then
|
||||
set_eval_params1=$(func_set_params "${eval_key1}" "${eval_value1}")
|
||||
eval_cmd="${python} ${eval_py} ${set_eval_pretrain} ${set_use_gpu} ${set_eval_params1}"
|
||||
eval_cmd="${python} ${eval_py} ${set_eval_pretrain} ${set_use_gpu} ${set_eval_params1}"
|
||||
eval $eval_cmd
|
||||
status_check $? "${eval_cmd}" "${status_log}"
|
||||
sleep 5
|
||||
fi
|
||||
# run export model
|
||||
if [ ${run_export} != "null" ]; then
|
||||
if [ ${run_export} != "null" ]; then
|
||||
# run export model
|
||||
save_infer_path="${save_log}"
|
||||
if [[ $FILENAME == *GeneralRecognition* ]]; then
|
||||
|
@ -341,7 +343,7 @@ else
|
|||
cd ..
|
||||
fi
|
||||
eval "unset CUDA_VISIBLE_DEVICES"
|
||||
done # done with: for trainer in ${trainer_list[*]}; do
|
||||
done # done with: for autocast in ${autocast_list[*]}; do
|
||||
done # done with: for trainer in ${trainer_list[*]}; do
|
||||
done # done with: for autocast in ${autocast_list[*]}; do
|
||||
done # done with: for gpu in ${gpu_list[*]}; do
|
||||
fi # end if [ ${MODE} = "infer" ]; then
|
||||
|
|
Loading…
Reference in New Issue