add train_fleet_infer chain

2022-06-07 17:49:13 +08:00 · 2022-06-07 17:49:13 +08:00 · e8ced77fdb
parent 787f91b615
commit e8ced77fdb
8 changed files with 363 additions and 26 deletions
--- a/test_tipc/README.md
+++ b/test_tipc/README.md
@ -112,3 +112,4 @@ bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/MobileNetV3/Mo
 - [test_lite_arm_cpu_cpp 使用](docs/test_lite_arm_cpu_cpp.md): 测试基于Paddle-Lite的ARM CPU端c++预测部署功能.
 - [test_paddle2onnx 使用](docs/test_paddle2onnx.md)：测试Paddle2ONNX的模型转化功能，并验证正确性。
 - [test_serving_infer_python 使用](docs/test_serving_infer_python.md)：测试python serving功能。
+- [test_train_fleet_inference_python 使用](./docs/test_train_fleet_inference_python.md)：测试基于Python的多机多卡训练与推理等基本功能。
--- a/test_tipc/config/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_train_fleet_infer_python.txt
+++ b/test_tipc/config/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_train_fleet_infer_python.txt
@ -0,0 +1,54 @@
+===========================train_params===========================
+model_name:GeneralRecognition_PPLCNet_x2_5
+python:python3.7
+gpu_list:192.168.0.1,192.168.0.2;0,1
+-o Global.device:gpu
+-o Global.auto_cast:null
+-o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120
+-o Global.output_dir:./output/
+-o DataLoader.Train.sampler.batch_size:8
+-o Global.pretrained_model:null
+train_model_name:latest
+train_infer_img_dir:./dataset/ILSVRC2012/val
+null:null
+##
+trainer:norm_train
+norm_train:tools/train.py -c ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params=========================== 
+eval:tools/eval.py -c ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml
+null:null
+##
+===========================infer_params==========================
+-o Global.save_inference_dir:./inference
+-o Global.pretrained_model:
+norm_export:tools/export_model.py -c ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml
+quant_export:null
+fpgm_export:null
+distill_export:null
+kl_quant:null
+export2:null
+pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/general_PPLCNet_x2_5_pretrained_v1.0.pdparams
+infer_model:../inference/
+infer_export:True
+infer_quant:False
+inference:python/predict_rec.py -c configs/inference_rec.yaml
+-o Global.use_gpu:True|False
+-o Global.enable_mkldnn:True|False
+-o Global.cpu_num_threads:1|6
+-o Global.batch_size:1|16
+-o Global.use_tensorrt:True|False
+-o Global.use_fp16:True|False
+-o Global.rec_inference_model_dir:../inference
+-o Global.infer_imgs:../dataset/Aliproduct/demo_test/
+-o Global.save_log_path:null
+-o Global.benchmark:True
+null:null
+null:null
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[3,224,224]}]
--- a/test_tipc/config/PPHGNet/PPHGNet_small_fleet_train_infer_python.txt
+++ b/test_tipc/config/PPHGNet/PPHGNet_small_fleet_train_infer_python.txt
@ -0,0 +1,53 @@
+===========================train_params===========================
+model_name:PPHGNet_small
+python:python3.7
+gpu_list:192.168.0.1,192.168.0.2;0,1
+-o Global.device:gpu
+-o Global.auto_cast:null
+-o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120
+-o Global.output_dir:./output/
+-o DataLoader.Train.sampler.batch_size:8
+-o Global.pretrained_model:null
+train_model_name:latest
+train_infer_img_dir:./dataset/ILSVRC2012/val
+null:null
+##
+trainer:norm_train
+norm_train:tools/train.py -c ppcls/configs/ImageNet/PPHGNet/PPHGNet_small.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params=========================== 
+eval:tools/eval.py -c ppcls/configs/ImageNet/PPHGNet/PPHGNet_small.yaml
+null:null
+##
+===========================infer_params==========================
+-o Global.save_inference_dir:./inference
+-o Global.pretrained_model:
+norm_export:tools/export_model.py -c ppcls/configs/ImageNet/PPHGNet/PPHGNet_small.yaml
+quant_export:null
+fpgm_export:null
+distill_export:null
+kl_quant:null
+export2:null
+pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNet_small_pretrained.pdparams
+infer_model:../inference/
+infer_export:True
+infer_quant:False
+inference:python/predict_cls.py -c configs/inference_cls.yaml -o PreProcess.transform_ops.0.ResizeImage.resize_short=236
+-o Global.use_gpu:True|False
+-o Global.enable_mkldnn:True|False
+-o Global.cpu_num_threads:1|6
+-o Global.batch_size:1|16
+-o Global.use_tensorrt:True|False
+-o Global.use_fp16:True|False
+-o Global.inference_model_dir:../inference
+-o Global.infer_imgs:../dataset/ILSVRC2012/val
+-o Global.save_log_path:null
+-o Global.benchmark:True
+null:null
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[3,224,224]}]
--- a/test_tipc/config/PPLCNet/PPLCNet_x1_0_train_fleet_infer_python.txt
+++ b/test_tipc/config/PPLCNet/PPLCNet_x1_0_train_fleet_infer_python.txt
@ -0,0 +1,53 @@
+===========================train_params===========================
+model_name:PPLCNet_x1_0
+python:python3.7
+gpu_list:192.168.0.1,192.168.0.2;0,1
+-o Global.device:gpu
+-o Global.auto_cast:null
+-o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120
+-o Global.output_dir:./output/
+-o DataLoader.Train.sampler.batch_size:8
+-o Global.pretrained_model:null
+train_model_name:latest
+train_infer_img_dir:./dataset/ILSVRC2012/val
+null:null
+##
+trainer:norm_train
+norm_train:tools/train.py -c ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params=========================== 
+eval:tools/eval.py -c ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml
+null:null
+##
+===========================infer_params==========================
+-o Global.save_inference_dir:./inference
+-o Global.pretrained_model:
+norm_export:tools/export_model.py -c ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml
+quant_export:null
+fpgm_export:null
+distill_export:null
+kl_quant:null
+export2:null
+pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x1_0_pretrained.pdparams
+infer_model:../inference/
+infer_export:True
+infer_quant:False
+inference:python/predict_cls.py -c configs/inference_cls.yaml
+-o Global.use_gpu:True|False
+-o Global.enable_mkldnn:True|False
+-o Global.cpu_num_threads:1|6
+-o Global.batch_size:1|16
+-o Global.use_tensorrt:True|False
+-o Global.use_fp16:True|False
+-o Global.inference_model_dir:../inference
+-o Global.infer_imgs:../dataset/ILSVRC2012/val
+-o Global.save_log_path:null
+-o Global.benchmark:True
+null:null
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[3,224,224]}]
--- a/test_tipc/config/PPLCNetV2/PPLCNetV2_base_fleet_train_infer_python.txt
+++ b/test_tipc/config/PPLCNetV2/PPLCNetV2_base_fleet_train_infer_python.txt
@ -0,0 +1,53 @@
+===========================train_params===========================
+model_name:PPLCNetV2_base
+python:python3.7
+gpu_list:192.168.0.1,192.168.0.2;0,1
+-o Global.device:gpu
+-o Global.auto_cast:null
+-o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120
+-o Global.output_dir:./output/
+-o DataLoader.Train.sampler.first_bs:8
+-o Global.pretrained_model:null
+train_model_name:latest
+train_infer_img_dir:./dataset/ILSVRC2012/val
+null:null
+##
+trainer:norm_train
+norm_train:tools/train.py -c ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_base.yaml -o Global.seed=1234 -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:tools/eval.py -c ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_base.yaml
+null:null
+##
+===========================infer_params==========================
+-o Global.save_inference_dir:./inference
+-o Global.pretrained_model:
+norm_export:tools/export_model.py -c ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_base.yaml
+quant_export:null
+fpgm_export:null
+distill_export:null
+kl_quant:null
+export2:null
+pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNetV2_base_pretrained.pdparams
+infer_model:../inference/
+infer_export:True
+infer_quant:False
+inference:python/predict_cls.py -c configs/inference_cls.yaml
+-o Global.use_gpu:True|False
+-o Global.enable_mkldnn:True|False
+-o Global.cpu_num_threads:1|6
+-o Global.batch_size:1|16
+-o Global.use_tensorrt:True|False
+-o Global.use_fp16:True|False
+-o Global.inference_model_dir:../inference
+-o Global.infer_imgs:../dataset/ILSVRC2012/val
+-o Global.save_log_path:null
+-o Global.benchmark:True
+null:null
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[3,224,224]}]
--- a/test_tipc/docs/test_train_fleet_inference_python.md
+++ b/test_tipc/docs/test_train_fleet_inference_python.md
@ -0,0 +1,121 @@
+# Linux GPU/CPU 多机多卡训练推理测试
+
+Linux GPU/CPU 多机多卡训练推理测试的主程序为`test_train_inference_python.sh`，可以测试基于Python的多机多卡模型训练、评估、推理等基本功能。
+
+## 1. 测试结论汇总
+
+- 训练相关：
+
+  | 算法名称  |      模型名称       |  多机多卡  |
+  | :-------: | :-----------------: | :--------: |
+  |  PPLCNet  |    PPLCNet_x1_0     | 分布式训练 |
+  | PPLCNetV2 |   PPLCNetV2_base    | 分布式训练 |
+  |  PPHGNet  |    PPHGNet_small    | 分布式训练 |
+  | PP-ShiTu  | PPShiTu_general_rec | 分布式训练 |
+
+
+- 推理相关：
+
+  | 算法名称  |      模型名称       | device_CPU | device_GPU | batchsize |
+  | :-------: | :-----------------: | :--------: | :--------: | :-------: |
+  |  PPLCNet  |    PPLCNet_x1_0     |    支持    |    支持    |     1     |
+  | PPLCNetV2 |   PPLCNetV2_base    |    支持    |    支持    |     1     |
+  |  PPHGNet  |    PPHGNet_small    |    支持    |    支持    |     1     |
+  | PP-ShiTu  | PPShiTu_general_rec |    支持    |    支持    |     1     |
+
+
+## 2. 测试流程
+
+请参考[文档](./install.md)配置TIPC的运行环境。
+
+**下面以 PPLCNet_x1_0 模型为例，介绍测试流程**
+
+### 2.1 功能测试
+
+#### 2.1.1 修改配置文件
+
+首先，修改配置文件`test_tipc/config/PPLCNet/PPLCNet_x1_0_train_fleet_infer_python.txt`中的`gpu_list`设置：假设两台机器的`ip`地址分别为`192.168.0.1`和`192.168.0.2`，则对应的配置文件`gpu_list`字段需要修改为`gpu_list:192.168.0.1,192.168.0.2;0,1`。
+
+**`ip`地址查看命令为`ifconfig`，在`inet addr:`字段后的即为ip地址**。
+
+
+#### 2.1.2 准备数据
+
+运行`prepare.sh`准备数据和模型，数据准备命令如下所示。
+
+```shell
+bash test_tipc/prepare.sh test_tipc/config/PPLCNet/PPLCNet_x1_0_train_fleet_infer_python.txt lite_train_lite_infer
+```
+
+**注意：** 由于是多机训练，这里需要在所有节点上都运行一次上述命令来准备数据。
+
+#### 2.1.3 修改起始端口开始测试
+
+在多机的节点上使用下面的命令设置分布式的起始端口（否则后面运行的时候会由于无法找到运行端口而hang住），一般建议设置在`10000~20000`之间。
+
+```shell
+export FLAGS_START_PORT=17000
+```
+**注意：** 上述修改起始端口命令同样需要在所有节点上都执行一次。
+
+接下来就可以开始执行测试，命令如下所示。
+```shell
+bash test_tipc/test_train_inference_python.sh  test_tipc/config/PPLCNet/PPLCNet_x1_0_train_fleet_infer_python.txt
+```
+
+**注意：** 由于是多机训练，这里需要在所有的节点上均运行上述命令进行测试。
+
+
+#### 2.1.4 输出结果
+
+输出结果保存在`test_tipc/output/PPLCNet_x1_0/results_python.log`，内容如下，以`Run successfully`开头表示测试命令正常，否则为测试失败。
+
+```bash
+Run successfully with command - python3.7 -m paddle.distributed.launch --ips=192.168.0.1,192.168.0.2 --gpus=0,1 tools/train.py -c ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml -o Global.seed=1234 -o DataL
+oader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False -o Global.device=gpu -o Global.output_dir=./test_tipc/output/PPLCNet_x1_0/norm_train_gpus_0,
+1_autocast_null_nodes_2   -o Global.epochs=2   -o DataLoader.Train.sampler.batch_size=8  !
+...
+...
+Run successfully with command - python3.7 python/predict_cls.py -c configs/inference_cls.yaml -o Global.use_gpu=False -o Global.enable_mkldnn=True -o Global.cpu_num_threads=1 -o Global.inference_model_dir=.././t
+est_tipc/output/PPLCNet_x1_0/norm_train_gpus_0,1_autocast_null_nodes_2 -o Global.batch_size=16 -o Global.infer_imgs=../dataset/ILSVRC2012/val -o Global.benchmark=True   > .././test_tipc/output/PPLCNet_x1_0/infer_cpu_us
+emkldnn_True_threads_1_batchsize_16.log 2>&1 !
+```
+
+在配置文件中默认设置`-o Global.benchmark:True`表示开启benchmark选项，此时可以得到测试的详细数据，包含运行环境信息（系统版本、CUDA版本、CUDNN版本、驱动版本），Paddle版本信息，参数设置信息（运行设备、线程数、是否开启内存优化等），模型信息（模型名称、精度），数据信息（batchsize、是否为动态shape等），性能信息（CPU,GPU的占用、运行耗时、预处理耗时、推理耗时、后处理耗时），内容如下所示：
+
+```log
+[2022/06/07 17:01:41] root INFO: ---------------------- Env info ----------------------
+[2022/06/07 17:01:41] root INFO:  OS_version: CentOS 6.10
+[2022/06/07 17:01:41] root INFO:  CUDA_version: 10.1.243
+[2022/06/07 17:01:41] root INFO:  CUDNN_version: None.None.None
+[2022/06/07 17:01:41] root INFO:  drivier_version: 460.32.03
+[2022/06/07 17:01:41] root INFO: ---------------------- Paddle info ----------------------
+[2022/06/07 17:01:41] root INFO:  paddle_version: 2.3.0-rc0
+[2022/06/07 17:01:41] root INFO:  paddle_version: 2.3.0-rc0
+[2022/06/07 17:01:41] root INFO:  paddle_commit: 5d4980c052583fec022812d9c29460aff7cdc18b
+[2022/06/07 17:01:41] root INFO:  log_api_version: 1.0
+[2022/06/07 17:01:41] root INFO: ----------------------- Conf info -----------------------
+[2022/06/07 17:01:41] root INFO:  runtime_device: cpu
+[2022/06/07 17:01:41] root INFO:  ir_optim: True
+[2022/06/07 17:01:41] root INFO:  enable_memory_optim: True
+[2022/06/07 17:01:41] root INFO:  enable_tensorrt: False
+[2022/06/07 17:01:41] root INFO:  enable_mkldnn: False
+[2022/06/07 17:01:41] root INFO:  cpu_math_library_num_threads: 6
+[2022/06/07 17:01:41] root INFO: ----------------------- Model info ----------------------
+[2022/06/07 17:01:41] root INFO:  model_name: cls
+[2022/06/07 17:01:41] root INFO:  precision: fp32
+[2022/06/07 17:01:41] root INFO: ----------------------- Data info -----------------------
+[2022/06/07 17:01:41] root INFO:  batch_size: 16
+[2022/06/07 17:01:41] root INFO:  input_shape: [3, 224, 224]
+[2022/06/07 17:01:41] root INFO:  data_num: 3
+[2022/06/07 17:01:41] root INFO: ----------------------- Perf info -----------------------
+[2022/06/07 17:01:41] root INFO:  cpu_rss(MB): 726.5586, gpu_rss(MB): None, gpu_util: None%
+[2022/06/07 17:01:41] root INFO:  total time spent(s): 0.3527
+[2022/06/07 17:01:41] root INFO:  preprocess_time(ms): 33.2723, inference_time(ms): 317.9824, postprocess_time(ms): 1.4579
+```
+
+该信息可以在运行log中查看，例如上述CPU推理（未开启mkldnn、6线程、batchsize为16）的log位置在`test_tipc/output/PPLCNet_x1_0/infer_cpu_usemkldnn_False_threads_6_batchsize_16.log`。
+
+如果运行失败，也会在终端中输出运行失败的日志信息以及对应的运行命令。可以基于该命令，分析运行失败的原因。
+
+**注意：** 由于分布式训练时，仅在`trainer_id=0`所在的节点中保存模型，因此其他的节点中在运行模型导出与推理时会因为找不到保存的模型而报错，为正常现象。
--- a/test_tipc/prepare.sh
+++ b/test_tipc/prepare.sh
@ -43,7 +43,7 @@ function func_get_url_file_name() {

 model_name=$(func_parser_value "${lines[1]}")

-if [ ${MODE} = "cpp_infer" ]; then
+if [[ ${MODE} = "cpp_infer" ]]; then
    if [ -d "./deploy/cpp/opencv-3.4.7/opencv3/" ] && [ $(md5sum ./deploy/cpp/opencv-3.4.7.tar.gz | awk -F ' ' '{print $1}') = "faa2b5950f8bee3f03118e600c74746a" ]; then
        echo "################### build opencv skipped ###################"
    else
@ -151,7 +151,7 @@ if [[ $FILENAME == *use_dali* ]]; then
    ${python_name} -m pip install --extra-index-url https://developer.download.nvidia.com/compute/redist/nightly --upgrade nvidia-dali-nightly-cuda102
 fi

-if [ ${MODE} = "lite_train_lite_infer" ] || [ ${MODE} = "lite_train_whole_infer" ]; then
+if [[ ${MODE} = "lite_train_lite_infer" ]] || [[ ${MODE} = "lite_train_whole_infer" ]]; then
    # pretrain lite train data
    cd dataset
    rm -rf ILSVRC2012
@ -163,7 +163,7 @@ if [ ${MODE} = "lite_train_lite_infer" ] || [ ${MODE} = "lite_train_whole_infer"
    mv val.txt val_list.txt
    cp -r train/* val/
    cd ../../
-elif [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then
+elif [[ ${MODE} = "whole_infer" ]] || [[ ${MODE} = "klquant_whole_infer" ]]; then
    # download data
    cd dataset
    rm -rf ILSVRC2012
@ -185,7 +185,7 @@ elif [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then
        eval $cmd
    fi

-elif [ ${MODE} = "whole_train_whole_infer" ]; then
+elif [[ ${MODE} = "whole_train_whole_infer" ]]; then
    cd dataset
    rm -rf ILSVRC2012
    wget -nc https://paddle-imagenet-models-name.bj.bcebos.com/data/whole_chain/whole_chain_CIFAR100.tar
@ -197,7 +197,7 @@ elif [ ${MODE} = "whole_train_whole_infer" ]; then
    cd ../../
 fi

-if [ ${MODE} = "serving_infer" ]; then
+if [[ ${MODE} = "serving_infer" ]]; then
    # prepare serving env
    python_name=$(func_parser_value "${lines[2]}")
    ${python_name} -m pip install install paddle-serving-server-gpu==0.7.0.post102
@ -225,7 +225,7 @@ if [ ${MODE} = "serving_infer" ]; then
    unset https_proxy
 fi

-if [ ${MODE} = "paddle2onnx_infer" ]; then
+if [[ ${MODE} = "paddle2onnx_infer" ]]; then
    # prepare paddle2onnx env
    python_name=$(func_parser_value "${lines[2]}")
    inference_model_url=$(func_parser_value "${lines[10]}")
@ -241,7 +241,7 @@ if [ ${MODE} = "paddle2onnx_infer" ]; then
    cd ../../
 fi

-if [ ${MODE} = "benchmark_train" ]; then
+if [[ ${MODE} = "benchmark_train" ]]; then
    pip install -r requirements.txt
    cd dataset
    rm -rf ILSVRC2012
--- a/test_tipc/test_train_inference_python.sh
+++ b/test_tipc/test_train_inference_python.sh
@ -60,12 +60,12 @@ kl_quant_cmd_value=$(func_parser_value "${lines[33]}")
 export_key2=$(func_parser_key "${lines[34]}")
 export_value2=$(func_parser_value "${lines[34]}")

-# parser inference model 
+# parser inference model
 infer_model_dir_list=$(func_parser_value "${lines[36]}")
 infer_export_flag=$(func_parser_value "${lines[37]}")
 infer_is_quant=$(func_parser_value "${lines[38]}")

-# parser inference 
+# parser inference
 inference_py=$(func_parser_value "${lines[39]}")
 use_gpu_key=$(func_parser_key "${lines[40]}")
 use_gpu_list=$(func_parser_value "${lines[40]}")
@ -90,7 +90,7 @@ infer_value1=$(func_parser_value "${lines[50]}")
 if [ ! $epoch_num ]; then
  epoch_num=2
 fi
-if [ $MODE = 'benchmark_train' ]; then
+if [[ $MODE = 'benchmark_train' ]]; then
  epoch_num=1
 fi

@ -106,7 +106,7 @@ function func_inference(){
    _log_path=$4
    _img_dir=$5
    _flag_quant=$6
-    # inference 
+    # inference
    for use_gpu in ${use_gpu_list[*]}; do
        if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then
            for use_mkldnn in ${use_mkldnn_list[*]}; do
@ -161,7 +161,7 @@ function func_inference(){
    done
 }

-if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then
+if [[ ${MODE} = "whole_infer" ]] || [[ ${MODE} = "klquant_whole_infer" ]]; then
   IFS="|"
   infer_export_flag=(${infer_export_flag})
   if [ ${infer_export_flag} != "null" ]  && [ ${infer_export_flag} != "False" ]; then
@ -171,7 +171,7 @@ if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then
   fi
 fi

-if [ ${MODE} = "whole_infer" ]; then
+if [[ ${MODE} = "whole_infer" ]]; then
    GPUID=$3
    if [ ${#GPUID} -le 0 ];then
        env=" "
@ -191,7 +191,7 @@ if [ ${MODE} = "whole_infer" ]; then
    done
    cd ..

-elif [ ${MODE} = "klquant_whole_infer" ]; then
+elif [[ ${MODE} = "klquant_whole_infer" ]]; then
    # for kl_quant
    if [ ${kl_quant_cmd_value} != "null" ] && [ ${kl_quant_cmd_value} != "False" ]; then
 	echo "kl_quant"
@ -234,7 +234,7 @@ else
            env=" "
        fi
        for autocast in ${autocast_list[*]}; do
-            for trainer in ${trainer_list[*]}; do 
+            for trainer in ${trainer_list[*]}; do
                flag_quant=False
                if [ ${trainer} = ${pact_key} ]; then
                    run_train=${pact_trainer}
@ -263,14 +263,16 @@ else
                if [ ${run_train} = "null" ]; then
                    continue
                fi
-                
+
                set_autocast=$(func_set_params "${autocast_key}" "${autocast}")
                set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}")
                set_pretrain=$(func_set_params "${pretrain_model_key}" "${pretrain_model_value}")
                set_batchsize=$(func_set_params "${train_batch_key}" "${train_batch_value}")
                set_train_params1=$(func_set_params "${train_param_key1}" "${train_param_value1}")
                set_use_gpu=$(func_set_params "${train_use_gpu_key}" "${train_use_gpu_value}")
-                if [ ${#ips} -le 26 ];then
+                if [ ${#ips} -le 15 ];then
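+                    # ips of at most 15 chars is treated as single-machine; only a longer value is seen as multi-machine
+                    # NOTE(review): 15 is the max length of one ip (e.g. 192.168.100.100), so the shortest two-ip value "0.0.0.0,0.0.0.0" (15 chars) is NOT detected as multi-machine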
                    save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}"
                    nodes=1
                else
@ -280,7 +282,7 @@ else
                    nodes=${#ips_array[@]}
                    save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}"
                fi
-                
+
                # load pretrain from norm training if current trainer is pact or fpgm trainer
                # if [ ${trainer} = ${pact_key} ] || [ ${trainer} = ${fpgm_key} ]; then
                #    set_pretrain="${load_norm_train_model}"
@ -289,7 +291,7 @@ else
                set_save_model=$(func_set_params "${save_model_key}" "${save_log}")
                if [ ${#gpu} -le 2 ];then  # train with cpu or single gpu
                    cmd="${python} ${run_train} ${set_use_gpu}  ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} "
-                elif [ ${#ips} -le 26 ];then  # train with multi-gpu
+                elif [ ${#ips} -le 15 ];then  # train with multi-gpu
                    cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1}"
                else     # train with multi-machine
                    cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}"
@ -301,26 +303,26 @@ else
                eval $cmd
                status_check $? "${cmd}" "${status_log}"
                sleep 5
-		
+
 		if [[ $FILENAME == *GeneralRecognition* ]]; then
 		    set_eval_pretrain=$(func_set_params "${pretrain_model_key}" "${save_log}/RecModel/${train_model_name}")
 		else
                    set_eval_pretrain=$(func_set_params "${pretrain_model_key}" "${save_log}/${model_name}/${train_model_name}")
 		fi
-                # save norm trained models to set pretrain for pact training and fpgm training 
+                # save norm trained models to set pretrain for pact training and fpgm training
                if [ ${trainer} = ${trainer_norm} ]; then
                    load_norm_train_model=${set_eval_pretrain}
                fi
-                # run eval 
+                # run eval
                if [ ${eval_py} != "null" ]; then
                    set_eval_params1=$(func_set_params "${eval_key1}" "${eval_value1}")
-                    eval_cmd="${python} ${eval_py} ${set_eval_pretrain} ${set_use_gpu} ${set_eval_params1}" 
+                    eval_cmd="${python} ${eval_py} ${set_eval_pretrain} ${set_use_gpu} ${set_eval_params1}"
                    eval $eval_cmd
                    status_check $? "${eval_cmd}" "${status_log}"
                    sleep 5
                fi
                # run export model
-                if [ ${run_export} != "null" ]; then 
+                if [ ${run_export} != "null" ]; then
                    # run export model
                    save_infer_path="${save_log}"
 		    if [[ $FILENAME == *GeneralRecognition* ]]; then
@ -341,7 +343,7 @@ else
 		    cd ..
                fi
                eval "unset CUDA_VISIBLE_DEVICES"
-            done  # done with:    for trainer in ${trainer_list[*]}; do 
-        done      # done with:    for autocast in ${autocast_list[*]}; do 
+            done  # done with:    for trainer in ${trainer_list[*]}; do
+        done      # done with:    for autocast in ${autocast_list[*]}; do
    done          # done with:    for gpu in ${gpu_list[*]}; do
 fi  # end if [ ${MODE} = "infer" ]; then