From 2005cc3e5a85a6d875c9ff22321b5f5072eca8e1 Mon Sep 17 00:00:00 2001 From: stephon Date: Fri, 15 Oct 2021 08:30:51 +0000 Subject: [PATCH 01/10] add amp train --- configs/det/det_mv3_db_amp.yml | 135 +++++++++++++++++++++++++++++++++ tools/program.py | 25 ++++-- tools/train.py | 17 +++++ 3 files changed, 172 insertions(+), 5 deletions(-) create mode 100644 configs/det/det_mv3_db_amp.yml diff --git a/configs/det/det_mv3_db_amp.yml b/configs/det/det_mv3_db_amp.yml new file mode 100644 index 000000000..772342a2d --- /dev/null +++ b/configs/det/det_mv3_db_amp.yml @@ -0,0 +1,135 @@ +Global: + use_gpu: true + epoch_num: 1200 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/db_mv3/ + save_epoch_step: 1200 + # evaluation is run every 2000 iterations + eval_batch_step: [0, 2000] + cal_metric_during_train: False + pretrained_model: ./pretrain_models/MobileNetV3_large_x0_5_pretrained + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_en/img_10.jpg + save_res_path: ./output/det_db/predicts_db.txt + +AMP: + scale_loss: 1024.0 + use_dynamic_loss_scaling: True + +Architecture: + model_type: det + algorithm: DB + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + Neck: + name: DBFPN + out_channels: 256 + Head: + name: DBHead + k: 50 + +Loss: + name: DBLoss + balance_loss: true + main_loss_type: DiceLoss + alpha: 5 + beta: 10 + ohem_ratio: 3 + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + learning_rate: 0.001 + regularizer: + name: 'L2' + factor: 0 + +PostProcess: + name: DBPostProcess + thresh: 0.3 + box_thresh: 0.6 + max_candidates: 1000 + unclip_ratio: 1.5 + +Metric: + name: DetMetric + main_indicator: hmean + +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt + ratio_list: [1.0] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - IaaAugment: + augmenter_args: + - { 'type': Fliplr, 'args': { 'p': 0.5 } } + - { 'type': Affine, 'args': { 'rotate': [-10, 10] } } + - { 'type': Resize, 'args': { 'size': [0.5, 3] } } + - EastRandomCropData: + size: [640, 640] + max_tries: 50 + keep_ratio: true + - MakeBorderMap: + shrink_ratio: 0.4 + thresh_min: 0.3 + thresh_max: 0.7 + - MakeShrinkMap: + shrink_ratio: 0.4 + min_text_size: 8 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'threshold_map', 'threshold_mask', 'shrink_map', 'shrink_mask'] # the order of the dataloader list + loader: + shuffle: True + drop_last: False + batch_size_per_card: 16 + num_workers: 8 + use_shared_memory: False + +Eval: + dataset: + name: SimpleDataSet + data_dir: ./train_data/icdar2015/text_localization/ + label_file_list: + - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - DetLabelEncode: # Class handling label + - DetResizeForTest: + image_shape: [736, 1280] + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 1 # must be 1 + num_workers: 8 + use_shared_memory: False diff --git a/tools/program.py b/tools/program.py index 798e6dff2..5963016b6 100755 --- a/tools/program.py +++ b/tools/program.py @@ -226,14 +226,29 @@ def train(config, images = batch[0] if use_srn: model_average = True - if model_type == 'table' or extra_input: - preds = model(images, data=batch[1:]) + + # use amp + if scaler: + with paddle.amp.auto_cast(): + if model_type == 'table' or extra_input: + preds = model(images, data=batch[1:]) + else: + preds = model(images) else: - preds = model(images) + if model_type == 'table' or extra_input: + preds = model(images, data=batch[1:]) + else: + preds = model(images) loss = loss_class(preds, batch) avg_loss = loss['loss'] - avg_loss.backward() - optimizer.step() + + if scaler: + scaled_avg_loss = scaler.scale(avg_loss) + scaled_avg_loss.backward() + scaler.minimize(optimizer, scaled_avg_loss) + else: + avg_loss.backward() + optimizer.step() optimizer.clear_grad() train_batch_cost += time.time() - batch_start diff --git a/tools/train.py b/tools/train.py index 05d295aa9..b34ac9790 100755 --- a/tools/train.py +++ b/tools/train.py @@ -102,6 +102,23 @@ def main(config, device, logger, vdl_writer): if valid_dataloader is not None: logger.info('valid dataloader has {} iters'.format( len(valid_dataloader))) + + use_amp = True if "AMP" in config else False + if use_amp: + AMP_RELATED_FLAGS_SETTING = { + 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, + 'FLAGS_max_inplace_grad_add': 8, + } + paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) + scale_loss = config["AMP"].get("scale_loss", 1.0) + use_dynamic_loss_scaling = config["AMP"].get("use_dynamic_loss_scaling", + False) + scaler = paddle.amp.GradScaler( + init_loss_scaling=scale_loss, + use_dynamic_loss_scaling=use_dynamic_loss_scaling) + else: + scaler = None + # start train program.train(config, train_dataloader, valid_dataloader, device, model, loss_class, optimizer, lr_scheduler, post_process_class, From 8bb9fb7e3d18a23f2e86373c58406a777b04c821 Mon Sep 17 00:00:00 2001 From: stephon Date: Fri, 15 Oct 2021 08:34:27 +0000 Subject: [PATCH 02/10] fix some error --- tools/program.py | 3 ++- tools/train.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/program.py b/tools/program.py index 5963016b6..6456aad5d 100755 --- a/tools/program.py +++ b/tools/program.py @@ -159,7 +159,8 @@ def train(config, eval_class, pre_best_model_dict, logger, - vdl_writer=None): + vdl_writer=None, + scaler=None): cal_metric_during_train = config['Global'].get('cal_metric_during_train', False) log_smooth_window = config['Global']['log_smooth_window'] diff --git a/tools/train.py b/tools/train.py index b34ac9790..49e44112c 100755 --- a/tools/train.py +++ b/tools/train.py @@ -122,7 +122,7 @@ def main(config, device, logger, vdl_writer): # start train program.train(config, train_dataloader, valid_dataloader, device, model, loss_class, optimizer, lr_scheduler, post_process_class, - eval_class, pre_best_model_dict, logger, vdl_writer) + eval_class, pre_best_model_dict, logger, vdl_writer, scaler) def test_reader(config, device, logger): From 987517fd5dff7aae717db66dfaf5fb3e25cf1b53 Mon Sep 17 00:00:00 2001 From: stephon Date: Fri, 15 Oct 2021 09:37:12 +0000 Subject: [PATCH 03/10] add multi_node and amp train in script --- 
configs/det/det_mv3_db_amp.yml | 4 ++-- tests/configs/ppocr_det_mobile_params.txt | 4 ++-- tests/test_python.sh | 12 ++++++++---- tools/train.py | 8 ++++---- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/configs/det/det_mv3_db_amp.yml b/configs/det/det_mv3_db_amp.yml index 772342a2d..640031752 100644 --- a/configs/det/det_mv3_db_amp.yml +++ b/configs/det/det_mv3_db_amp.yml @@ -14,8 +14,8 @@ Global: use_visualdl: False infer_img: doc/imgs_en/img_10.jpg save_res_path: ./output/det_db/predicts_db.txt - -AMP: + #amp related + use_amp: True scale_loss: 1024.0 use_dynamic_loss_scaling: True diff --git a/tests/configs/ppocr_det_mobile_params.txt b/tests/configs/ppocr_det_mobile_params.txt index 5edb14cdb..be14865c5 100644 --- a/tests/configs/ppocr_det_mobile_params.txt +++ b/tests/configs/ppocr_det_mobile_params.txt @@ -1,9 +1,9 @@ ===========================train_params=========================== model_name:ocr_det python:python3.7 -gpu_list:0|0,1 +gpu_list:0|0,1|10.21.226.181,10.21.226.133;0,1 Global.use_gpu:True|True -Global.auto_cast:null +Global.auto_cast:fp32|amp Global.epoch_num:lite_train_infer=1|whole_train_infer=300 Global.save_model_dir:./output/ Train.loader.batch_size_per_card:lite_train_infer=2|whole_train_infer=4 diff --git a/tests/test_python.sh b/tests/test_python.sh index 39b043b80..26045e174 100644 --- a/tests/test_python.sh +++ b/tests/test_python.sh @@ -253,6 +253,11 @@ else env=" " fi for autocast in ${autocast_list[*]}; do + if [ ${autocast} = "amp" ]; then + set_amp_config="Gloabl.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True" + else + set_amp_config=" " + fi for trainer in ${trainer_list[*]}; do flag_quant=False if [ ${trainer} = ${pact_key} ]; then @@ -279,7 +284,6 @@ else if [ ${run_train} = "null" ]; then continue fi - set_autocast=$(func_set_params "${autocast_key}" "${autocast}") set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}") set_pretrain=$(func_set_params "${pretrain_model_key}" "${pretrain_model_value}") @@ -295,11 +299,11 @@ else set_save_model=$(func_set_params "${save_model_key}" "${save_log}") if [ ${#gpu} -le 2 ];then # train with cpu or single gpu - cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} " + cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config} " elif [ ${#gpu} -le 15 ];then # train with multi-gpu - cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1}" + cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}" else # train with multi-machine - cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}" + cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}" fi # run train eval "unset CUDA_VISIBLE_DEVICES" diff --git a/tools/train.py b/tools/train.py index 49e44112c..d182af298 100755 --- a/tools/train.py +++ b/tools/train.py @@ -103,16 +103,16 @@ def main(config, device, 
logger, vdl_writer): logger.info('valid dataloader has {} iters'.format( len(valid_dataloader))) - use_amp = True if "AMP" in config else False + use_amp = config["Global"].get("use_amp", False) if use_amp: AMP_RELATED_FLAGS_SETTING = { 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, 'FLAGS_max_inplace_grad_add': 8, } paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) - scale_loss = config["AMP"].get("scale_loss", 1.0) - use_dynamic_loss_scaling = config["AMP"].get("use_dynamic_loss_scaling", - False) + scale_loss = config["Global"].get("scale_loss", 1.0) + use_dynamic_loss_scaling = config["Global"].get( + "use_dynamic_loss_scaling", False) scaler = paddle.amp.GradScaler( init_loss_scaling=scale_loss, use_dynamic_loss_scaling=use_dynamic_loss_scaling) From 82f19a312fbc93084b776b1391f44f504031a608 Mon Sep 17 00:00:00 2001 From: stephon Date: Fri, 15 Oct 2021 09:39:13 +0000 Subject: [PATCH 04/10] support cpu train in script --- tests/configs/ppocr_det_mobile_params.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/configs/ppocr_det_mobile_params.txt b/tests/configs/ppocr_det_mobile_params.txt index be14865c5..c65ad7182 100644 --- a/tests/configs/ppocr_det_mobile_params.txt +++ b/tests/configs/ppocr_det_mobile_params.txt @@ -2,7 +2,7 @@ model_name:ocr_det python:python3.7 gpu_list:0|0,1|10.21.226.181,10.21.226.133;0,1 -Global.use_gpu:True|True +Global.use_gpu:False|True Global.auto_cast:fp32|amp Global.epoch_num:lite_train_infer=1|whole_train_infer=300 Global.save_model_dir:./output/ From 0e534b2928b3967f4928bdca041712b813bbff67 Mon Sep 17 00:00:00 2001 From: Bin Lu Date: Wed, 20 Oct 2021 10:44:16 +0800 Subject: [PATCH 05/10] Update ppocr_det_mobile_params.txt --- PTDN/configs/ppocr_det_mobile_params.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PTDN/configs/ppocr_det_mobile_params.txt b/PTDN/configs/ppocr_det_mobile_params.txt index 25c5c0429..ae28c0186 100644 --- a/PTDN/configs/ppocr_det_mobile_params.txt +++ b/PTDN/configs/ppocr_det_mobile_params.txt @@ -2,7 +2,7 @@ model_name:ocr_det python:python3.7 gpu_list:0|0,1|10.21.226.181,10.21.226.133;0,1 -Global.use_gpu:False|True +Global.use_gpu:True|True Global.auto_cast:fp32|amp Global.epoch_num:lite_train_infer=1|whole_train_infer=300 Global.save_model_dir:./output/ From d8afe420bae07b42737a8489cb1fb1c935acdab7 Mon Sep 17 00:00:00 2001 From: Bin Lu Date: Wed, 20 Oct 2021 10:53:10 +0800 Subject: [PATCH 06/10] Update ppocr_det_mobile_params.txt --- PTDN/configs/ppocr_det_mobile_params.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PTDN/configs/ppocr_det_mobile_params.txt b/PTDN/configs/ppocr_det_mobile_params.txt index ae28c0186..556092cba 100644 --- a/PTDN/configs/ppocr_det_mobile_params.txt +++ b/PTDN/configs/ppocr_det_mobile_params.txt @@ -2,7 +2,7 @@ model_name:ocr_det python:python3.7 gpu_list:0|0,1|10.21.226.181,10.21.226.133;0,1 -Global.use_gpu:True|True +Global.use_gpu:True|True|T Global.auto_cast:fp32|amp Global.epoch_num:lite_train_infer=1|whole_train_infer=300 Global.save_model_dir:./output/ From fd06628695a143b36493563c706e94cb67a6459c Mon Sep 17 00:00:00 2001 From: Bin Lu Date: Wed, 20 Oct 2021 10:53:31 +0800 Subject: [PATCH 07/10] Update ppocr_det_mobile_params.txt --- PTDN/configs/ppocr_det_mobile_params.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PTDN/configs/ppocr_det_mobile_params.txt b/PTDN/configs/ppocr_det_mobile_params.txt index 556092cba..3d2117d7c 100644 --- a/PTDN/configs/ppocr_det_mobile_params.txt +++ 
b/PTDN/configs/ppocr_det_mobile_params.txt @@ -2,7 +2,7 @@ model_name:ocr_det python:python3.7 gpu_list:0|0,1|10.21.226.181,10.21.226.133;0,1 -Global.use_gpu:True|True|T +Global.use_gpu:True|True|True Global.auto_cast:fp32|amp Global.epoch_num:lite_train_infer=1|whole_train_infer=300 Global.save_model_dir:./output/ From 921c24f54f7021686fa6c4e839658c7a31373b6b Mon Sep 17 00:00:00 2001 From: Bin Lu Date: Wed, 20 Oct 2021 16:02:47 +0800 Subject: [PATCH 08/10] Delete det_mv3_db_amp.yml --- configs/det/det_mv3_db_amp.yml | 135 --------------------------------- 1 file changed, 135 deletions(-) delete mode 100644 configs/det/det_mv3_db_amp.yml diff --git a/configs/det/det_mv3_db_amp.yml b/configs/det/det_mv3_db_amp.yml deleted file mode 100644 index 640031752..000000000 --- a/configs/det/det_mv3_db_amp.yml +++ /dev/null @@ -1,135 +0,0 @@ -Global: - use_gpu: true - epoch_num: 1200 - log_smooth_window: 20 - print_batch_step: 10 - save_model_dir: ./output/db_mv3/ - save_epoch_step: 1200 - # evaluation is run every 2000 iterations - eval_batch_step: [0, 2000] - cal_metric_during_train: False - pretrained_model: ./pretrain_models/MobileNetV3_large_x0_5_pretrained - checkpoints: - save_inference_dir: - use_visualdl: False - infer_img: doc/imgs_en/img_10.jpg - save_res_path: ./output/det_db/predicts_db.txt - #amp related - use_amp: True - scale_loss: 1024.0 - use_dynamic_loss_scaling: True - -Architecture: - model_type: det - algorithm: DB - Transform: - Backbone: - name: MobileNetV3 - scale: 0.5 - model_name: large - Neck: - name: DBFPN - out_channels: 256 - Head: - name: DBHead - k: 50 - -Loss: - name: DBLoss - balance_loss: true - main_loss_type: DiceLoss - alpha: 5 - beta: 10 - ohem_ratio: 3 - -Optimizer: - name: Adam - beta1: 0.9 - beta2: 0.999 - lr: - learning_rate: 0.001 - regularizer: - name: 'L2' - factor: 0 - -PostProcess: - name: DBPostProcess - thresh: 0.3 - box_thresh: 0.6 - max_candidates: 1000 - unclip_ratio: 1.5 - -Metric: - name: DetMetric - main_indicator: hmean - -Train: - dataset: - name: SimpleDataSet - data_dir: ./train_data/icdar2015/text_localization/ - label_file_list: - - ./train_data/icdar2015/text_localization/train_icdar2015_label.txt - ratio_list: [1.0] - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - DetLabelEncode: # Class handling label - - IaaAugment: - augmenter_args: - - { 'type': Fliplr, 'args': { 'p': 0.5 } } - - { 'type': Affine, 'args': { 'rotate': [-10, 10] } } - - { 'type': Resize, 'args': { 'size': [0.5, 3] } } - - EastRandomCropData: - size: [640, 640] - max_tries: 50 - keep_ratio: true - - MakeBorderMap: - shrink_ratio: 0.4 - thresh_min: 0.3 - thresh_max: 0.7 - - MakeShrinkMap: - shrink_ratio: 0.4 - min_text_size: 8 - - NormalizeImage: - scale: 1./255. - mean: [0.485, 0.456, 0.406] - std: [0.229, 0.224, 0.225] - order: 'hwc' - - ToCHWImage: - - KeepKeys: - keep_keys: ['image', 'threshold_map', 'threshold_mask', 'shrink_map', 'shrink_mask'] # the order of the dataloader list - loader: - shuffle: True - drop_last: False - batch_size_per_card: 16 - num_workers: 8 - use_shared_memory: False - -Eval: - dataset: - name: SimpleDataSet - data_dir: ./train_data/icdar2015/text_localization/ - label_file_list: - - ./train_data/icdar2015/text_localization/test_icdar2015_label.txt - transforms: - - DecodeImage: # load image - img_mode: BGR - channel_first: False - - DetLabelEncode: # Class handling label - - DetResizeForTest: - image_shape: [736, 1280] - - NormalizeImage: - scale: 1./255. 
- mean: [0.485, 0.456, 0.406] - std: [0.229, 0.224, 0.225] - order: 'hwc' - - ToCHWImage: - - KeepKeys: - keep_keys: ['image', 'shape', 'polys', 'ignore_tags'] - loader: - shuffle: False - drop_last: False - batch_size_per_card: 1 # must be 1 - num_workers: 8 - use_shared_memory: False From 03710112b93e4e7da474907ceed8031a8df97602 Mon Sep 17 00:00:00 2001 From: Bin Lu Date: Wed, 20 Oct 2021 22:00:07 +0800 Subject: [PATCH 09/10] Update test_train_inference_python.sh --- PTDN/test_train_inference_python.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PTDN/test_train_inference_python.sh b/PTDN/test_train_inference_python.sh index b34634292..7349eac91 100644 --- a/PTDN/test_train_inference_python.sh +++ b/PTDN/test_train_inference_python.sh @@ -259,7 +259,7 @@ else fi for autocast in ${autocast_list[*]}; do if [ ${autocast} = "amp" ]; then - set_amp_config="Gloabl.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True" + set_amp_config="Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True" else set_amp_config=" " fi From d7ddef7b2f48eda6b22d2850f08189b6d95d8c5b Mon Sep 17 00:00:00 2001 From: Bin Lu Date: Fri, 22 Oct 2021 14:52:51 +0800 Subject: [PATCH 10/10] Update test_train_inference_python.sh --- PTDN/test_train_inference_python.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/PTDN/test_train_inference_python.sh b/PTDN/test_train_inference_python.sh index 7349eac91..7ea8665be 100644 --- a/PTDN/test_train_inference_python.sh +++ b/PTDN/test_train_inference_python.sh @@ -239,6 +239,7 @@ else for gpu in ${gpu_list[*]}; do use_gpu=${USE_GPU_KEY[Count]} Count=$(($Count + 1)) + ips="" if [ ${gpu} = "-1" ];then env="" elif [ ${#gpu} -le 1 ];then @@ -305,10 +306,10 @@ else set_save_model=$(func_set_params "${save_model_key}" "${save_log}") if [ ${#gpu} -le 2 ];then # train with cpu or single gpu cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config} " - elif [ ${#gpu} -le 15 ];then # train with multi-gpu - cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}" + elif [ ${#ips} -le 26 ];then # train with multi-gpu + cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}" else # train with multi-machine - cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}" + cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${set_use_gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}" fi # run train eval "unset CUDA_VISIBLE_DEVICES"
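
For reference, the AMP hook these patches wire together — the GradScaler built in tools/train.py from the Global keys use_amp / scale_loss / use_dynamic_loss_scaling, and the auto_cast plus scaler.scale/scaler.minimize path in tools/program.py — reduces to the standard PaddlePaddle mixed-precision training loop. The sketch below is a minimal, self-contained illustration of that loop only; the model, loss, optimizer, and data are toy placeholders, not the repository's DB detector objects.

    import paddle

    # Toy placeholders standing in for the detection model, loss, and optimizer.
    model = paddle.nn.Linear(10, 1)
    loss_fn = paddle.nn.MSELoss()
    optimizer = paddle.optimizer.Adam(parameters=model.parameters())

    # Mirrors the scaler created in tools/train.py from the config:
    # scale_loss -> init_loss_scaling, use_dynamic_loss_scaling -> same-named flag.
    scaler = paddle.amp.GradScaler(
        init_loss_scaling=1024.0, use_dynamic_loss_scaling=True)

    x = paddle.randn([16, 10])
    y = paddle.randn([16, 1])

    for _ in range(3):
        # Forward pass under auto_cast so eligible ops run in fp16 on GPU.
        with paddle.amp.auto_cast():
            preds = model(x)
            loss = loss_fn(preds, y)
        # Scale the loss before backward, then let the scaler unscale the
        # gradients and apply the optimizer step.
        scaled = scaler.scale(loss)
        scaled.backward()
        scaler.minimize(optimizer, scaled)
        optimizer.clear_grad()

Calling scaler.minimize instead of optimizer.step lets the scaler unscale the gradients first and, with dynamic loss scaling enabled, skip the parameter update and lower the loss scale when an overflow is detected — which is why the patched train() branches on `if scaler:` rather than always calling optimizer.step().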