From 24a51ae06493b95501563c5ccc170421dd56edcd Mon Sep 17 00:00:00 2001
From: JamesLim-sy
Date: Tue, 4 Jan 2022 07:20:31 +0000
Subject: [PATCH 01/14] first

---
 ppcls/configs/ImageNet/HRNet/HRNet_W48_C.yaml                  | 3 +++
 ppcls/configs/ImageNet/MobileNetV1/MobileNetV1.yaml            | 2 +-
 ppcls/configs/ImageNet/MobileNetV2/MobileNetV2.yaml            | 2 +-
 ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_0.yaml | 2 +-
 ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml       | 2 +-
 5 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/ppcls/configs/ImageNet/HRNet/HRNet_W48_C.yaml b/ppcls/configs/ImageNet/HRNet/HRNet_W48_C.yaml
index 096910472..6124fe26d 100644
--- a/ppcls/configs/ImageNet/HRNet/HRNet_W48_C.yaml
+++ b/ppcls/configs/ImageNet/HRNet/HRNet_W48_C.yaml
@@ -1,3 +1,6 @@
+# Conv config
+export FLAGS_cudnn_exhaustive_search=True
+
 # global configs
 Global:
   checkpoints: null
diff --git a/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1.yaml b/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1.yaml
index 53d1d1dd2..401eaa944 100644
--- a/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1.yaml
+++ b/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1.yaml
@@ -71,7 +71,7 @@ DataLoader:
       drop_last: False
       shuffle: True
     loader:
-      num_workers: 4
+      num_workers: 8
       use_shared_memory: True
 
   Eval:
diff --git a/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2.yaml b/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2.yaml
index fc0e61bb6..143885d26 100644
--- a/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2.yaml
+++ b/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2.yaml
@@ -69,7 +69,7 @@ DataLoader:
       drop_last: False
       shuffle: True
     loader:
-      num_workers: 4
+      num_workers: 8
       use_shared_memory: True
 
   Eval:
diff --git a/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_0.yaml b/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_0.yaml
index 56544d0f3..f3ca83870 100644
--- a/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_0.yaml
+++ b/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_0.yaml
@@ -70,7 +70,7 @@ DataLoader:
       drop_last: False
       shuffle: True
     loader:
-      num_workers: 4
+      num_workers: 8
       use_shared_memory: True
 
   Eval:
diff --git a/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml b/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml
index dd4820da4..22c8d63a0 100644
--- a/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml
+++ b/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml
@@ -68,7 +68,7 @@ DataLoader:
       drop_last: False
       shuffle: True
     loader:
-      num_workers: 4
+      num_workers: 8
       use_shared_memory: True
 
   Eval:

From f01586064595fbab9783ace37b83c655c8ec4f1e Mon Sep 17 00:00:00 2001
From: JamesLim-sy
Date: Wed, 5 Jan 2022 13:15:19 +0000
Subject: [PATCH 02/14] drop useless settings

---
 ppcls/configs/ImageNet/HRNet/HRNet_W48_C.yaml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/ppcls/configs/ImageNet/HRNet/HRNet_W48_C.yaml b/ppcls/configs/ImageNet/HRNet/HRNet_W48_C.yaml
index 6124fe26d..096910472 100644
--- a/ppcls/configs/ImageNet/HRNet/HRNet_W48_C.yaml
+++ b/ppcls/configs/ImageNet/HRNet/HRNet_W48_C.yaml
@@ -1,6 +1,3 @@
-# Conv config
-export FLAGS_cudnn_exhaustive_search=True
-
 # global configs
 Global:
   checkpoints: null

From 8525e7ed3fb098537d89e696c26393f3cfc2c859 Mon Sep 17 00:00:00 2001
From: lubin10
Date: Fri, 21 Jan 2022 06:31:58 +0000
Subject: [PATCH 03/14] add nvidia_dali install in prepare.sh

---
 .../ResNet/ResNet50_vd_train_use_dali.txt | 22 +++++++++++++++++++
 test_tipc/prepare.sh                      |  5 +++++
 2 files changed, 27 insertions(+)
 create mode 100644 test_tipc/config/ResNet/ResNet50_vd_train_use_dali.txt
diff --git a/test_tipc/config/ResNet/ResNet50_vd_train_use_dali.txt b/test_tipc/config/ResNet/ResNet50_vd_train_use_dali.txt
new file mode 100644
index 000000000..9d4bf7f34
--- /dev/null
+++ b/test_tipc/config/ResNet/ResNet50_vd_train_use_dali.txt
@@ -0,0 +1,22 @@
+===========================train_params===========================
+model_name:ResNet50_vd
+python:python3.7
+gpu_list:0|0,1
+-o Global.device:gpu
+-o Global.auto_cast:null|amp
+-o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120
+-o Global.output_dir:./output/
+-o DataLoader.Train.sampler.batch_size:8
+-o Global.pretrained_model:null
+train_model_name:latest
+train_infer_img_dir:./dataset/ILSVRC2012/val
+null:null
+##
+trainer:norm_train
+norm_train:tools/train.py -c ppcls/configs/ImageNet/ResNet/ResNet50_vd.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False -o Global.use_dali=True
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
diff --git a/test_tipc/prepare.sh b/test_tipc/prepare.sh
index 646e7f4d8..18892fb43 100644
--- a/test_tipc/prepare.sh
+++ b/test_tipc/prepare.sh
@@ -108,6 +108,11 @@ if [[ $FILENAME == *GeneralRecognition* ]];then
     exit 0
 fi
 
+if [[ $FILENAME == *use_dali* ]];then
+    python_name=$(func_parser_value "${lines[2]}")
+    ${python_name} -m pip install --extra-index-url https://developer.download.nvidia.com/compute/redist/nightly --upgrade nvidia-dali-nightly-cuda102
+fi
+
 if [ ${MODE} = "lite_train_lite_infer" ] || [ ${MODE} = "lite_train_whole_infer" ];then
     # pretrain lite train data
     cd dataset

From cdbff19e2238e2b58b14c48642e61af66516aaf5 Mon Sep 17 00:00:00 2001
From: lubin10
Date: Mon, 24 Jan 2022 11:13:17 +0000
Subject: [PATCH 04/14] rename dali test config

---
 ...rain_use_dali.txt => ResNet50_vd_train_linux_gpu_use_dali.txt} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename test_tipc/config/ResNet/{ResNet50_vd_train_use_dali.txt => ResNet50_vd_train_linux_gpu_use_dali.txt} (100%)

diff --git a/test_tipc/config/ResNet/ResNet50_vd_train_use_dali.txt b/test_tipc/config/ResNet/ResNet50_vd_train_linux_gpu_use_dali.txt
similarity index 100%
rename from test_tipc/config/ResNet/ResNet50_vd_train_use_dali.txt
rename to test_tipc/config/ResNet/ResNet50_vd_train_linux_gpu_use_dali.txt

From 7040ce8314b7202b9e800e27f5399e2feb94387b Mon Sep 17 00:00:00 2001
From: gaotingquan
Date: Tue, 11 Jan 2022 08:48:03 +0000
Subject: [PATCH 05/14] refactor: change params to be consistent with amp

---
 ..._fp16_dygraph.yaml => ResNet50_amp_O1.yaml} |  9 ++++-----
 ...ResNet50_fp16.yaml => ResNet50_amp_O2.yaml} | 13 +++++++------
 ...16.yaml => SE_ResNeXt101_32x4d_amp_O2.yaml} | 10 ++++++----
 ppcls/engine/engine.py                         | 12 +++++++++---
 ppcls/engine/evaluation/classification.py      | 18 +++++++++++-------
 ppcls/engine/train/train.py                    | 10 ++++++----
 ppcls/static/program.py                        |  4 ++--
 ppcls/static/run_dali.sh                       |  9 +++------
 ppcls/static/train.py                          |  2 +-
 9 files changed, 49 insertions(+), 38 deletions(-)
 rename ppcls/configs/ImageNet/ResNet/{ResNet50_fp16_dygraph.yaml => ResNet50_amp_O1.yaml} (94%)
 rename ppcls/configs/ImageNet/ResNet/{ResNet50_fp16.yaml => ResNet50_amp_O2.yaml} (94%)
 rename ppcls/configs/ImageNet/SENet/{SE_ResNeXt101_32x4d_fp16.yaml => SE_ResNeXt101_32x4d_amp_O2.yaml} (95%)

diff --git a/ppcls/configs/ImageNet/ResNet/ResNet50_fp16_dygraph.yaml b/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml
similarity index 94%
rename from ppcls/configs/ImageNet/ResNet/ResNet50_fp16_dygraph.yaml
rename to ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml
index bbd35083b..53e9ae213 100644
--- a/ppcls/configs/ImageNet/ResNet/ResNet50_fp16_dygraph.yaml
+++ b/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml
@@ -22,7 +22,8 @@ Global:
 AMP:
   scale_loss: 128.0
   use_dynamic_loss_scaling: True
-  use_pure_fp16: &use_pure_fp16 False
+  # O1: mixed fp16
+  level: O1
 
 # model architecture
 Arch:
@@ -44,6 +45,7 @@ Loss:
 Optimizer:
   name: Momentum
   momentum: 0.9
+  multi_precision: True
   lr:
     name: Piecewise
     learning_rate: 0.1
@@ -74,12 +76,11 @@ DataLoader:
             mean: [0.485, 0.456, 0.406]
             std: [0.229, 0.224, 0.225]
             order: ''
-            output_fp16: *use_pure_fp16
             channel_num: *image_channel
 
     sampler:
       name: DistributedBatchSampler
-      batch_size: 256
+      batch_size: 64
       drop_last: False
       shuffle: True
     loader:
@@ -104,7 +105,6 @@ DataLoader:
            mean: [0.485, 0.456, 0.406]
            std: [0.229, 0.224, 0.225]
            order: ''
-           output_fp16: *use_pure_fp16
            channel_num: *image_channel
     sampler:
       name: DistributedBatchSampler
@@ -131,7 +131,6 @@ Infer:
         mean: [0.485, 0.456, 0.406]
         std: [0.229, 0.224, 0.225]
         order: ''
-        output_fp16: *use_pure_fp16
         channel_num: *image_channel
     - ToCHWImage:
   PostProcess:
diff --git a/ppcls/configs/ImageNet/ResNet/ResNet50_fp16.yaml b/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O2.yaml
similarity index 94%
rename from ppcls/configs/ImageNet/ResNet/ResNet50_fp16.yaml
rename to ppcls/configs/ImageNet/ResNet/ResNet50_amp_O2.yaml
index 1dccec121..6a4425b40 100644
--- a/ppcls/configs/ImageNet/ResNet/ResNet50_fp16.yaml
+++ b/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O2.yaml
@@ -10,8 +10,8 @@ Global:
   epochs: 120
   print_batch_step: 10
   use_visualdl: False
-  # used for static mode and model export
   image_channel: &image_channel 4
+  # used for static mode and model export
   image_shape: [*image_channel, 224, 224]
   save_inference_dir: ./inference
   # training model under @to_static
@@ -22,7 +22,8 @@ Global:
 AMP:
   scale_loss: 128.0
   use_dynamic_loss_scaling: True
-  use_pure_fp16: &use_pure_fp16 True
+  # O2: pure fp16
+  level: O2
 
 # model architecture
 Arch:
@@ -43,7 +44,7 @@ Loss:
 Optimizer:
   name: Momentum
   momentum: 0.9
-  multi_precision: *use_pure_fp16
+  multi_precision: True
   lr:
     name: Piecewise
     learning_rate: 0.1
@@ -74,7 +75,7 @@ DataLoader:
             mean: [0.485, 0.456, 0.406]
             std: [0.229, 0.224, 0.225]
             order: ''
-            output_fp16: *use_pure_fp16
+            output_fp16: True
             channel_num: *image_channel
 
     sampler:
@@ -104,7 +105,7 @@ DataLoader:
            mean: [0.485, 0.456, 0.406]
            std: [0.229, 0.224, 0.225]
            order: ''
-           output_fp16: *use_pure_fp16
+           output_fp16: True
            channel_num: *image_channel
     sampler:
       name: DistributedBatchSampler
@@ -131,7 +132,7 @@ Infer:
         mean: [0.485, 0.456, 0.406]
         std: [0.229, 0.224, 0.225]
         order: ''
-        output_fp16: *use_pure_fp16
+        output_fp16: True
         channel_num: *image_channel
     - ToCHWImage:
   PostProcess:
diff --git a/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_fp16.yaml b/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_amp_O2.yaml
similarity index 95%
rename from ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_fp16.yaml
rename to ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_amp_O2.yaml
index 8140424a5..1824cc1fd 100644
--- a/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_fp16.yaml
+++ b/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_amp_O2.yaml
@@ -35,11 +35,13 @@ Loss:
 AMP:
   scale_loss: 128.0
   use_dynamic_loss_scaling: True
-  use_pure_fp16: &use_pure_fp16 True
+  # O2: pure fp16
+  level: O2
 
 Optimizer:
   name: Momentum
   momentum: 0.9
+  multi_precision: True
   lr:
     name: Cosine
     learning_rate: 0.1
@@ -67,7 +69,7 @@ DataLoader:
             mean: [0.485, 0.456, 0.406]
             std: [0.229, 0.224, 0.225]
             order: ''
-            output_fp16: *use_pure_fp16
+            output_fp16: True
             channel_num: *image_channel
     sampler:
       name: DistributedBatchSampler
@@ -96,7 +98,7 @@ DataLoader:
            mean: [0.485, 0.456, 0.406]
            std: [0.229, 0.224, 0.225]
            order: ''
-           output_fp16: *use_pure_fp16
+           output_fp16: True
            channel_num: *image_channel
     sampler:
       name: BatchSampler
@@ -123,7 +125,7 @@ Infer:
         mean: [0.485, 0.456, 0.406]
         std: [0.229, 0.224, 0.225]
         order: ''
-        output_fp16: *use_pure_fp16
+        output_fp16: True
        channel_num: *image_channel
     - ToCHWImage:
   PostProcess:
diff --git a/ppcls/engine/engine.py b/ppcls/engine/engine.py
index c098c25b6..0a78fdd1a 100644
--- a/ppcls/engine/engine.py
+++ b/ppcls/engine/engine.py
@@ -211,14 +211,20 @@ class Engine(object):
             self.optimizer, self.lr_sch = build_optimizer(
                 self.config["Optimizer"], self.config["Global"]["epochs"],
                 len(self.train_dataloader), [self.model])
-
+
         # for amp training
         if self.amp:
             self.scaler = paddle.amp.GradScaler(
                 init_loss_scaling=self.scale_loss,
                 use_dynamic_loss_scaling=self.use_dynamic_loss_scaling)
-            if self.config['AMP']['use_pure_fp16'] is True:
-                self.model = paddle.amp.decorate(models=self.model, level='O2', save_dtype='float32')
+            amp_level = self.config['AMP'].get("level", "O1")
+            if amp_level not in ["O1", "O2"]:
+                msg = "[Parameter Error]: The optimization level of AMP only supports 'O1' and 'O2'. The level has been set to 'O1'."
+                logger.warning(msg)
+                self.config['AMP']["level"] = "O1"
+                amp_level = "O1"
+            self.model = paddle.amp.decorate(
+                models=self.model, level=amp_level, save_dtype='float32')
 
         # for distributed
         self.config["Global"][
diff --git a/ppcls/engine/evaluation/classification.py b/ppcls/engine/evaluation/classification.py
index 71c531a80..d7b5c4762 100644
--- a/ppcls/engine/evaluation/classification.py
+++ b/ppcls/engine/evaluation/classification.py
@@ -56,13 +56,15 @@ def classification_eval(engine, epoch_id=0):
             batch[0] = paddle.to_tensor(batch[0]).astype("float32")
         if not engine.config["Global"].get("use_multilabel", False):
             batch[1] = batch[1].reshape([-1, 1]).astype("int64")
-
+
         # image input
         if engine.amp:
-            amp_level = 'O1'
-            if engine.config['AMP']['use_pure_fp16'] is True:
-                amp_level = 'O2'
-            with paddle.amp.auto_cast(custom_black_list={"flatten_contiguous_range", "greater_than"}, level=amp_level):
+            amp_level = engine.config['AMP'].get("level", "O1").upper()
+            with paddle.amp.auto_cast(
+                    custom_black_list={
+                        "flatten_contiguous_range", "greater_than"
+                    },
+                    level=amp_level):
                 out = engine.model(batch[0])
                 # calc loss
                 if engine.eval_loss_func is not None:
@@ ... @@
                     loss_dict = engine.eval_loss_func(out, batch[1])
                     for key in loss_dict:
                         if key not in output_info:
                             output_info[key] = AverageMeter(key, '7.5f')
-                        output_info[key].update(loss_dict[key].numpy()[0], batch_size)
+                        output_info[key].update(loss_dict[key].numpy()[0],
+                                                batch_size)
         else:
             out = engine.model(batch[0])
             # calc loss
@@ ... @@
             if engine.eval_loss_func is not None:
                 loss_dict = engine.eval_loss_func(out, batch[1])
                 for key in loss_dict:
                     if key not in output_info:
                         output_info[key] = AverageMeter(key, '7.5f')
-                    output_info[key].update(loss_dict[key].numpy()[0], batch_size)
+                    output_info[key].update(loss_dict[key].numpy()[0],
+                                            batch_size)
 
         # just for DistributedBatchSampler issue: repeat sampling
         current_samples = batch_size * paddle.distributed.get_world_size()
diff --git a/ppcls/engine/train/train.py b/ppcls/engine/train/train.py
index b7fa9d3a0..3b02bac8f 100644
--- a/ppcls/engine/train/train.py
+++ b/ppcls/engine/train/train.py
@@ -42,10 +42,12 @@ def train_epoch(engine, epoch_id, print_batch_step):
 
         # image input
         if engine.amp:
-            amp_level = 'O1'
-            if engine.config['AMP']['use_pure_fp16'] is True:
-                amp_level = 'O2'
-            with paddle.amp.auto_cast(custom_black_list={"flatten_contiguous_range", "greater_than"}, level=amp_level):
+            amp_level = engine.config['AMP'].get("level", "O1").upper()
+            with paddle.amp.auto_cast(
+                    custom_black_list={
+                        "flatten_contiguous_range", "greater_than"
+                    },
+                    level=amp_level):
                 out = forward(engine, batch)
                 loss_dict = engine.train_loss_func(out, batch[1])
         else:
diff --git a/ppcls/static/program.py b/ppcls/static/program.py
index 7ecf6ee59..b3534a2cf 100644
--- a/ppcls/static/program.py
+++ b/ppcls/static/program.py
@@ -158,7 +158,7 @@ def create_strategy(config):
     exec_strategy.num_threads = 1
     exec_strategy.num_iteration_per_drop_scope = (
         10000
-        if 'AMP' in config and config.AMP.get("use_pure_fp16", False) else 10)
+        if 'AMP' in config and config.AMP.get("level", "O1") == "O2" else 10)
 
     fuse_op = True if 'AMP' in config else False
 
@@ -206,7 +206,7 @@ def mixed_precision_optimizer(config, optimizer):
     scale_loss = amp_cfg.get('scale_loss', 1.0)
     use_dynamic_loss_scaling = amp_cfg.get('use_dynamic_loss_scaling', False)
-    use_pure_fp16 = amp_cfg.get('use_pure_fp16', False)
+    use_pure_fp16 = amp_cfg.get("level", "O1") == "O2"
 
     optimizer = paddle.static.amp.decorate(
         optimizer,
         init_loss_scaling=scale_loss,
diff --git a/ppcls/static/run_dali.sh b/ppcls/static/run_dali.sh
index 748ac84c7..5bf0ef4ca 100644
--- a/ppcls/static/run_dali.sh
+++ b/ppcls/static/run_dali.sh
@@ -1,11 +1,8 @@
 #!/usr/bin/env bash
-export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
-export FLAGS_fraction_of_gpu_memory_to_use=0.80
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
 
 python3.7 -m paddle.distributed.launch \
-    --gpus="0,1,2,3,4,5,6,7" \
+    --gpus="0,1,2,3" \
     ppcls/static/train.py \
-        -c ./ppcls/configs/ImageNet/ResNet/ResNet50_fp16.yaml \
-        -o Global.use_dali=True
-
+        -c ./ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml
diff --git a/ppcls/static/train.py b/ppcls/static/train.py
index ae2b44052..9c03598be 100644
--- a/ppcls/static/train.py
+++ b/ppcls/static/train.py
@@ -158,7 +158,7 @@ def main(args):
 
     # load pretrained models or checkpoints
     init_model(global_config, train_prog, exe)
-    if 'AMP' in config and config.AMP.get("use_pure_fp16", False):
+    if 'AMP' in config and config.AMP.get("level", "O1") == "O2":
         optimizer.amp_init(
             device,
             scope=paddle.static.global_scope(),

From 10c93c55d1a8605833e7ee828f6a2ebb27cca5af Mon Sep 17 00:00:00 2001
From: gaotingquan
Date: Tue, 11 Jan 2022 14:07:09 +0000
Subject: [PATCH 06/14] fix: enable amp only in training

---
 .../ImageNet/SENet/SE_ResNeXt101_32x4d_amp_O2.yaml | 1 +
 ppcls/engine/engine.py                             | 9 ++++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_amp_O2.yaml b/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_amp_O2.yaml
index 1824cc1fd..da005d329 100644
--- a/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_amp_O2.yaml
+++ b/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_amp_O2.yaml
@@ -20,6 +20,7 @@ Arch:
   name: SE_ResNeXt101_32x4d
   class_num: 1000
   input_image_channel: *image_channel
+  data_format: "NHWC"
 
 # loss function config for traing/eval process
 Loss:
diff --git a/ppcls/engine/engine.py b/ppcls/engine/engine.py
index 0a78fdd1a..a58ac1e93 100644
--- a/ppcls/engine/engine.py
+++ b/ppcls/engine/engine.py
@@ -97,7 +97,7 @@ class Engine(object):
             paddle.__version__, self.device))
 
         # AMP training
-        self.amp = True if "AMP" in self.config else False
+        self.amp = True if "AMP" in self.config and self.mode == "train" else False
         if self.amp and self.config["AMP"] is not None:
             self.scale_loss = self.config["AMP"].get("scale_loss", 1.0)
             self.use_dynamic_loss_scaling = self.config["AMP"].get(
@@ -223,8 +223,11 @@ class Engine(object):
                 logger.warning(msg)
                 self.config['AMP']["level"] = "O1"
                 amp_level = "O1"
-            self.model = paddle.amp.decorate(
-                models=self.model, level=amp_level, save_dtype='float32')
+            self.model, self.optimizer = paddle.amp.decorate(
+                models=self.model,
+                optimizers=self.optimizer,
+                level=amp_level,
+                save_dtype='float32')
 
         # for distributed
         self.config["Global"][

From 58dccfd02457aa410fef1c983e59affbff865741 Mon Sep 17 00:00:00 2001
From: Tingquan Gao
Date: Fri, 14 Jan 2022 11:40:58 +0000
Subject: [PATCH 07/14] fix

---
 deploy/utils/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/utils/config.py b/deploy/utils/config.py
index eb7914806..7e7ffb79c 100644
--- a/deploy/utils/config.py
+++ b/deploy/utils/config.py
@@ -122,7 +122,7 @@ def override(dl, ks, v):
         if len(ks) == 1:
             # assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl))
             if not ks[0] in dl:
-                logger.warning('A new filed ({}) detected!'.format(ks[0], dl))
+                logger.warning('A new field ({}) detected!'.format(ks[0]))
             dl[ks[0]] = str2num(v)
         else:
             override(dl[ks[0]], ks[1:], v)

From 8f0bd5b58296d56d9cd230e104ce8bc395360701 Mon Sep 17 00:00:00 2001
From: Tingquan Gao
Date: Wed, 19 Jan 2022 05:57:21 +0000
Subject: [PATCH 08/14] fix: fix vdl makedir

---
 ppcls/engine/engine.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ppcls/engine/engine.py b/ppcls/engine/engine.py
index a58ac1e93..f86b092e3 100644
--- a/ppcls/engine/engine.py
+++ b/ppcls/engine/engine.py
@@ -84,7 +84,8 @@ class Engine(object):
 
         # for visualdl
         self.vdl_writer = None
-        if self.config['Global']['use_visualdl'] and mode == "train":
+        if self.config['Global'][
+                'use_visualdl'] and mode == "train" and dist.get_rank() == 0:
             vdl_writer_path = os.path.join(self.output_dir, "vdl")
             if not os.path.exists(vdl_writer_path):
                 os.makedirs(vdl_writer_path)

From bb6581d21bc18a1fbc7d5cc379f89818e826be11 Mon Sep 17 00:00:00 2001
From: Tingquan Gao
Date: Wed, 19 Jan 2022 06:26:01 +0000
Subject: [PATCH 09/14] refactor: raise warning when the number of gpus is not 4

---
 ppcls/engine/engine.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/ppcls/engine/engine.py b/ppcls/engine/engine.py
index f86b092e3..fc01de94c 100644
--- a/ppcls/engine/engine.py
+++ b/ppcls/engine/engine.py
@@ -231,11 +231,13 @@ class Engine(object):
                 save_dtype='float32')
 
         # for distributed
-        self.config["Global"][
-            "distributed"] = paddle.distributed.get_world_size() != 1
+        world_size = dist.get_world_size()
+        self.config["Global"]["distributed"] = world_size != 1
+        if world_size != 4 and self.mode == "train":
+            msg = f"The training strategy in config files provided by PaddleClas is based on 4 gpus. But the number of gpus is {world_size} in current training. Please modify the strategy (learning rate, batch size and so on) if you use config files in PaddleClas to train."
+            logger.warning(msg)
         if self.config["Global"]["distributed"]:
             dist.init_parallel_env()
-        if self.config["Global"]["distributed"]:
             self.model = paddle.DataParallel(self.model)
 
         # build postprocess for infer
@@ -346,8 +348,8 @@ class Engine(object):
     @paddle.no_grad()
     def infer(self):
         assert self.mode == "infer" and self.eval_mode == "classification"
-        total_trainer = paddle.distributed.get_world_size()
-        local_rank = paddle.distributed.get_rank()
+        total_trainer = dist.get_world_size()
+        local_rank = dist.get_rank()
         image_list = get_image_list(self.config["Infer"]["infer_imgs"])
         # data split
         image_list = image_list[local_rank::total_trainer]

From 0d22017e2e00a5c55011b3753a5fbfa4bf9a3a06 Mon Sep 17 00:00:00 2001
From: Tingquan Gao
Date: Wed, 19 Jan 2022 08:02:45 +0000
Subject: [PATCH 10/14] fix: move class_num from Global to Arch

---
 ppcls/configs/ImageNet/ESNet/ESNet_x0_25.yaml | 2 +-
 ppcls/configs/ImageNet/ESNet/ESNet_x0_5.yaml  | 2 +-
 ppcls/configs/ImageNet/ESNet/ESNet_x0_75.yaml | 2 +-
 ppcls/configs/ImageNet/ESNet/ESNet_x1_0.yaml  | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ppcls/configs/ImageNet/ESNet/ESNet_x0_25.yaml b/ppcls/configs/ImageNet/ESNet/ESNet_x0_25.yaml
index 9243cebf9..b34ba075c 100644
--- a/ppcls/configs/ImageNet/ESNet/ESNet_x0_25.yaml
+++ b/ppcls/configs/ImageNet/ESNet/ESNet_x0_25.yaml
@@ -4,7 +4,6 @@ Global:
   pretrained_model: null
   output_dir: ./output/
   device: gpu
-  class_num: 1000
   save_interval: 1
   eval_during_train: True
   eval_interval: 1
@@ -17,6 +16,7 @@ Global:
 # model architecture
 Arch:
   name: ESNet_x0_25
+  class_num: 1000
 
 # loss function config for traing/eval process
 Loss:
diff --git a/ppcls/configs/ImageNet/ESNet/ESNet_x0_5.yaml b/ppcls/configs/ImageNet/ESNet/ESNet_x0_5.yaml
index fa217f75e..0b82e0879 100644
--- a/ppcls/configs/ImageNet/ESNet/ESNet_x0_5.yaml
+++ b/ppcls/configs/ImageNet/ESNet/ESNet_x0_5.yaml
@@ -4,7 +4,6 @@ Global:
   pretrained_model: null
   output_dir: ./output/
   device: gpu
-  class_num: 1000
   save_interval: 1
   eval_during_train: True
   eval_interval: 1
@@ -17,6 +16,7 @@ Global:
 # model architecture
 Arch:
   name: ESNet_x0_5
+  class_num: 1000
 
 # loss function config for traing/eval process
 Loss:
diff --git a/ppcls/configs/ImageNet/ESNet/ESNet_x0_75.yaml b/ppcls/configs/ImageNet/ESNet/ESNet_x0_75.yaml
index 54b888d7c..76623973f 100644
--- a/ppcls/configs/ImageNet/ESNet/ESNet_x0_75.yaml
+++ b/ppcls/configs/ImageNet/ESNet/ESNet_x0_75.yaml
@@ -4,7 +4,6 @@ Global:
   pretrained_model: null
   output_dir: ./output/
   device: gpu
-  class_num: 1000
   save_interval: 1
   eval_during_train: True
   eval_interval: 1
@@ -17,6 +16,7 @@ Global:
 # model architecture
 Arch:
   name: ESNet_x0_75
+  class_num: 1000
 
 # loss function config for traing/eval process
 Loss:
diff --git a/ppcls/configs/ImageNet/ESNet/ESNet_x1_0.yaml b/ppcls/configs/ImageNet/ESNet/ESNet_x1_0.yaml
index ae99f9992..583efd2eb 100644
--- a/ppcls/configs/ImageNet/ESNet/ESNet_x1_0.yaml
+++ b/ppcls/configs/ImageNet/ESNet/ESNet_x1_0.yaml
@@ -4,7 +4,6 @@ Global:
   pretrained_model: null
   output_dir: ./output/
   device: gpu
-  class_num: 1000
   save_interval: 1
   eval_during_train: True
   eval_interval: 1
@@ -17,6 +16,7 @@ Global:
 # model architecture
 Arch:
   name: ESNet_x1_0
+  class_num: 1000
 
 # loss function config for traing/eval process
 Loss:

From 42134cd8ddaa4508c58a4bbc7b173d093a708ade Mon Sep 17 00:00:00 2001
From: Tingquan Gao
Date: Wed, 19 Jan 2022 08:29:20 +0000
Subject: [PATCH 11/14] fix: raise warning when using Global.class_num

---
 ppcls/engine/engine.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/ppcls/engine/engine.py b/ppcls/engine/engine.py
index fc01de94c..7f04221c8 100644
--- a/ppcls/engine/engine.py
+++ b/ppcls/engine/engine.py
@@ -113,6 +113,14 @@ class Engine(object):
             }
             paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)
 
+        if "class_num" in config["Global"]:
+            global_class_num = config["Global"]["class_num"]
+            if "class_num" not in config["Arch"]:
+                config["Arch"]["class_num"] = global_class_num
+                msg = f"The Global.class_num will be deprecated. Please use Arch.class_num instead. Arch.class_num has been set to {global_class_num}."
+            else:
+                msg = "The Global.class_num will be deprecated. Please use Arch.class_num instead. The Global.class_num has been ignored."
+            logger.warning(msg)
         #TODO(gaotingquan): support rec
         class_num = config["Arch"].get("class_num", None)
         self.config["DataLoader"].update({"class_num": class_num})

From 855f1385a71ef6d38680b0f1e2de062ce077094c Mon Sep 17 00:00:00 2001
From: Tingquan Gao
Date: Fri, 21 Jan 2022 09:12:50 +0000
Subject: [PATCH 12/14] docs: fix

---
 docs/zh_CN/advanced_tutorials/theseus_layer.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/zh_CN/advanced_tutorials/theseus_layer.md b/docs/zh_CN/advanced_tutorials/theseus_layer.md
index 0c3f1c555..56f2c9717 100644
--- a/docs/zh_CN/advanced_tutorials/theseus_layer.md
+++ b/docs/zh_CN/advanced_tutorials/theseus_layer.md
@@ -187,7 +187,7 @@
 print("The result returned by update_res(): ", res)
 output = net(pd_input)
 print("The output's keys of processed net: ", output.keys())
 # The output's keys of net: dict_keys(['output', 'blocks[0]', 'blocks[2]', 'blocks[4]', 'blocks[10]'])
-# 网络前向输出 output 为 dict 类型对象,其中,output["key"] 为网络最终输出,output["blocks[0]"] 等为网络中间层输出结果
+# 网络前向输出 output 为 dict 类型对象,其中,output["output"] 为网络最终输出,output["blocks[0]"] 等为网络中间层输出结果
 ```
 除了通过调用方法 `update_res()` 的方式之外,也同样可以在实例化网络对象时,通过指定参数 `return_patterns` 实现相同效果:

From ca8af5147200769564c706d07a3d4307ecbe1ed6 Mon Sep 17 00:00:00 2001
From: Tingquan Gao
Date: Fri, 21 Jan 2022 09:18:41 +0000
Subject: [PATCH 13/14] docs: update wechat qr code

---
 README_ch.md | 2 +-
 README_en.md | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README_ch.md b/README_ch.md
index a797e3f72..72801f577 100644
--- a/README_ch.md
+++ b/README_ch.md
@@ -37,7 +37,7 @@ Res2Net200_vd预训练模型Top-1精度高达85.1%。
 * 您可以扫描下面的微信群二维码, 加入PaddleClas 微信交流群。获得更高效的问题答疑,与各行各业开发者充分交流,期待您的加入。
 
 <div align="center">
-<img src="..."/>
+<img src="..."/>
 </div>
 
 ## 快速体验
diff --git a/README_en.md b/README_en.md
index 47abd67e7..6c1bab962 100644
--- a/README_en.md
+++ b/README_en.md
@@ -8,7 +8,7 @@ PaddleClas is an image recognition toolset for industry and academia, helping us
 
 **Recent updates**
 
-- 2021.09.17 Add PP-LCNet series model developed by PaddleClas, these models show strong competitiveness on Intel CPUs.
+- 2021.09.17 Add PP-LCNet series models developed by PaddleClas; these models show strong competitiveness on Intel CPUs. For the introduction of PP-LCNet, please refer to the [paper](https://arxiv.org/pdf/2109.15099.pdf) or the [PP-LCNet model introduction](docs/en/models/PP-LCNet_en.md). The metrics and pretrained model are available [here](docs/en/ImageNet_models_en.md).
 
 - 2021.06.29 Add Swin-transformer series model,Highest top1 acc on ImageNet1k dataset reaches 87.2%, training, evaluation and inference are all supported. Pretrained models can be downloaded [here](docs/en/models/models_intro_en.md).
@@ -41,7 +41,7 @@ Four sample solutions are provided, including product recognition, vehicle recog
 
 * You can also scan the QR code below to join the PaddleClas WeChat group to get more efficient answers to your questions and to communicate with developers from all walks of life. We look forward to hearing from you.
 
 <div align="center">
-<img src="..."/>
+<img src="..."/>
 </div>
 
 ## Quick Start

From 0924a169416bfd5395737908188683bf9567cbea Mon Sep 17 00:00:00 2001
From: dongshuilong
Date: Tue, 25 Jan 2022 19:35:38 +0800
Subject: [PATCH 14/14] add pp-shitu c++ link in readme

---
 README_ch.md | 2 +-
 README_en.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README_ch.md b/README_ch.md
index 72801f577..eeeec1a7a 100644
--- a/README_ch.md
+++ b/README_ch.md
@@ -71,7 +71,7 @@ PP-ShiTu图像识别快速体验:[点击这里](./docs/zh_CN/quick_start/quick
   - [模型导出](./docs/zh_CN/inference_deployment/export_model.md)
   - Python/C++ 预测引擎
     - [基于Python预测引擎预测推理](./docs/zh_CN/inference_deployment/python_deploy.md)
-    - [基于C++预测引擎预测推理](./docs/zh_CN/inference_deployment/cpp_deploy.md)(当前只支持图像分类任务,图像识别更新中)
+    - [基于C++分类预测引擎预测推理](./docs/zh_CN/inference_deployment/cpp_deploy.md)、[基于C++的PP-ShiTu预测引擎预测推理](deploy/cpp_shitu/readme.md)
   - 服务化部署
     - [Paddle Serving服务化部署(推荐)](./docs/zh_CN/inference_deployment/paddle_serving_deploy.md)
     - [Hub serving服务化部署](./docs/zh_CN/inference_deployment/paddle_hub_serving_deploy.md)
diff --git a/README_en.md b/README_en.md
index 6c1bab962..4163880f4 100644
--- a/README_en.md
+++ b/README_en.md
@@ -68,7 +68,7 @@ Quick experience of image recognition:[Link](./docs/en/tutorials/quick_start_r
   - [Feature Learning](./docs/en/tutorials/getting_started_retrieval_en.md)
   - Inference Model Prediction
     - [Python Inference](./docs/en/inference.md)
-    - [C++ Inference](./deploy/cpp/readme_en.md)(only support classification for now, recognition coming soon)
+    - [C++ Classification Inference](./deploy/cpp/readme_en.md), [C++ PP-ShiTu Inference](deploy/cpp_shitu/readme_en.md)
   - Model Deploy (only support classification for now, recognition coming soon)
     - [Hub Serving Deployment](./deploy/hubserving/readme_en.md)
     - [Mobile Deployment](./deploy/lite/readme_en.md)
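
Taken together, patches 05 and 06 replace the boolean use_pure_fp16 switch with an explicit AMP level ("O1" = mixed fp16 via black/white lists, "O2" = pure fp16) and decorate the model and optimizer together. The dygraph training pattern the series converges on is Paddle's standard decorate-plus-auto_cast loop. Below is a minimal standalone sketch of that pattern, not PaddleClas code: it assumes Paddle 2.2+ on a CUDA device, and the toy model, data, and amp_level value are illustrative stand-ins for the engine's config-driven settings.

import paddle
import paddle.nn as nn

# Illustrative stand-in for the engine's config-driven AMP.level value.
amp_level = "O1"  # "O1": mixed fp16; "O2": pure fp16

model = nn.Linear(4, 2)
optimizer = paddle.optimizer.Momentum(
    learning_rate=0.1,
    momentum=0.9,
    parameters=model.parameters(),
    multi_precision=True)  # fp32 master weights, as the configs now set

# Mirrors Engine.__init__: dynamic loss scaling, then decoration of the
# model and optimizer; under O2 this casts the weights to fp16.
scaler = paddle.amp.GradScaler(
    init_loss_scaling=128.0, use_dynamic_loss_scaling=True)
model, optimizer = paddle.amp.decorate(
    models=model, optimizers=optimizer, level=amp_level,
    save_dtype='float32')

x = paddle.randn([8, 4])  # toy batch
y = paddle.randn([8, 2])

# Mirrors train_epoch: forward under auto_cast, scaled backward, minimize.
with paddle.amp.auto_cast(
        custom_black_list={"flatten_contiguous_range", "greater_than"},
        level=amp_level):
    loss = nn.functional.mse_loss(model(x), y)
scaled = scaler.scale(loss)
scaled.backward()
scaler.minimize(optimizer, scaled)
optimizer.clear_grad()

Under "O1" only the ops inside the auto_cast region run in fp16, so decorate leaves the weights in fp32; under "O2" the weights themselves are cast to fp16, and save_dtype='float32' together with multi_precision=True keeps fp32 copies for checkpointing and parameter updates, which is why the configs in patch 05 set multi_precision: True unconditionally.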