diff --git a/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_224.yaml b/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_224.yaml
index 4655e02b3..a7697840e 100644
--- a/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_224.yaml
+++ b/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_224.yaml
@@ -42,11 +42,12 @@ Optimizer:
   no_weight_decay_name: pos_embed cls_token .bias norm 
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 1.25e-4
-    eta_min: 1.25e-6
+    learning_rate: 2.5e-4
+    eta_min: 2.5e-6
     warmup_epoch: 20
-    warmup_start_lr: 1.25e-7
+    warmup_start_lr: 2.5e-7
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_384.yaml b/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_384.yaml
index 1e6b1f79f..a7100289c 100644
--- a/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_384.yaml
+++ b/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_384.yaml
@@ -42,11 +42,12 @@ Optimizer:
   no_weight_decay_name: pos_embed cls_token .bias norm 
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 6.25e-5
-    eta_min: 6.25e-7
+    learning_rate: 1.25e-4
+    eta_min: 1.25e-6
     warmup_epoch: 20
-    warmup_start_lr: 6.25e-8
+    warmup_start_lr: 1.25e-7
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_224.yaml b/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_224.yaml
index ddeacadf0..7c96343df 100644
--- a/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_224.yaml
+++ b/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_224.yaml
@@ -42,11 +42,12 @@ Optimizer:
   no_weight_decay_name: pos_embed cls_token .bias norm 
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 1.25e-4
-    eta_min: 1.25e-6
+    learning_rate: 2.5e-4
+    eta_min: 2.5e-6
     warmup_epoch: 20
-    warmup_start_lr: 1.25e-7
+    warmup_start_lr: 2.5e-7
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_384.yaml b/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_384.yaml
index ab477ef2e..4b682fec6 100644
--- a/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_384.yaml
+++ b/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_384.yaml
@@ -42,11 +42,12 @@ Optimizer:
   no_weight_decay_name: pos_embed cls_token .bias norm 
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 3.125e-5
-    eta_min: 3.125e-7
+    learning_rate: 6.25e-5
+    eta_min: 6.25e-7
     warmup_epoch: 20
-    warmup_start_lr: 3.125e-8
+    warmup_start_lr: 6.25e-8
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_small_224.yaml b/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_small_224.yaml
index ec3c5a145..a191f4160 100644
--- a/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_small_224.yaml
+++ b/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_small_224.yaml
@@ -42,11 +42,12 @@ Optimizer:
   no_weight_decay_name: pos_embed cls_token .bias norm 
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 2.5e-4
-    eta_min: 2.5e-6
+    learning_rate: 5e-4
+    eta_min: 5e-6
     warmup_epoch: 20
-    warmup_start_lr: 2.5e-7
+    warmup_start_lr: 5e-7
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_tiny_224.yaml b/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_tiny_224.yaml
index 3e3f92525..3a2be2837 100644
--- a/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_tiny_224.yaml
+++ b/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_tiny_224.yaml
@@ -42,11 +42,12 @@ Optimizer:
   no_weight_decay_name: pos_embed cls_token .bias norm 
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 5e-4
-    eta_min: 5e-6
+    learning_rate: 1e-3
+    eta_min: 1e-5
     warmup_epoch: 20
-    warmup_start_lr: 5e-7
+    warmup_start_lr: 1e-6
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_224.yaml b/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_224.yaml
index 979a04a38..8c3cc4c34 100644
--- a/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_224.yaml
+++ b/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_224.yaml
@@ -40,11 +40,12 @@ Optimizer:
   no_weight_decay_name: norm cls_token pos_embed dist_token
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 1e-3
-    eta_min: 1e-5
+    learning_rate: 2e-3
+    eta_min: 2e-5
     warmup_epoch: 5
-    warmup_start_lr: 1e-6
+    warmup_start_lr: 2e-6
 
 # data loader for train and eval
 DataLoader:
diff --git a/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_384.yaml b/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_384.yaml
index 859f57d72..0b8c2e808 100644
--- a/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_384.yaml
+++ b/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_384.yaml
@@ -40,11 +40,12 @@ Optimizer:
   no_weight_decay_name: norm cls_token pos_embed dist_token
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 1e-3
-    eta_min: 1e-5
+    learning_rate: 2e-3
+    eta_min: 2e-5
     warmup_epoch: 5
-    warmup_start_lr: 1e-6
+    warmup_start_lr: 2e-6
 
 # data loader for train and eval
 DataLoader:
diff --git a/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_224.yaml b/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_224.yaml
index 3cdd10202..938916caa 100644
--- a/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_224.yaml
+++ b/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_224.yaml
@@ -40,11 +40,12 @@ Optimizer:
   no_weight_decay_name: norm cls_token pos_embed dist_token
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 1e-3
-    eta_min: 1e-5
+    learning_rate: 2e-3
+    eta_min: 2e-5
     warmup_epoch: 5
-    warmup_start_lr: 1e-6
+    warmup_start_lr: 2e-6
 
 # data loader for train and eval
 DataLoader:
diff --git a/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_384.yaml b/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_384.yaml
index 88a8fbae9..4cbe6ffde 100644
--- a/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_384.yaml
+++ b/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_384.yaml
@@ -40,11 +40,12 @@ Optimizer:
   no_weight_decay_name: norm cls_token pos_embed dist_token
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 1e-3
-    eta_min: 1e-5
+    learning_rate: 2e-3
+    eta_min: 2e-5
     warmup_epoch: 5
-    warmup_start_lr: 1e-6
+    warmup_start_lr: 2e-6
 
 # data loader for train and eval
 DataLoader:
diff --git a/ppcls/configs/ImageNet/DeiT/DeiT_small_distilled_patch16_224.yaml b/ppcls/configs/ImageNet/DeiT/DeiT_small_distilled_patch16_224.yaml
index 54d962e68..d5ba0cee7 100644
--- a/ppcls/configs/ImageNet/DeiT/DeiT_small_distilled_patch16_224.yaml
+++ b/ppcls/configs/ImageNet/DeiT/DeiT_small_distilled_patch16_224.yaml
@@ -41,10 +41,10 @@ Optimizer:
   one_dim_param_no_weight_decay: True
   lr:
     name: Cosine
-    learning_rate: 1e-3
-    eta_min: 1e-5
+    learning_rate: 2e-3
+    eta_min: 2e-5
     warmup_epoch: 5
-    warmup_start_lr: 1e-6
+    warmup_start_lr: 2e-6
 
 # data loader for train and eval
 DataLoader:
diff --git a/ppcls/configs/ImageNet/DeiT/DeiT_small_patch16_224.yaml b/ppcls/configs/ImageNet/DeiT/DeiT_small_patch16_224.yaml
index 05c3ac1f3..a167c896e 100644
--- a/ppcls/configs/ImageNet/DeiT/DeiT_small_patch16_224.yaml
+++ b/ppcls/configs/ImageNet/DeiT/DeiT_small_patch16_224.yaml
@@ -40,11 +40,12 @@ Optimizer:
   no_weight_decay_name: norm cls_token pos_embed dist_token
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 1e-3
-    eta_min: 1e-5
+    learning_rate: 2e-3
+    eta_min: 2e-5
     warmup_epoch: 5
-    warmup_start_lr: 1e-6
+    warmup_start_lr: 2e-6
 
 # data loader for train and eval
 DataLoader:
diff --git a/ppcls/configs/ImageNet/DeiT/DeiT_tiny_distilled_patch16_224.yaml b/ppcls/configs/ImageNet/DeiT/DeiT_tiny_distilled_patch16_224.yaml
index f66617613..319e17025 100644
--- a/ppcls/configs/ImageNet/DeiT/DeiT_tiny_distilled_patch16_224.yaml
+++ b/ppcls/configs/ImageNet/DeiT/DeiT_tiny_distilled_patch16_224.yaml
@@ -40,11 +40,12 @@ Optimizer:
   no_weight_decay_name: norm cls_token pos_embed dist_token
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 1e-3
-    eta_min: 1e-5
+    learning_rate: 2e-3
+    eta_min: 2e-5
     warmup_epoch: 5
-    warmup_start_lr: 1e-6
+    warmup_start_lr: 2e-6
 
 # data loader for train and eval
 DataLoader:
diff --git a/ppcls/configs/ImageNet/DeiT/DeiT_tiny_patch16_224.yaml b/ppcls/configs/ImageNet/DeiT/DeiT_tiny_patch16_224.yaml
index 647050a77..1234d79b6 100644
--- a/ppcls/configs/ImageNet/DeiT/DeiT_tiny_patch16_224.yaml
+++ b/ppcls/configs/ImageNet/DeiT/DeiT_tiny_patch16_224.yaml
@@ -40,11 +40,12 @@ Optimizer:
   no_weight_decay_name: norm cls_token pos_embed dist_token
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 1e-3
-    eta_min: 1e-5
+    learning_rate: 2e-3
+    eta_min: 2e-5
     warmup_epoch: 5
-    warmup_start_lr: 1e-6
+    warmup_start_lr: 2e-6
 
 # data loader for train and eval
 DataLoader:
diff --git a/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml b/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml
index a7265b066..b230f11cb 100644
--- a/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml
+++ b/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml
@@ -49,9 +49,8 @@ Loss:
         model_name_pairs:
         - ["Student", "Teacher"]
   Eval:
-    - DistillationGTCELoss:
+    - CELoss:
         weight: 1.0
-        model_names: ["Student"]
         
 
 Optimizer:
diff --git a/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_afd.yaml b/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_afd.yaml
index e5b8b7162..000cb9add 100644
--- a/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_afd.yaml
+++ b/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_afd.yaml
@@ -88,10 +88,8 @@ Loss:
         s_shapes: *s_shapes
         t_shapes: *t_shapes
   Eval:
-    - DistillationGTCELoss:
+    - CELoss:
         weight: 1.0
-        model_names: ["Student"]
-        
 
 Optimizer:
   name: Momentum
diff --git a/ppcls/configs/ImageNet/PVTV2/PVT_V2_B0.yaml b/ppcls/configs/ImageNet/PVTV2/PVT_V2_B0.yaml
index 6c0854cb4..27fc20b99 100644
--- a/ppcls/configs/ImageNet/PVTV2/PVT_V2_B0.yaml
+++ b/ppcls/configs/ImageNet/PVTV2/PVT_V2_B0.yaml
@@ -43,11 +43,12 @@ Optimizer:
   no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 5e-4
-    eta_min: 5e-6
+    learning_rate: 1e-3
+    eta_min: 1e-5
     warmup_epoch: 20
-    warmup_start_lr: 5e-7
+    warmup_start_lr: 1e-6
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/PVTV2/PVT_V2_B1.yaml b/ppcls/configs/ImageNet/PVTV2/PVT_V2_B1.yaml
index 42134c740..20fa39773 100644
--- a/ppcls/configs/ImageNet/PVTV2/PVT_V2_B1.yaml
+++ b/ppcls/configs/ImageNet/PVTV2/PVT_V2_B1.yaml
@@ -43,11 +43,12 @@ Optimizer:
   no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 5e-4
-    eta_min: 5e-6
+    learning_rate: 1e-3
+    eta_min: 1e-5
     warmup_epoch: 20
-    warmup_start_lr: 5e-7
+    warmup_start_lr: 1e-6
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2.yaml b/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2.yaml
index 4d0d5a432..cda94496e 100644
--- a/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2.yaml
+++ b/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2.yaml
@@ -43,11 +43,12 @@ Optimizer:
   no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 5e-4
-    eta_min: 5e-6
+    learning_rate: 1e-3
+    eta_min: 1e-5
     warmup_epoch: 20
-    warmup_start_lr: 5e-7
+    warmup_start_lr: 1e-6
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2_Linear.yaml b/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2_Linear.yaml
index a5feb260b..2d48178f0 100644
--- a/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2_Linear.yaml
+++ b/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2_Linear.yaml
@@ -43,11 +43,12 @@ Optimizer:
   no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 5e-4
-    eta_min: 5e-6
+    learning_rate: 1e-3
+    eta_min: 1e-5
     warmup_epoch: 20
-    warmup_start_lr: 5e-7
+    warmup_start_lr: 1e-6
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/PVTV2/PVT_V2_B3.yaml b/ppcls/configs/ImageNet/PVTV2/PVT_V2_B3.yaml
index be300aca6..581a70605 100644
--- a/ppcls/configs/ImageNet/PVTV2/PVT_V2_B3.yaml
+++ b/ppcls/configs/ImageNet/PVTV2/PVT_V2_B3.yaml
@@ -44,11 +44,12 @@ Optimizer:
   no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 5e-4
-    eta_min: 5e-6
+    learning_rate: 1e-3
+    eta_min: 1e-5
     warmup_epoch: 20
-    warmup_start_lr: 5e-7
+    warmup_start_lr: 1e-6
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/PVTV2/PVT_V2_B4.yaml b/ppcls/configs/ImageNet/PVTV2/PVT_V2_B4.yaml
index b6a895339..92da84d1e 100644
--- a/ppcls/configs/ImageNet/PVTV2/PVT_V2_B4.yaml
+++ b/ppcls/configs/ImageNet/PVTV2/PVT_V2_B4.yaml
@@ -44,11 +44,12 @@ Optimizer:
   no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 5e-4
-    eta_min: 5e-6
+    learning_rate: 1e-3
+    eta_min: 1e-5
     warmup_epoch: 20
-    warmup_start_lr: 5e-7
+    warmup_start_lr: 1e-6
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/PVTV2/PVT_V2_B5.yaml b/ppcls/configs/ImageNet/PVTV2/PVT_V2_B5.yaml
index 9d36b2807..4bb2449a4 100644
--- a/ppcls/configs/ImageNet/PVTV2/PVT_V2_B5.yaml
+++ b/ppcls/configs/ImageNet/PVTV2/PVT_V2_B5.yaml
@@ -44,11 +44,12 @@ Optimizer:
   no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 5e-4
-    eta_min: 5e-6
+    learning_rate: 1e-3
+    eta_min: 1e-5
     warmup_epoch: 20
-    warmup_start_lr: 5e-7
+    warmup_start_lr: 1e-6
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window12_384.yaml b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window12_384.yaml
index 4dd0ac4cf..afc3fdcd2 100644
--- a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window12_384.yaml
+++ b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window12_384.yaml
@@ -41,11 +41,12 @@ Optimizer:
   no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm 
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 5e-4
-    eta_min: 1e-5
+    learning_rate: 1e-3
+    eta_min: 2e-5
     warmup_epoch: 20
-    warmup_start_lr: 1e-6
+    warmup_start_lr: 2e-6
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window7_224.yaml b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window7_224.yaml
index a42dea1f9..4920fae6c 100644
--- a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window7_224.yaml
+++ b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window7_224.yaml
@@ -41,11 +41,12 @@ Optimizer:
   no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm 
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 5e-4
-    eta_min: 1e-5
+    learning_rate: 1e-3
+    eta_min: 2e-5
     warmup_epoch: 20
-    warmup_start_lr: 1e-6
+    warmup_start_lr: 2e-6
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window12_384.yaml b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window12_384.yaml
index 36b5e5e38..a6dd74267 100644
--- a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window12_384.yaml
+++ b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window12_384.yaml
@@ -41,11 +41,12 @@ Optimizer:
   no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm 
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 5e-4
-    eta_min: 1e-5
+    learning_rate: 1e-3
+    eta_min: 2e-5
     warmup_epoch: 20
-    warmup_start_lr: 1e-6
+    warmup_start_lr: 2e-6
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window7_224.yaml b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window7_224.yaml
index 96a9befd2..564da72f1 100644
--- a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window7_224.yaml
+++ b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window7_224.yaml
@@ -41,11 +41,12 @@ Optimizer:
   no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm 
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 5e-4
-    eta_min: 1e-5
+    learning_rate: 1e-3
+    eta_min: 2e-5
     warmup_epoch: 20
-    warmup_start_lr: 1e-6
+    warmup_start_lr: 2e-6
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_small_patch4_window7_224.yaml b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_small_patch4_window7_224.yaml
index ffbbcf080..ba42f1efb 100644
--- a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_small_patch4_window7_224.yaml
+++ b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_small_patch4_window7_224.yaml
@@ -41,11 +41,12 @@ Optimizer:
   no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm 
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 5e-4
-    eta_min: 1e-5
+    learning_rate: 1e-3
+    eta_min: 2e-5
     warmup_epoch: 20
-    warmup_start_lr: 1e-6
+    warmup_start_lr: 2e-6
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml
index 066db715d..26fa0ba61 100644
--- a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml
+++ b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml
@@ -41,11 +41,12 @@ Optimizer:
   no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm 
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 5e-4
-    eta_min: 1e-5
+    learning_rate: 1e-3
+    eta_min: 2e-5
     warmup_epoch: 20
-    warmup_start_lr: 1e-6
+    warmup_start_lr: 2e-6
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/Twins/alt_gvt_base.yaml b/ppcls/configs/ImageNet/Twins/alt_gvt_base.yaml
index 74c402ee7..36e5b086d 100644
--- a/ppcls/configs/ImageNet/Twins/alt_gvt_base.yaml
+++ b/ppcls/configs/ImageNet/Twins/alt_gvt_base.yaml
@@ -43,11 +43,12 @@ Optimizer:
   no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 5e-4
-    eta_min: 1e-5
+    learning_rate: 1e-3
+    eta_min: 2e-5
     warmup_epoch: 5
-    warmup_start_lr: 1e-6
+    warmup_start_lr: 2e-6
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/Twins/alt_gvt_large.yaml b/ppcls/configs/ImageNet/Twins/alt_gvt_large.yaml
index ca66e9a33..6e19d6461 100644
--- a/ppcls/configs/ImageNet/Twins/alt_gvt_large.yaml
+++ b/ppcls/configs/ImageNet/Twins/alt_gvt_large.yaml
@@ -43,11 +43,12 @@ Optimizer:
   no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 5e-4
-    eta_min: 1e-5
+    learning_rate: 1e-3
+    eta_min: 2e-5
     warmup_epoch: 5
-    warmup_start_lr: 1e-6
+    warmup_start_lr: 2e-6
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/Twins/alt_gvt_small.yaml b/ppcls/configs/ImageNet/Twins/alt_gvt_small.yaml
index 9e97c0f99..66235960a 100644
--- a/ppcls/configs/ImageNet/Twins/alt_gvt_small.yaml
+++ b/ppcls/configs/ImageNet/Twins/alt_gvt_small.yaml
@@ -43,11 +43,12 @@ Optimizer:
   no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 5e-4
-    eta_min: 1e-5
+    learning_rate: 1e-3
+    eta_min: 2e-5
     warmup_epoch: 5
-    warmup_start_lr: 1e-6
+    warmup_start_lr: 2e-6
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/Twins/pcpvt_base.yaml b/ppcls/configs/ImageNet/Twins/pcpvt_base.yaml
index 7831e9289..96745495a 100644
--- a/ppcls/configs/ImageNet/Twins/pcpvt_base.yaml
+++ b/ppcls/configs/ImageNet/Twins/pcpvt_base.yaml
@@ -43,11 +43,12 @@ Optimizer:
   no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 5e-4
-    eta_min: 1e-5
+    learning_rate: 1e-3
+    eta_min: 2e-5
     warmup_epoch: 5
-    warmup_start_lr: 1e-6
+    warmup_start_lr: 2e-6
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/Twins/pcpvt_large.yaml b/ppcls/configs/ImageNet/Twins/pcpvt_large.yaml
index 8e160b3c2..ca4baf942 100644
--- a/ppcls/configs/ImageNet/Twins/pcpvt_large.yaml
+++ b/ppcls/configs/ImageNet/Twins/pcpvt_large.yaml
@@ -43,11 +43,12 @@ Optimizer:
   no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 5e-4
-    eta_min: 1e-5
+    learning_rate: 1e-3
+    eta_min: 2e-5
     warmup_epoch: 5
-    warmup_start_lr: 1e-6
+    warmup_start_lr: 2e-6
 
 
 # data loader for train and eval
diff --git a/ppcls/configs/ImageNet/Twins/pcpvt_small.yaml b/ppcls/configs/ImageNet/Twins/pcpvt_small.yaml
index 582382d4d..a5e5f7e05 100644
--- a/ppcls/configs/ImageNet/Twins/pcpvt_small.yaml
+++ b/ppcls/configs/ImageNet/Twins/pcpvt_small.yaml
@@ -43,11 +43,12 @@ Optimizer:
   no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block
   one_dim_param_no_weight_decay: True
   lr:
+    # for 8 cards
     name: Cosine
-    learning_rate: 5e-4
-    eta_min: 1e-5
+    learning_rate: 1e-3
+    eta_min: 2e-5
     warmup_epoch: 5
-    warmup_start_lr: 1e-6
+    warmup_start_lr: 2e-6
 
 
 # data loader for train and eval
diff --git a/ppcls/engine/engine.py b/ppcls/engine/engine.py
index 05151a1bb..ca851c626 100644
--- a/ppcls/engine/engine.py
+++ b/ppcls/engine/engine.py
@@ -262,12 +262,17 @@ class Engine(object):
             self.model_ema = ExponentialMovingAverage(
                 self.model, self.config['EMA'].get("decay", 0.9999))
 
-        # for distributed
+        # check the gpu num
         world_size = dist.get_world_size()
         self.config["Global"]["distributed"] = world_size != 1
-        if world_size != 4 and self.mode == "train":
-            msg = f"The training strategy in config files provided by PaddleClas is based on 4 gpus. But the number of gpus is {world_size} in current training. Please modify the stategy (learning rate, batch size and so on) if use config files in PaddleClas to train."
-            logger.warning(msg)
+        if self.mode == "train":
+            std_gpu_num = 8 if self.config["Optimizer"][
+                "name"] == "AdamW" else 4
+            if world_size != std_gpu_num:
+                msg = f"The training strategy provided by PaddleClas is based on {std_gpu_num} gpus. But the number of gpu is {world_size} in current training. Please modify the stategy (learning rate, batch size and so on) if use this config to train."
+                logger.warning(msg)
+
+        # for distributed
         if self.config["Global"]["distributed"]:
             dist.init_parallel_env()
             self.model = paddle.DataParallel(self.model)
diff --git a/ppcls/engine/evaluation/classification.py b/ppcls/engine/evaluation/classification.py
index 6e7fc1a76..f4c90a393 100644
--- a/ppcls/engine/evaluation/classification.py
+++ b/ppcls/engine/evaluation/classification.py
@@ -80,22 +80,17 @@ def classification_eval(engine, epoch_id=0):
         current_samples = batch_size * paddle.distributed.get_world_size()
         accum_samples += current_samples
 
+        if isinstance(out, dict) and "Student" in out:
+            out = out["Student"]
+        if isinstance(out, dict) and "logits" in out:
+            out = out["logits"]
+
         # gather Tensor when distributed
         if paddle.distributed.get_world_size() > 1:
             label_list = []
             paddle.distributed.all_gather(label_list, batch[1])
             labels = paddle.concat(label_list, 0)
 
-            if isinstance(out, dict):
-                if "Student" in out:
-                    out = out["Student"]
-                    if isinstance(out, dict):
-                        out = out["logits"]
-                elif "logits" in out:
-                    out = out["logits"]
-                else:
-                    msg = "Error: Wrong key in out!"
-                    raise Exception(msg)
             if isinstance(out, list):
                 preds = []
                 for x in out:
diff --git a/ppcls/loss/deephashloss.py b/ppcls/loss/deephashloss.py
index 959fd11ad..7dda519a8 100644
--- a/ppcls/loss/deephashloss.py
+++ b/ppcls/loss/deephashloss.py
@@ -20,6 +20,7 @@ class DSHSDLoss(nn.Layer):
     """
     # DSHSD(IEEE ACCESS 2019)
     # paper [Deep Supervised Hashing Based on Stable Distribution](https://ieeexplore.ieee.org/document/8648432/)
+    # code reference: https://github.com/swuxyj/DeepHash-pytorch/blob/master/DSHSD.py
     """
 
     def __init__(self, alpha, multi_label=False):
@@ -62,6 +63,7 @@ class DSHSDLoss(nn.Layer):
 class LCDSHLoss(nn.Layer):
     """
     # paper [Locality-Constrained Deep Supervised Hashing for Image Retrieval](https://www.ijcai.org/Proceedings/2017/0499.pdf)
+    # code reference: https://github.com/swuxyj/DeepHash-pytorch/blob/master/LCDSH.py
     """
 
     def __init__(self, n_class, _lambda):
@@ -100,6 +102,7 @@ class DCHLoss(paddle.nn.Layer):
     """
     # paper [Deep Cauchy Hashing for Hamming Space Retrieval]
     URL:(http://ise.thss.tsinghua.edu.cn/~mlong/doc/deep-cauchy-hashing-cvpr18.pdf)
+    # code reference: https://github.com/swuxyj/DeepHash-pytorch/blob/master/DCH.py
     """
 
     def __init__(self, gamma, _lambda, n_class):
diff --git a/ppcls/loss/emlloss.py b/ppcls/loss/emlloss.py
index 973570389..38b707fe1 100644
--- a/ppcls/loss/emlloss.py
+++ b/ppcls/loss/emlloss.py
@@ -23,6 +23,11 @@ from .comfunc import rerange_index
 
 
 class EmlLoss(paddle.nn.Layer):
+    """Ensemble Metric Learning Loss
+    paper: [Large Scale Strongly Supervised Ensemble Metric Learning, with Applications to Face Verification and Retrieval](https://arxiv.org/pdf/1212.6094.pdf)
+    code reference: https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/metric_learning/losses/emlloss.py
+    """
+
     def __init__(self, batch_size=40, samples_each_class=2):
         super(EmlLoss, self).__init__()
         assert (batch_size % samples_each_class == 0)
diff --git a/ppcls/loss/googlenetloss.py b/ppcls/loss/googlenetloss.py
index c580aa617..491311831 100644
--- a/ppcls/loss/googlenetloss.py
+++ b/ppcls/loss/googlenetloss.py
@@ -18,11 +18,13 @@ import paddle.nn.functional as F
 class GoogLeNetLoss(nn.Layer):
     """
     Cross entropy loss used after googlenet
+    reference paper: [Going Deeper with Convolutions](https://arxiv.org/pdf/1409.4842v1.pdf)
     """
+
     def __init__(self, epsilon=None):
         super().__init__()
-        assert (epsilon is None or epsilon <= 0 or epsilon >= 1), "googlenet is not support label_smooth"
-            
+        assert (epsilon is None or epsilon <= 0 or
+                epsilon >= 1), "googlenet is not support label_smooth"
 
     def forward(self, inputs, label):
         input0, input1, input2 = inputs
diff --git a/ppcls/loss/msmloss.py b/ppcls/loss/msmloss.py
index 3aa0dd8bf..adf03ef8e 100644
--- a/ppcls/loss/msmloss.py
+++ b/ppcls/loss/msmloss.py
@@ -21,10 +21,12 @@ from .comfunc import rerange_index
 
 class MSMLoss(paddle.nn.Layer):
     """
-    MSMLoss Loss, based on triplet loss. USE P * K samples.
+    paper: [Margin Sample Mining Loss: A Deep Learning Based Method for Person Re-identification](https://arxiv.org/pdf/1710.00478.pdf)
+    code reference: https://github.com/michuanhaohao/keras_reid/blob/master/reid_tripletcls.py
+    Margin Sample Mining Loss, based on triplet loss. USE P * K samples.
     the batch size is fixed. Batch_size = P * K;  but the K may vary between batches.
     same label gather together
-    
+
             supported_metrics = [
             'euclidean',
             'sqeuclidean',
@@ -41,7 +43,7 @@ class MSMLoss(paddle.nn.Layer):
         self.rerange_index = rerange_index(batch_size, samples_each_class)
 
     def forward(self, input, target=None):
-        #normalization 
+        #normalization
         features = input["features"]
         features = self._nomalize(features)
         samples_each_class = self.samples_each_class
@@ -53,7 +55,7 @@ class MSMLoss(paddle.nn.Layer):
                 features, axis=0)
         similary_matrix = paddle.sum(paddle.square(diffs), axis=-1)
 
-        #rerange 
+        #rerange
         tmp = paddle.reshape(similary_matrix, shape=[-1, 1])
         tmp = paddle.gather(tmp, index=rerange_index)
         similary_matrix = paddle.reshape(tmp, shape=[-1, self.batch_size])
diff --git a/ppcls/loss/npairsloss.py b/ppcls/loss/npairsloss.py
index d4b359e88..131c799a4 100644
--- a/ppcls/loss/npairsloss.py
+++ b/ppcls/loss/npairsloss.py
@@ -5,6 +5,11 @@ import paddle
 
 
 class NpairsLoss(paddle.nn.Layer):
+    """Npair_loss_
+    paper: [Improved deep metric learning with multi-class N-pair loss objective](https://dl.acm.org/doi/10.5555/3157096.3157304)
+    code reference: https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/contrib/losses/metric_learning/npairs_loss
+    """
+
     def __init__(self, reg_lambda=0.01):
         super(NpairsLoss, self).__init__()
         self.reg_lambda = reg_lambda
diff --git a/ppcls/loss/pairwisecosface.py b/ppcls/loss/pairwisecosface.py
index beb806863..7f146dea5 100644
--- a/ppcls/loss/pairwisecosface.py
+++ b/ppcls/loss/pairwisecosface.py
@@ -23,6 +23,11 @@ import paddle.nn.functional as F
 
 
 class PairwiseCosface(nn.Layer):
+    """
+    paper: [Circle Loss: A Unified Perspective of Pair Similarity Optimization](https://arxiv.org/abs/2002.10857)
+    code reference: https://github.com/leoluopy/circle-loss-demonstration/blob/main/circle_loss.py
+    """
+
     def __init__(self, margin, gamma):
         super(PairwiseCosface, self).__init__()
         self.margin = margin
@@ -36,8 +41,10 @@ class PairwiseCosface(nn.Layer):
         dist_mat = paddle.matmul(embedding, embedding, transpose_y=True)
 
         N = dist_mat.shape[0]
-        is_pos = targets.reshape([N,1]).expand([N,N]).equal(paddle.t(targets.reshape([N,1]).expand([N,N]))).astype('float')
-        is_neg = targets.reshape([N,1]).expand([N,N]).not_equal(paddle.t(targets.reshape([N,1]).expand([N,N]))).astype('float')
+        is_pos = targets.reshape([N, 1]).expand([N, N]).equal(
+            paddle.t(targets.reshape([N, 1]).expand([N, N]))).astype('float')
+        is_neg = targets.reshape([N, 1]).expand([N, N]).not_equal(
+            paddle.t(targets.reshape([N, 1]).expand([N, N]))).astype('float')
 
         # Mask scores related to itself
         is_pos = is_pos - paddle.eye(N, N)
@@ -46,10 +53,12 @@ class PairwiseCosface(nn.Layer):
         s_n = dist_mat * is_neg
 
         logit_p = -self.gamma * s_p + (-99999999.) * (1 - is_pos)
-        logit_n = self.gamma * (s_n + self.margin) + (-99999999.) * (1 - is_neg)
+        logit_n = self.gamma * (s_n + self.margin) + (-99999999.) * (1 - is_neg
+                                                                     )
+
+        loss = F.softplus(
+            paddle.logsumexp(
+                logit_p, axis=1) + paddle.logsumexp(
+                    logit_n, axis=1)).mean()
 
-        loss = F.softplus(paddle.logsumexp(logit_p, axis=1) + paddle.logsumexp(logit_n, axis=1)).mean()
-  
         return {"PairwiseCosface": loss}
-
-
diff --git a/ppcls/loss/rkdloss.py b/ppcls/loss/rkdloss.py
index e6ffea273..aa6ae2324 100644
--- a/ppcls/loss/rkdloss.py
+++ b/ppcls/loss/rkdloss.py
@@ -29,6 +29,7 @@ def pdist(e, squared=False, eps=1e-12):
 
 
 class RKdAngle(nn.Layer):
+    # paper: [Relational Knowledge Distillation](https://arxiv.org/abs/1904.05068)
     # reference: https://github.com/lenscloth/RKD/blob/master/metric/loss.py
     def __init__(self, target_size=None):
         super().__init__()
@@ -64,6 +65,7 @@ class RKdAngle(nn.Layer):
 
 
 class RkdDistance(nn.Layer):
+    # paper: [Relational Knowledge Distillation](https://arxiv.org/abs/1904.05068)
     # reference: https://github.com/lenscloth/RKD/blob/master/metric/loss.py
     def __init__(self, eps=1e-12, target_size=1):
         super().__init__()
diff --git a/ppcls/loss/supconloss.py b/ppcls/loss/supconloss.py
index 3dd33bc19..753ceaf41 100644
--- a/ppcls/loss/supconloss.py
+++ b/ppcls/loss/supconloss.py
@@ -4,6 +4,7 @@ from paddle import nn
 
 class SupConLoss(nn.Layer):
     """Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf.
+    code reference: https://github.com/HobbitLong/SupContrast/blob/master/losses.py
     It also supports the unsupervised contrastive loss in SimCLR"""
 
     def __init__(self,
diff --git a/ppcls/loss/trihardloss.py b/ppcls/loss/trihardloss.py
index 132c604d5..96cb42cb4 100644
--- a/ppcls/loss/trihardloss.py
+++ b/ppcls/loss/trihardloss.py
@@ -22,10 +22,12 @@ from .comfunc import rerange_index
 
 class TriHardLoss(paddle.nn.Layer):
     """
+    paper: [In Defense of the Triplet Loss for Person Re-Identification](https://arxiv.org/abs/1703.07737)
+    code reference: https://github.com/VisualComputingInstitute/triplet-reid/blob/master/loss.py
     TriHard Loss, based on triplet loss. USE P * K samples.
     the batch size is fixed. Batch_size = P * K;  but the K may vary between batches.
     same label gather together
-    
+
             supported_metrics = [
             'euclidean',
             'sqeuclidean',
@@ -45,7 +47,7 @@ class TriHardLoss(paddle.nn.Layer):
         features = input["features"]
         assert (self.batch_size == features.shape[0])
 
-        #normalization 
+        #normalization
         features = self._nomalize(features)
         samples_each_class = self.samples_each_class
         rerange_index = paddle.to_tensor(self.rerange_index)
@@ -56,7 +58,7 @@ class TriHardLoss(paddle.nn.Layer):
                 features, axis=0)
         similary_matrix = paddle.sum(paddle.square(diffs), axis=-1)
 
-        #rerange 
+        #rerange
         tmp = paddle.reshape(similary_matrix, shape=[-1, 1])
         tmp = paddle.gather(tmp, index=rerange_index)
         similary_matrix = paddle.reshape(tmp, shape=[-1, self.batch_size])
diff --git a/ppcls/loss/triplet.py b/ppcls/loss/triplet.py
index d1c7eec9e..458ee2e27 100644
--- a/ppcls/loss/triplet.py
+++ b/ppcls/loss/triplet.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -8,6 +22,8 @@ import paddle.nn as nn
 
 class TripletLossV2(nn.Layer):
     """Triplet loss with hard positive/negative mining.
+    paper: [FaceNet: A unified embedding for face recognition and clustering](https://arxiv.org/pdf/1503.03832.pdf)
+    code reference: https://github.com/okzhili/Cartoon-face-recognition/blob/master/loss/triplet_loss.py
     Args:
         margin (float): margin for triplet.
     """
diff --git a/ppcls/optimizer/__init__.py b/ppcls/optimizer/__init__.py
index d27f1100e..44d7b5ac0 100644
--- a/ppcls/optimizer/__init__.py
+++ b/ppcls/optimizer/__init__.py
@@ -118,8 +118,6 @@ def build_optimizer(config, epochs, step_each_epoch, model_list=None):
                     if hasattr(model_list[i], optim_scope):
                         optim_model.append(getattr(model_list[i], optim_scope))
 
-        assert len(optim_model) == 1, \
-            "Invalid optim model for optim scope({}), number of optim_model={}".format(optim_scope, len(optim_model))
         optim = getattr(optimizer, optim_name)(
             learning_rate=lr, grad_clip=grad_clip,
             **optim_cfg)(model_list=optim_model)
diff --git a/test_tipc/config/CSWinTransformer/CSWinTransformer_tiny_224_train_infer_python.txt b/test_tipc/config/CSWinTransformer/CSWinTransformer_tiny_224_train_infer_python.txt
index 11b2f9dd9..03f5e3eed 100644
--- a/test_tipc/config/CSWinTransformer/CSWinTransformer_tiny_224_train_infer_python.txt
+++ b/test_tipc/config/CSWinTransformer/CSWinTransformer_tiny_224_train_infer_python.txt
@@ -13,14 +13,14 @@ train_infer_img_dir:./dataset/ILSVRC2012/val
 null:null
 ##
 trainer:norm_train
-norm_train:tools/train.py -c ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_tiny_224.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False
+norm_train:tools/train.py -c ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_tiny_224.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False -o Global.print_batch_step=1
 pact_train:null
 fpgm_train:null
 distill_train:null
 null:null
 null:null
 ##
-===========================eval_params=========================== 
+===========================eval_params===========================
 eval:tools/eval.py -c ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_tiny_224.yaml
 null:null
 ##
diff --git a/test_tipc/config/MobileViT/MobileViT_S_train_infer_python.txt b/test_tipc/config/MobileViT/MobileViT_S_train_infer_python.txt
index d69f26412..06fda8fe6 100644
--- a/test_tipc/config/MobileViT/MobileViT_S_train_infer_python.txt
+++ b/test_tipc/config/MobileViT/MobileViT_S_train_infer_python.txt
@@ -13,14 +13,14 @@ train_infer_img_dir:./dataset/ILSVRC2012/val
 null:null
 ##
 trainer:norm_train
-norm_train:tools/train.py -c ppcls/configs/ImageNet/MobileViT/MobileViT_S.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False
+norm_train:tools/train.py -c ppcls/configs/ImageNet/MobileViT/MobileViT_S.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False -o Global.print_batch_step=1
 pact_train:null
 fpgm_train:null
 distill_train:null
 null:null
 null:null
 ##
-===========================eval_params=========================== 
+===========================eval_params===========================
 eval:tools/eval.py -c ppcls/configs/ImageNet/MobileViT/MobileViT_S.yaml
 null:null
 ##
diff --git a/test_tipc/config/PVTV2/PVT_V2_B2_Linear_train_infer_python.txt b/test_tipc/config/PVTV2/PVT_V2_B2_Linear_train_infer_python.txt
index b2aa7df69..f50107fea 100644
--- a/test_tipc/config/PVTV2/PVT_V2_B2_Linear_train_infer_python.txt
+++ b/test_tipc/config/PVTV2/PVT_V2_B2_Linear_train_infer_python.txt
@@ -13,7 +13,7 @@ train_infer_img_dir:./dataset/ILSVRC2012/val
 null:null
 ##
 trainer:norm_train
-norm_train:tools/train.py -c ppcls/configs/ImageNet/PVTV2/PVT_V2_B2_Linear.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False
+norm_train:tools/train.py -c ppcls/configs/ImageNet/PVTV2/PVT_V2_B2_Linear.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False -o Global.print_batch_step=1
 pact_train:null
 fpgm_train:null
 distill_train:null
diff --git a/test_tipc/prepare.sh b/test_tipc/prepare.sh
index c5be87074..70040dc8b 100644
--- a/test_tipc/prepare.sh
+++ b/test_tipc/prepare.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 FILENAME=$1
 
-# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer',  
+# MODE must be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer',
 #                 'whole_infer', 'klquant_whole_infer',
 #                 'cpp_infer', 'serving_infer',  'lite_infer']
 
@@ -67,9 +67,9 @@ if [ ${MODE} = "cpp_infer" ];then
 	    model_dir=${tar_name%.*}
 	    eval "tar xf ${tar_name}"
 	    eval "mv ${model_dir} ${cls_inference_model_dir}"
-	    
+
 	    eval "wget -nc $det_inference_url"
-	    tar_name=$(func_get_url_file_name "$det_inference_url") 
+	    tar_name=$(func_get_url_file_name "$det_inference_url")
 	    model_dir=${tar_name%.*}
 	    eval "tar xf ${tar_name}"
 	    eval "mv ${model_dir} ${det_inference_model_dir}"
@@ -120,7 +120,7 @@ if [ ${MODE} = "lite_train_lite_infer" ] || [ ${MODE} = "lite_train_whole_infer"
     wget -nc https://paddle-imagenet-models-name.bj.bcebos.com/data/whole_chain/whole_chain_little_train.tar
     tar xf whole_chain_little_train.tar
     ln -s whole_chain_little_train ILSVRC2012
-    cd ILSVRC2012 
+    cd ILSVRC2012
     mv train.txt train_list.txt
     mv val.txt val_list.txt
     cp -r train/* val/
@@ -132,7 +132,7 @@ elif [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ];then
     wget -nc https://paddle-imagenet-models-name.bj.bcebos.com/data/whole_chain/whole_chain_infer.tar
     tar xf whole_chain_infer.tar
     ln -s whole_chain_infer ILSVRC2012
-    cd ILSVRC2012 
+    cd ILSVRC2012
     mv val.txt val_list.txt
     ln -s val_list.txt train_list.txt
     cd ../../
@@ -153,7 +153,7 @@ elif [ ${MODE} = "whole_train_whole_infer" ];then
     wget -nc https://paddle-imagenet-models-name.bj.bcebos.com/data/whole_chain/whole_chain_CIFAR100.tar
     tar xf whole_chain_CIFAR100.tar
     ln -s whole_chain_CIFAR100 ILSVRC2012
-    cd ILSVRC2012 
+    cd ILSVRC2012
     mv train.txt train_list.txt
     mv test.txt val_list.txt
     cd ../../