From 88d0d4ca9f17974f7d85146e6fabbd773e980724 Mon Sep 17 00:00:00 2001
From: cuicheng01
Date: Wed, 21 Jul 2021 09:32:58 +0000
Subject: [PATCH 1/3] Update Twins configs

---
 ppcls/arch/backbone/model_zoo/gvt.py          |   8 +-
 .../configs/ImageNet/Twins/alt_gvt_base.yaml  | 132 ++++++++++++++++++
 .../configs/ImageNet/Twins/alt_gvt_large.yaml | 132 ++++++++++++++++++
 .../configs/ImageNet/Twins/alt_gvt_small.yaml | 132 ++++++++++++++++++
 ppcls/configs/ImageNet/Twins/pcpvt_base.yaml  | 132 ++++++++++++++++++
 ppcls/configs/ImageNet/Twins/pcpvt_large.yaml | 132 ++++++++++++++++++
 ppcls/configs/ImageNet/Twins/pcpvt_small.yaml | 132 ++++++++++++++++++
 7 files changed, 796 insertions(+), 4 deletions(-)
 create mode 100644 ppcls/configs/ImageNet/Twins/alt_gvt_base.yaml
 create mode 100644 ppcls/configs/ImageNet/Twins/alt_gvt_large.yaml
 create mode 100644 ppcls/configs/ImageNet/Twins/alt_gvt_small.yaml
 create mode 100644 ppcls/configs/ImageNet/Twins/pcpvt_base.yaml
 create mode 100644 ppcls/configs/ImageNet/Twins/pcpvt_large.yaml
 create mode 100644 ppcls/configs/ImageNet/Twins/pcpvt_small.yaml

diff --git a/ppcls/arch/backbone/model_zoo/gvt.py b/ppcls/arch/backbone/model_zoo/gvt.py
index 8453cc27a..1818540b3 100644
--- a/ppcls/arch/backbone/model_zoo/gvt.py
+++ b/ppcls/arch/backbone/model_zoo/gvt.py
@@ -433,7 +433,7 @@ class CPVTV2(PyramidVisionTransformer):
                  img_size=224,
                  patch_size=4,
                  in_chans=3,
-                 num_classes=1000,
+                 class_num=1000,
                  embed_dims=[64, 128, 256, 512],
                  num_heads=[1, 2, 4, 8],
                  mlp_ratios=[4, 4, 4, 4],
@@ -446,7 +446,7 @@ class CPVTV2(PyramidVisionTransformer):
                  depths=[3, 4, 6, 3],
                  sr_ratios=[8, 4, 2, 1],
                  block_cls=Block):
-        super().__init__(img_size, patch_size, in_chans, num_classes,
+        super().__init__(img_size, patch_size, in_chans, class_num,
                          embed_dims, num_heads, mlp_ratios, qkv_bias, qk_scale,
                          drop_rate, attn_drop_rate, drop_path_rate, norm_layer,
                          depths, sr_ratios, block_cls)
@@ -499,7 +499,7 @@ class PCPVT(CPVTV2):
                  img_size=224,
                  patch_size=4,
                  in_chans=3,
-                 num_classes=1000,
+                 class_num=1000,
                  embed_dims=[64, 128, 256],
                  num_heads=[1, 2, 4],
                  mlp_ratios=[4, 4, 4],
@@ -512,7 +512,7 @@ class PCPVT(CPVTV2):
                  depths=[4, 4, 4],
                  sr_ratios=[4, 2, 1],
                  block_cls=SBlock):
-        super().__init__(img_size, patch_size, in_chans, num_classes,
+        super().__init__(img_size, patch_size, in_chans, class_num,
                          embed_dims, num_heads, mlp_ratios, qkv_bias, qk_scale,
                          drop_rate, attn_drop_rate, drop_path_rate, norm_layer,
                          depths, sr_ratios, block_cls)
diff --git a/ppcls/configs/ImageNet/Twins/alt_gvt_base.yaml b/ppcls/configs/ImageNet/Twins/alt_gvt_base.yaml
new file mode 100644
index 000000000..7c06a3ba1
--- /dev/null
+++ b/ppcls/configs/ImageNet/Twins/alt_gvt_base.yaml
@@ -0,0 +1,132 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+# model architecture
+Arch:
+  name: alt_gvt_base
+  class_num: 1000
+
+# loss function config for train/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Piecewise
+    learning_rate: 0.1
+    decay_epochs: [30, 60, 90]
+    values: [0.1, 0.01, 0.001, 0.0001]
+  regularizer:
+    name: 'L2'
+    coeff: 0.0001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/whl/demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/ppcls/configs/ImageNet/Twins/alt_gvt_large.yaml b/ppcls/configs/ImageNet/Twins/alt_gvt_large.yaml
new file mode 100644
index 000000000..4a56a8ee2
--- /dev/null
+++ b/ppcls/configs/ImageNet/Twins/alt_gvt_large.yaml
@@ -0,0 +1,132 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+# model architecture
+Arch:
+  name: alt_gvt_large
+  class_num: 1000
+
+# loss function config for train/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Piecewise
+    learning_rate: 0.1
+    decay_epochs: [30, 60, 90]
+    values: [0.1, 0.01, 0.001, 0.0001]
+  regularizer:
+    name: 'L2'
+    coeff: 0.0001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/whl/demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/ppcls/configs/ImageNet/Twins/alt_gvt_small.yaml b/ppcls/configs/ImageNet/Twins/alt_gvt_small.yaml
new file mode 100644
index 000000000..78cc263f2
--- /dev/null
+++ b/ppcls/configs/ImageNet/Twins/alt_gvt_small.yaml
@@ -0,0 +1,132 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+# model architecture
+Arch:
+  name: alt_gvt_small
+  class_num: 1000
+
+# loss function config for train/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Piecewise
+    learning_rate: 0.1
+    decay_epochs: [30, 60, 90]
+    values: [0.1, 0.01, 0.001, 0.0001]
+  regularizer:
+    name: 'L2'
+    coeff: 0.0001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/whl/demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/ppcls/configs/ImageNet/Twins/pcpvt_base.yaml b/ppcls/configs/ImageNet/Twins/pcpvt_base.yaml
new file mode 100644
index 000000000..100e87a9f
--- /dev/null
+++ b/ppcls/configs/ImageNet/Twins/pcpvt_base.yaml
@@ -0,0 +1,132 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+# model architecture
+Arch:
+  name: pcpvt_base
+  class_num: 1000
+
+# loss function config for train/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Piecewise
+    learning_rate: 0.1
+    decay_epochs: [30, 60, 90]
+    values: [0.1, 0.01, 0.001, 0.0001]
+  regularizer:
+    name: 'L2'
+    coeff: 0.0001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/whl/demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/ppcls/configs/ImageNet/Twins/pcpvt_large.yaml b/ppcls/configs/ImageNet/Twins/pcpvt_large.yaml
new file mode 100644
index 000000000..ad7b4df54
--- /dev/null
+++ b/ppcls/configs/ImageNet/Twins/pcpvt_large.yaml
@@ -0,0 +1,132 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+# model architecture
+Arch:
+  name: pcpvt_large
+  class_num: 1000
+
+# loss function config for train/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Piecewise
+    learning_rate: 0.1
+    decay_epochs: [30, 60, 90]
+    values: [0.1, 0.01, 0.001, 0.0001]
+  regularizer:
+    name: 'L2'
+    coeff: 0.0001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/whl/demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/ppcls/configs/ImageNet/Twins/pcpvt_small.yaml b/ppcls/configs/ImageNet/Twins/pcpvt_small.yaml
new file mode 100644
index 000000000..dff588cc0
--- /dev/null
+++ b/ppcls/configs/ImageNet/Twins/pcpvt_small.yaml
@@ -0,0 +1,132 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+# model architecture
+Arch:
+  name: pcpvt_small
+  class_num: 1000
+
+# loss function config for train/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Piecewise
+    learning_rate: 0.1
+    decay_epochs: [30, 60, 90]
+    values: [0.1, 0.01, 0.001, 0.0001]
+  regularizer:
+    name: 'L2'
+    coeff: 0.0001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/whl/demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]

From e07674603ebe3bee505afa8737d6df2df0459345 Mon Sep 17 00:00:00 2001
From: Tingquan Gao <35441050@qq.com>
Date: Wed, 21 Jul 2021 20:50:04 +0800
Subject: [PATCH 2/3] Update gvt.py

---
 ppcls/arch/backbone/model_zoo/gvt.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/ppcls/arch/backbone/model_zoo/gvt.py b/ppcls/arch/backbone/model_zoo/gvt.py
index 1818540b3..c3171228d 100644
--- a/ppcls/arch/backbone/model_zoo/gvt.py
+++ b/ppcls/arch/backbone/model_zoo/gvt.py
@@ -78,9 +78,9 @@ class GroupAttention(nn.Layer):
         total_groups = h_group * w_group
         x = x.reshape([B, h_group, self.ws, w_group, self.ws, C]).transpose(
             [0, 1, 3, 2, 4, 5])
-        qkv = self.qkv(x).reshape(
-            [B, total_groups, -1, 3, self.num_heads,
-             C // self.num_heads]).transpose([3, 0, 1, 4, 2, 5])
+        qkv = self.qkv(x).reshape([
+            B, total_groups, self.ws**2, 3, self.num_heads, C // self.num_heads
+        ]).transpose([3, 0, 1, 4, 2, 5])
         q, k, v = qkv[0], qkv[1], qkv[2]

         attn = (q @k.transpose([0, 1, 2, 4, 3])) * self.scale
@@ -135,14 +135,15 @@ class Attention(nn.Layer):

         if self.sr_ratio > 1:
             x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W])
-            x_ = self.sr(x_).reshape([B, C, -1]).transpose([0, 2, 1])
+            tmp_n = H * W // self.sr_ratio**2
+            x_ = self.sr(x_).reshape([B, C, tmp_n]).transpose([0, 2, 1])
             x_ = self.norm(x_)
             kv = self.kv(x_).reshape(
-                [B, -1, 2, self.num_heads, C // self.num_heads]).transpose(
+                [B, tmp_n, 2, self.num_heads, C // self.num_heads]).transpose(
                     [2, 0, 3, 1, 4])
         else:
             kv = self.kv(x).reshape(
-                [B, -1, 2, self.num_heads, C // self.num_heads]).transpose(
+                [B, N, 2, self.num_heads, C // self.num_heads]).transpose(
                     [2, 0, 3, 1, 4])
         k, v = kv[0], kv[1]

@@ -317,7 +318,6 @@ class PyramidVisionTransformer(nn.Layer):
                 self.create_parameter(
                     shape=[1, patch_num, embed_dims[i]],
                     default_initializer=zeros_))
-            self.add_parameter(f"pos_embeds_{i}", self.pos_embeds[i])
             self.pos_drops.append(nn.Dropout(p=drop_rate))

         dpr = [
@@ -433,7 +433,7 @@ class CPVTV2(PyramidVisionTransformer):
                  img_size=224,
                  patch_size=4,
                  in_chans=3,
-                 class_num=1000,
+                 num_classes=1000,
                  embed_dims=[64, 128, 256, 512],
                  num_heads=[1, 2, 4, 8],
                  mlp_ratios=[4, 4, 4, 4],
@@ -446,7 +446,7 @@ class CPVTV2(PyramidVisionTransformer):
                  depths=[3, 4, 6, 3],
                  sr_ratios=[8, 4, 2, 1],
                  block_cls=Block):
-        super().__init__(img_size, patch_size, in_chans, class_num,
+        super().__init__(img_size, patch_size, in_chans, num_classes,
                          embed_dims, num_heads, mlp_ratios, qkv_bias, qk_scale,
                          drop_rate, attn_drop_rate, drop_path_rate, norm_layer,
                          depths, sr_ratios, block_cls)
@@ -488,7 +488,7 @@ class CPVTV2(PyramidVisionTransformer):
             x = self.pos_block[i](x, H, W)  # PEG here

             if i < len(self.depths) - 1:
-                x = x.reshape([B, H, W, -1]).transpose([0, 3, 1, 2])
+                x = x.reshape([B, H, W, x.shape[-1]]).transpose([0, 3, 1, 2])

         x = self.norm(x)
         return x.mean(axis=1)  # GAP here
@@ -499,7 +499,7 @@ class PCPVT(CPVTV2):
                  img_size=224,
                  patch_size=4,
                  in_chans=3,
-                 class_num=1000,
+                 num_classes=1000,
                  embed_dims=[64, 128, 256],
                  num_heads=[1, 2, 4],
                  mlp_ratios=[4, 4, 4],
@@ -512,7 +512,7 @@ class PCPVT(CPVTV2):
                  depths=[4, 4, 4],
                  sr_ratios=[4, 2, 1],
                  block_cls=SBlock):
-        super().__init__(img_size, patch_size, in_chans, class_num,
+        super().__init__(img_size, patch_size, in_chans, num_classes,
                          embed_dims, num_heads, mlp_ratios, qkv_bias, qk_scale,
                          drop_rate, attn_drop_rate, drop_path_rate, norm_layer,
                          depths, sr_ratios, block_cls)

From 3a8715240c5754d3f33828147102f04f789e92b5 Mon Sep 17 00:00:00 2001
From: cuicheng01
Date: Wed, 21 Jul 2021 13:16:23 +0000
Subject: [PATCH 3/3] Update gvt.py

---
 ppcls/arch/backbone/model_zoo/gvt.py | 38 ++++++++++++++--------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/ppcls/arch/backbone/model_zoo/gvt.py b/ppcls/arch/backbone/model_zoo/gvt.py
index c3171228d..eb5643b63 100644
--- a/ppcls/arch/backbone/model_zoo/gvt.py
+++ b/ppcls/arch/backbone/model_zoo/gvt.py
@@ -56,10 +56,10 @@ class GroupAttention(nn.Layer):
                  ws=1):
         super().__init__()
         if ws == 1:
-            raise Exception(f"ws {ws} should not be 1")
+            raise Exception("ws {ws} should not be 1".format(ws=ws))
         if dim % num_heads != 0:
             raise Exception(
-                f"dim {dim} should be divided by num_heads {num_heads}.")
+                "dim {dim} should be divided by num_heads {num_heads}.".format(dim=dim, num_heads=num_heads))

         self.dim = dim
         self.num_heads = num_heads
@@ -82,11 +82,11 @@ class GroupAttention(nn.Layer):
             B, total_groups, self.ws**2, 3, self.num_heads, C // self.num_heads
         ]).transpose([3, 0, 1, 4, 2, 5])
         q, k, v = qkv[0], qkv[1], qkv[2]
-        attn = (q @k.transpose([0, 1, 2, 4, 3])) * self.scale
+        attn = (q @ k.transpose([0, 1, 2, 4, 3])) * self.scale
         attn = nn.Softmax(axis=-1)(attn)
         attn = self.attn_drop(attn)
-        attn = (attn @v).transpose([0, 1, 3, 2, 4]).reshape(
+        attn = (attn @ v).transpose([0, 1, 3, 2, 4]).reshape(
             [B, h_group, w_group, self.ws, self.ws, C])
         x = attn.transpose([0, 1, 3, 2, 4, 5]).reshape([B, N, C])

@@ -147,11 +147,11 @@ class Attention(nn.Layer):
                     [2, 0, 3, 1, 4])
         k, v = kv[0], kv[1]

-        attn = (q @k.transpose([0, 1, 3, 2])) * self.scale
+        attn = (q @ k.transpose([0, 1, 3, 2])) * self.scale
         attn = nn.Softmax(axis=-1)(attn)
         attn = self.attn_drop(attn)

-        x = (attn @v).transpose([0, 2, 1, 3]).reshape([B, N, C])
+        x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B, N, C])
         x = self.proj(x)
         x = self.proj_drop(x)
         return x
@@ -281,7 +281,7 @@ class PyramidVisionTransformer(nn.Layer):
                  img_size=224,
                  patch_size=16,
                  in_chans=3,
-                 num_classes=1000,
+                 class_num=1000,
                  embed_dims=[64, 128, 256, 512],
                  num_heads=[1, 2, 4, 8],
                  mlp_ratios=[4, 4, 4, 4],
@@ -295,7 +295,7 @@ class PyramidVisionTransformer(nn.Layer):
                  sr_ratios=[8, 4, 2, 1],
                  block_cls=Block):
         super().__init__()
-        self.num_classes = num_classes
+        self.class_num = class_num
         self.depths = depths

         # patch_embed
@@ -354,7 +354,7 @@ class PyramidVisionTransformer(nn.Layer):

         # classification head
         self.head = nn.Linear(embed_dims[-1],
-                              num_classes) if num_classes > 0 else Identity()
+                              class_num) if class_num > 0 else Identity()

         # init weights
         for pos_emb in self.pos_embeds:
@@ -433,7 +433,7 @@ class CPVTV2(PyramidVisionTransformer):
                  img_size=224,
                  patch_size=4,
                  in_chans=3,
-                 num_classes=1000,
+                 class_num=1000,
                  embed_dims=[64, 128, 256, 512],
                  num_heads=[1, 2, 4, 8],
                  mlp_ratios=[4, 4, 4, 4],
@@ -446,10 +446,10 @@ class CPVTV2(PyramidVisionTransformer):
                  depths=[3, 4, 6, 3],
                  sr_ratios=[8, 4, 2, 1],
                  block_cls=Block):
-        super().__init__(img_size, patch_size, in_chans, num_classes,
-                         embed_dims, num_heads, mlp_ratios, qkv_bias, qk_scale,
-                         drop_rate, attn_drop_rate, drop_path_rate, norm_layer,
-                         depths, sr_ratios, block_cls)
+        super().__init__(img_size, patch_size, in_chans, class_num, embed_dims,
+                         num_heads, mlp_ratios, qkv_bias, qk_scale, drop_rate,
+                         attn_drop_rate, drop_path_rate, norm_layer, depths,
+                         sr_ratios, block_cls)
         del self.pos_embeds
         del self.cls_token
         self.pos_block = nn.LayerList(
@@ -499,7 +499,7 @@ class PCPVT(CPVTV2):
                  img_size=224,
                  patch_size=4,
                  in_chans=3,
-                 num_classes=1000,
+                 class_num=1000,
                  embed_dims=[64, 128, 256],
                  num_heads=[1, 2, 4],
                  mlp_ratios=[4, 4, 4],
@@ -512,10 +512,10 @@ class PCPVT(CPVTV2):
                  depths=[4, 4, 4],
                  sr_ratios=[4, 2, 1],
                  block_cls=SBlock):
-        super().__init__(img_size, patch_size, in_chans, num_classes,
-                         embed_dims, num_heads, mlp_ratios, qkv_bias, qk_scale,
-                         drop_rate, attn_drop_rate, drop_path_rate, norm_layer,
-                         depths, sr_ratios, block_cls)
+        super().__init__(img_size, patch_size, in_chans, class_num, embed_dims,
+                         num_heads, mlp_ratios, qkv_bias, qk_scale, drop_rate,
+                         attn_drop_rate, drop_path_rate, norm_layer, depths,
+                         sr_ratios, block_cls)


 class ALTGVT(PCPVT):
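
Taken together, the three patches leave the Twins backbones in gvt.py accepting a `class_num` argument (matching the `Arch.class_num` field of the six new configs) rather than `num_classes`. A minimal smoke test for that end state follows; it is a sketch only, assuming a working PaddlePaddle install and a PaddleClas checkout on PYTHONPATH, and uses CPVTV2 with the defaults visible in the hunks above:

    import paddle

    # Sketch: CPVTV2 and its module path are taken from the diffs above;
    # the surrounding setup is assumed, not part of this series.
    from ppcls.arch.backbone.model_zoo.gvt import CPVTV2

    model = CPVTV2(class_num=1000)      # renamed keyword; was num_classes
    model.eval()

    x = paddle.randn([1, 3, 224, 224])  # matches Global.image_shape
    with paddle.no_grad():
        logits = model(x)
    print(logits.shape)                 # expected: [1, 1000]

Training with one of the new configs should then go through the usual PaddleClas entry point, e.g. `python3 tools/train.py -c ppcls/configs/ImageNet/Twins/pcpvt_small.yaml`.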