From 88d0d4ca9f17974f7d85146e6fabbd773e980724 Mon Sep 17 00:00:00 2001
From: cuicheng01
Date: Wed, 21 Jul 2021 09:32:58 +0000
Subject: [PATCH 1/3] Update Twins configs

---
 ppcls/arch/backbone/model_zoo/gvt.py          |   8 +-
 .../configs/ImageNet/Twins/alt_gvt_base.yaml  | 132 ++++++++++++++++++
 .../configs/ImageNet/Twins/alt_gvt_large.yaml | 132 ++++++++++++++++++
 .../configs/ImageNet/Twins/alt_gvt_small.yaml | 132 ++++++++++++++++++
 ppcls/configs/ImageNet/Twins/pcpvt_base.yaml  | 132 ++++++++++++++++++
 ppcls/configs/ImageNet/Twins/pcpvt_large.yaml | 132 ++++++++++++++++++
 ppcls/configs/ImageNet/Twins/pcpvt_small.yaml | 132 ++++++++++++++++++
 7 files changed, 796 insertions(+), 4 deletions(-)
 create mode 100644 ppcls/configs/ImageNet/Twins/alt_gvt_base.yaml
 create mode 100644 ppcls/configs/ImageNet/Twins/alt_gvt_large.yaml
 create mode 100644 ppcls/configs/ImageNet/Twins/alt_gvt_small.yaml
 create mode 100644 ppcls/configs/ImageNet/Twins/pcpvt_base.yaml
 create mode 100644 ppcls/configs/ImageNet/Twins/pcpvt_large.yaml
 create mode 100644 ppcls/configs/ImageNet/Twins/pcpvt_small.yaml

diff --git a/ppcls/arch/backbone/model_zoo/gvt.py b/ppcls/arch/backbone/model_zoo/gvt.py
index 8453cc27a..1818540b3 100644
--- a/ppcls/arch/backbone/model_zoo/gvt.py
+++ b/ppcls/arch/backbone/model_zoo/gvt.py
@@ -433,7 +433,7 @@ class CPVTV2(PyramidVisionTransformer):
                  img_size=224,
                  patch_size=4,
                  in_chans=3,
-                 num_classes=1000,
+                 class_num=1000,
                  embed_dims=[64, 128, 256, 512],
                  num_heads=[1, 2, 4, 8],
                  mlp_ratios=[4, 4, 4, 4],
@@ -446,7 +446,7 @@ class CPVTV2(PyramidVisionTransformer):
                  depths=[3, 4, 6, 3],
                  sr_ratios=[8, 4, 2, 1],
                  block_cls=Block):
-        super().__init__(img_size, patch_size, in_chans, num_classes,
+        super().__init__(img_size, patch_size, in_chans, class_num,
                          embed_dims, num_heads, mlp_ratios, qkv_bias, qk_scale,
                          drop_rate, attn_drop_rate, drop_path_rate, norm_layer,
                          depths, sr_ratios, block_cls)
@@ -499,7 +499,7 @@ class PCPVT(CPVTV2):
                  img_size=224,
                  patch_size=4,
                  in_chans=3,
-                 num_classes=1000,
+                 class_num=1000,
                  embed_dims=[64, 128, 256],
                  num_heads=[1, 2, 4],
                  mlp_ratios=[4, 4, 4],
@@ -512,7 +512,7 @@ class PCPVT(CPVTV2):
                  depths=[4, 4, 4],
                  sr_ratios=[4, 2, 1],
                  block_cls=SBlock):
-        super().__init__(img_size, patch_size, in_chans, num_classes,
+        super().__init__(img_size, patch_size, in_chans, class_num,
                          embed_dims, num_heads, mlp_ratios, qkv_bias, qk_scale,
                          drop_rate, attn_drop_rate, drop_path_rate, norm_layer,
                          depths, sr_ratios, block_cls)
diff --git a/ppcls/configs/ImageNet/Twins/alt_gvt_base.yaml b/ppcls/configs/ImageNet/Twins/alt_gvt_base.yaml
new file mode 100644
index 000000000..7c06a3ba1
--- /dev/null
+++ b/ppcls/configs/ImageNet/Twins/alt_gvt_base.yaml
@@ -0,0 +1,132 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+# model architecture
+Arch:
+  name: alt_gvt_base
+  class_num: 1000
+
+# loss function config for train/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Piecewise
+    learning_rate: 0.1
+    decay_epochs: [30, 60, 90]
+    values: [0.1, 0.01, 0.001, 0.0001]
+  regularizer:
+    name: 'L2'
+    coeff: 0.0001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/whl/demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/ppcls/configs/ImageNet/Twins/alt_gvt_large.yaml b/ppcls/configs/ImageNet/Twins/alt_gvt_large.yaml
new file mode 100644
index 000000000..4a56a8ee2
--- /dev/null
+++ b/ppcls/configs/ImageNet/Twins/alt_gvt_large.yaml
@@ -0,0 +1,132 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+# model architecture
+Arch:
+  name: alt_gvt_large
+  class_num: 1000
+
+# loss function config for train/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Piecewise
+    learning_rate: 0.1
+    decay_epochs: [30, 60, 90]
+    values: [0.1, 0.01, 0.001, 0.0001]
+  regularizer:
+    name: 'L2'
+    coeff: 0.0001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/whl/demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/ppcls/configs/ImageNet/Twins/alt_gvt_small.yaml b/ppcls/configs/ImageNet/Twins/alt_gvt_small.yaml
new file mode 100644
index 000000000..78cc263f2
--- /dev/null
+++ b/ppcls/configs/ImageNet/Twins/alt_gvt_small.yaml
@@ -0,0 +1,132 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+# model architecture
+Arch:
+  name: alt_gvt_small
+  class_num: 1000
+
+# loss function config for train/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Piecewise
+    learning_rate: 0.1
+    decay_epochs: [30, 60, 90]
+    values: [0.1, 0.01, 0.001, 0.0001]
+  regularizer:
+    name: 'L2'
+    coeff: 0.0001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/whl/demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/ppcls/configs/ImageNet/Twins/pcpvt_base.yaml b/ppcls/configs/ImageNet/Twins/pcpvt_base.yaml
new file mode 100644
index 000000000..100e87a9f
--- /dev/null
+++ b/ppcls/configs/ImageNet/Twins/pcpvt_base.yaml
@@ -0,0 +1,132 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+# model architecture
+Arch:
+  name: pcpvt_base
+  class_num: 1000
+
+# loss function config for train/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Piecewise
+    learning_rate: 0.1
+    decay_epochs: [30, 60, 90]
+    values: [0.1, 0.01, 0.001, 0.0001]
+  regularizer:
+    name: 'L2'
+    coeff: 0.0001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/whl/demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/ppcls/configs/ImageNet/Twins/pcpvt_large.yaml b/ppcls/configs/ImageNet/Twins/pcpvt_large.yaml
new file mode 100644
index 000000000..ad7b4df54
--- /dev/null
+++ b/ppcls/configs/ImageNet/Twins/pcpvt_large.yaml
@@ -0,0 +1,132 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+# model architecture
+Arch:
+  name: pcpvt_large
+  class_num: 1000
+
+# loss function config for train/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Piecewise
+    learning_rate: 0.1
+    decay_epochs: [30, 60, 90]
+    values: [0.1, 0.01, 0.001, 0.0001]
+  regularizer:
+    name: 'L2'
+    coeff: 0.0001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/whl/demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/ppcls/configs/ImageNet/Twins/pcpvt_small.yaml b/ppcls/configs/ImageNet/Twins/pcpvt_small.yaml
new file mode 100644
index 000000000..dff588cc0
--- /dev/null
+++ b/ppcls/configs/ImageNet/Twins/pcpvt_small.yaml
@@ -0,0 +1,132 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+# model architecture
+Arch:
+  name: pcpvt_small
+  class_num: 1000
+
+# loss function config for train/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Piecewise
+    learning_rate: 0.1
+    decay_epochs: [30, 60, 90]
+    values: [0.1, 0.01, 0.001, 0.0001]
+  regularizer:
+    name: 'L2'
+    coeff: 0.0001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/whl/demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]

From e07674603ebe3bee505afa8737d6df2df0459345 Mon Sep 17 00:00:00 2001
From: Tingquan Gao <35441050@qq.com>
Date: Wed, 21 Jul 2021 20:50:04 +0800
Subject: [PATCH 2/3] Update gvt.py

---
 ppcls/arch/backbone/model_zoo/gvt.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/ppcls/arch/backbone/model_zoo/gvt.py b/ppcls/arch/backbone/model_zoo/gvt.py
index 1818540b3..c3171228d 100644
--- a/ppcls/arch/backbone/model_zoo/gvt.py
+++ b/ppcls/arch/backbone/model_zoo/gvt.py
@@ -78,9 +78,9 @@ class GroupAttention(nn.Layer):
         total_groups = h_group * w_group
         x = x.reshape([B, h_group, self.ws, w_group, self.ws, C]).transpose(
             [0, 1, 3, 2, 4, 5])
-        qkv = self.qkv(x).reshape(
-            [B, total_groups, -1, 3, self.num_heads,
-             C // self.num_heads]).transpose([3, 0, 1, 4, 2, 5])
+        qkv = self.qkv(x).reshape([
+            B, total_groups, self.ws**2, 3, self.num_heads, C // self.num_heads
+        ]).transpose([3, 0, 1, 4, 2, 5])
         q, k, v = qkv[0], qkv[1], qkv[2]

         attn = (q @k.transpose([0, 1, 2, 4, 3])) * self.scale
@@ -135,14 +135,15 @@ class Attention(nn.Layer):

         if self.sr_ratio > 1:
             x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W])
-            x_ = self.sr(x_).reshape([B, C, -1]).transpose([0, 2, 1])
+            tmp_n = H * W // self.sr_ratio**2
+            x_ = self.sr(x_).reshape([B, C, tmp_n]).transpose([0, 2, 1])
             x_ = self.norm(x_)
             kv = self.kv(x_).reshape(
-                [B, -1, 2, self.num_heads, C // self.num_heads]).transpose(
+                [B, tmp_n, 2, self.num_heads, C // self.num_heads]).transpose(
                     [2, 0, 3, 1, 4])
         else:
             kv = self.kv(x).reshape(
-                [B, -1, 2, self.num_heads, C // self.num_heads]).transpose(
+                [B, N, 2, self.num_heads, C // self.num_heads]).transpose(
                     [2, 0, 3, 1, 4])
         k, v = kv[0], kv[1]

@@ -317,7 +318,6 @@ class PyramidVisionTransformer(nn.Layer):
                 self.create_parameter(
                     shape=[1, patch_num, embed_dims[i]],
                     default_initializer=zeros_))
-            self.add_parameter(f"pos_embeds_{i}", self.pos_embeds[i])
             self.pos_drops.append(nn.Dropout(p=drop_rate))

         dpr = [
@@ -433,7 +433,7 @@ class CPVTV2(PyramidVisionTransformer):
                  img_size=224,
                  patch_size=4,
                  in_chans=3,
-                 class_num=1000,
+                 num_classes=1000,
                  embed_dims=[64, 128, 256, 512],
                  num_heads=[1, 2, 4, 8],
                  mlp_ratios=[4, 4, 4, 4],
@@ -446,7 +446,7 @@ class CPVTV2(PyramidVisionTransformer):
                  depths=[3, 4, 6, 3],
                  sr_ratios=[8, 4, 2, 1],
                  block_cls=Block):
-        super().__init__(img_size, patch_size, in_chans, class_num,
+        super().__init__(img_size, patch_size, in_chans, num_classes,
                          embed_dims, num_heads, mlp_ratios, qkv_bias, qk_scale,
                          drop_rate, attn_drop_rate, drop_path_rate, norm_layer,
                          depths, sr_ratios, block_cls)
@@ -488,7 +488,7 @@ class CPVTV2(PyramidVisionTransformer):
             x = self.pos_block[i](x, H, W)  # PEG here

             if i < len(self.depths) - 1:
-                x = x.reshape([B, H, W, -1]).transpose([0, 3, 1, 2])
+                x = x.reshape([B, H, W, x.shape[-1]]).transpose([0, 3, 1, 2])

         x = self.norm(x)
         return x.mean(axis=1)  # GAP here
@@ -499,7 +499,7 @@ class PCPVT(CPVTV2):
                  img_size=224,
                  patch_size=4,
                  in_chans=3,
-                 class_num=1000,
+                 num_classes=1000,
                  embed_dims=[64, 128, 256],
                  num_heads=[1, 2, 4],
                  mlp_ratios=[4, 4, 4],
@@ -512,7 +512,7 @@ class PCPVT(CPVTV2):
                  depths=[4, 4, 4],
                  sr_ratios=[4, 2, 1],
                  block_cls=SBlock):
-        super().__init__(img_size, patch_size, in_chans, class_num,
+        super().__init__(img_size, patch_size, in_chans, num_classes,
                          embed_dims, num_heads, mlp_ratios, qkv_bias, qk_scale,
                          drop_rate, attn_drop_rate, drop_path_rate, norm_layer,
                          depths, sr_ratios, block_cls)

From 3a8715240c5754d3f33828147102f04f789e92b5 Mon Sep 17 00:00:00 2001
From: cuicheng01
Date: Wed, 21 Jul 2021 13:16:23 +0000
Subject: [PATCH 3/3] Update gvt.py

---
 ppcls/arch/backbone/model_zoo/gvt.py | 38 ++++++++++++++--------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/ppcls/arch/backbone/model_zoo/gvt.py b/ppcls/arch/backbone/model_zoo/gvt.py
index c3171228d..eb5643b63 100644
--- a/ppcls/arch/backbone/model_zoo/gvt.py
+++ b/ppcls/arch/backbone/model_zoo/gvt.py
@@ -56,10 +56,10 @@ class GroupAttention(nn.Layer):
                  ws=1):
         super().__init__()
         if ws == 1:
-            raise Exception(f"ws {ws} should not be 1")
+            raise Exception("ws {ws} should not be 1".format(ws=ws))
         if dim % num_heads != 0:
             raise Exception(
-                f"dim {dim} should be divided by num_heads {num_heads}.")
+                "dim {dim} should be divided by num_heads {num_heads}.".format(dim=dim, num_heads=num_heads))

         self.dim = dim
         self.num_heads = num_heads
@@ -82,11 +82,11 @@ class GroupAttention(nn.Layer):
             B, total_groups, self.ws**2, 3, self.num_heads, C // self.num_heads
         ]).transpose([3, 0, 1, 4, 2, 5])
         q, k, v = qkv[0], qkv[1], qkv[2]
-        attn = (q @k.transpose([0, 1, 2, 4, 3])) * self.scale
+        attn = (q @ k.transpose([0, 1, 2, 4, 3])) * self.scale
         attn = nn.Softmax(axis=-1)(attn)
         attn = self.attn_drop(attn)
-        attn = (attn @v).transpose([0, 1, 3, 2, 4]).reshape(
+        attn = (attn @ v).transpose([0, 1, 3, 2, 4]).reshape(
             [B, h_group, w_group, self.ws, self.ws, C])
         x = attn.transpose([0, 1, 3, 2, 4, 5]).reshape([B, N, C])

@@ -147,11 +147,11 @@ class Attention(nn.Layer):
                     [2, 0, 3, 1, 4])
         k, v = kv[0], kv[1]

-        attn = (q @k.transpose([0, 1, 3, 2])) * self.scale
+        attn = (q @ k.transpose([0, 1, 3, 2])) * self.scale
         attn = nn.Softmax(axis=-1)(attn)
         attn = self.attn_drop(attn)

-        x = (attn @v).transpose([0, 2, 1, 3]).reshape([B, N, C])
+        x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B, N, C])
         x = self.proj(x)
         x = self.proj_drop(x)
         return x
@@ -281,7 +281,7 @@ class PyramidVisionTransformer(nn.Layer):
                  img_size=224,
                  patch_size=16,
                  in_chans=3,
-                 num_classes=1000,
+                 class_num=1000,
                  embed_dims=[64, 128, 256, 512],
                  num_heads=[1, 2, 4, 8],
                  mlp_ratios=[4, 4, 4, 4],
@@ -295,7 +295,7 @@ class PyramidVisionTransformer(nn.Layer):
                  sr_ratios=[8, 4, 2, 1],
                  block_cls=Block):
         super().__init__()
-        self.num_classes = num_classes
+        self.class_num = class_num
         self.depths = depths

         # patch_embed
@@ -354,7 +354,7 @@ class PyramidVisionTransformer(nn.Layer):

         # classification head
         self.head = nn.Linear(embed_dims[-1],
-                              num_classes) if num_classes > 0 else Identity()
+                              class_num) if class_num > 0 else Identity()

         # init weights
         for pos_emb in self.pos_embeds:
@@ -433,7 +433,7 @@ class CPVTV2(PyramidVisionTransformer):
                  img_size=224,
                  patch_size=4,
                  in_chans=3,
-                 num_classes=1000,
+                 class_num=1000,
                  embed_dims=[64, 128, 256, 512],
                  num_heads=[1, 2, 4, 8],
                  mlp_ratios=[4, 4, 4, 4],
@@ -446,10 +446,10 @@ class CPVTV2(PyramidVisionTransformer):
                  depths=[3, 4, 6, 3],
                  sr_ratios=[8, 4, 2, 1],
                  block_cls=Block):
-        super().__init__(img_size, patch_size, in_chans, num_classes,
-                         embed_dims, num_heads, mlp_ratios, qkv_bias, qk_scale,
-                         drop_rate, attn_drop_rate, drop_path_rate, norm_layer,
-                         depths, sr_ratios, block_cls)
+        super().__init__(img_size, patch_size, in_chans, class_num, embed_dims,
+                         num_heads, mlp_ratios, qkv_bias, qk_scale, drop_rate,
+                         attn_drop_rate, drop_path_rate, norm_layer, depths,
+                         sr_ratios, block_cls)
         del self.pos_embeds
         del self.cls_token
         self.pos_block = nn.LayerList(
@@ -499,7 +499,7 @@ class PCPVT(CPVTV2):
                  img_size=224,
                  patch_size=4,
                  in_chans=3,
-                 num_classes=1000,
+                 class_num=1000,
                  embed_dims=[64, 128, 256],
                  num_heads=[1, 2, 4],
                  mlp_ratios=[4, 4, 4],
@@ -512,10 +512,10 @@ class PCPVT(CPVTV2):
                  depths=[4, 4, 4],
                  sr_ratios=[4, 2, 1],
                  block_cls=SBlock):
-        super().__init__(img_size, patch_size, in_chans, num_classes,
-                         embed_dims, num_heads, mlp_ratios, qkv_bias, qk_scale,
-                         drop_rate, attn_drop_rate, drop_path_rate, norm_layer,
-                         depths, sr_ratios, block_cls)
+        super().__init__(img_size, patch_size, in_chans, class_num, embed_dims,
+                         num_heads, mlp_ratios, qkv_bias, qk_scale, drop_rate,
+                         attn_drop_rate, drop_path_rate, norm_layer, depths,
+                         sr_ratios, block_cls)


 class ALTGVT(PCPVT):
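
Taken together, the three patches leave the Twins backbones in gvt.py accepting a `class_num` argument (matching the `Arch.class_num` field of the six new configs) rather than `num_classes`. A minimal smoke test for that end state follows; it is a sketch only, assuming a working PaddlePaddle install and a PaddleClas checkout on PYTHONPATH, and uses CPVTV2 with the defaults visible in the hunks above:

    import paddle

    # Sketch: CPVTV2 and its module path are taken from the diffs above;
    # the surrounding setup is assumed, not part of this series.
    from ppcls.arch.backbone.model_zoo.gvt import CPVTV2

    model = CPVTV2(class_num=1000)      # renamed keyword; was num_classes
    model.eval()

    x = paddle.randn([1, 3, 224, 224])  # matches Global.image_shape
    with paddle.no_grad():
        logits = model(x)
    print(logits.shape)                 # expected: [1, 1000]

Training with one of the new configs should then go through the usual PaddleClas entry point, e.g. `python3 tools/train.py -c ppcls/configs/ImageNet/Twins/pcpvt_small.yaml`.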