commit 897760f073
@@ -48,7 +48,7 @@ from ppcls.arch.backbone.model_zoo.resnext101_wsl import ResNeXt101_32x8d_wsl, R
 from ppcls.arch.backbone.model_zoo.squeezenet import SqueezeNet1_0, SqueezeNet1_1
 from ppcls.arch.backbone.model_zoo.darknet import DarkNet53
 from ppcls.arch.backbone.model_zoo.regnet import RegNetX_200MF, RegNetX_4GF, RegNetX_32GF, RegNetY_200MF, RegNetY_4GF, RegNetY_32GF
-from ppcls.arch.backbone.model_zoo.vision_transformer import ViT_small_patch16_224, ViT_base_patch16_224, ViT_base_patch16_384, ViT_base_patch32_384, ViT_large_patch16_224, ViT_large_patch16_384, ViT_large_patch32_384, ViT_huge_patch16_224, ViT_huge_patch32_384
+from ppcls.arch.backbone.model_zoo.vision_transformer import ViT_small_patch16_224, ViT_base_patch16_224, ViT_base_patch16_384, ViT_base_patch32_384, ViT_large_patch16_224, ViT_large_patch16_384, ViT_large_patch32_384
 from ppcls.arch.backbone.model_zoo.distilled_vision_transformer import DeiT_tiny_patch16_224, DeiT_small_patch16_224, DeiT_base_patch16_224, DeiT_tiny_distilled_patch16_224, DeiT_small_distilled_patch16_224, DeiT_base_distilled_patch16_224, DeiT_base_patch16_384, DeiT_base_distilled_patch16_384
 from ppcls.arch.backbone.model_zoo.swin_transformer import SwinTransformer_tiny_patch4_window7_224, SwinTransformer_small_patch4_window7_224, SwinTransformer_base_patch4_window7_224, SwinTransformer_base_patch4_window12_384, SwinTransformer_large_patch4_window7_224, SwinTransformer_large_patch4_window12_384
 from ppcls.arch.backbone.model_zoo.mixnet import MixNet_S, MixNet_M, MixNet_L
@@ -38,10 +38,6 @@ MODEL_URLS = {
     "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_large_patch16_384_pretrained.pdparams",
     "ViT_large_patch32_384":
     "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_large_patch32_384_pretrained.pdparams",
-    "ViT_huge_patch16_224":
-    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_huge_patch16_224_pretrained.pdparams",
-    "ViT_huge_patch32_384":
-    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_huge_patch32_384_pretrained.pdparams"
 }
 
 __all__ = list(MODEL_URLS.keys())
@@ -460,36 +456,3 @@ def ViT_large_patch32_384(pretrained=False, use_ssld=False, **kwargs):
         MODEL_URLS["ViT_large_patch32_384"],
         use_ssld=use_ssld)
     return model
-
-
-def ViT_huge_patch16_224(pretrained=False, use_ssld=False, **kwargs):
-    model = VisionTransformer(
-        patch_size=16,
-        embed_dim=1280,
-        depth=32,
-        num_heads=16,
-        mlp_ratio=4,
-        **kwargs)
-    _load_pretrained(
-        pretrained,
-        model,
-        MODEL_URLS["ViT_huge_patch16_224"],
-        use_ssld=use_ssld)
-    return model
-
-
-def ViT_huge_patch32_384(pretrained=False, use_ssld=False, **kwargs):
-    model = VisionTransformer(
-        img_size=384,
-        patch_size=32,
-        embed_dim=1280,
-        depth=32,
-        num_heads=16,
-        mlp_ratio=4,
-        **kwargs)
-    _load_pretrained(
-        pretrained,
-        model,
-        MODEL_URLS["ViT_huge_patch32_384"],
-        use_ssld=use_ssld)
-    return model
@@ -1,130 +0,0 @@
-# global configs
-Global:
-  checkpoints: null
-  pretrained_model: null
-  output_dir: ./output/
-  device: gpu
-  save_interval: 1
-  eval_during_train: True
-  eval_interval: 1
-  epochs: 120
-  print_batch_step: 10
-  use_visualdl: False
-  # used for static mode and model export
-  image_shape: [3, 224, 224]
-  save_inference_dir: ./inference
-
-# model architecture
-Arch:
-  name: ViT_huge_patch16_224
-  class_num: 1000
-
-# loss function config for traing/eval process
-Loss:
-  Train:
-    - CELoss:
-        weight: 1.0
-  Eval:
-    - CELoss:
-        weight: 1.0
-
-
-Optimizer:
-  name: Momentum
-  momentum: 0.9
-  lr:
-    name: Piecewise
-    learning_rate: 0.1
-    decay_epochs: [30, 60, 90]
-    values: [0.1, 0.01, 0.001, 0.0001]
-  regularizer:
-    name: 'L2'
-    coeff: 0.0001
-
-
-# data loader for train and eval
-DataLoader:
-  Train:
-    dataset:
-      name: ImageNetDataset
-      image_root: ./dataset/ILSVRC2012/
-      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
-      transform_ops:
-        - DecodeImage:
-            to_rgb: True
-            channel_first: False
-        - RandCropImage:
-            size: 224
-        - RandFlipImage:
-            flip_code: 1
-        - NormalizeImage:
-            scale: 1.0/255.0
-            mean: [0.5, 0.5, 0.5]
-            std: [0.5, 0.5, 0.5]
-            order: ''
-
-    sampler:
-      name: DistributedBatchSampler
-      batch_size: 64
-      drop_last: False
-      shuffle: True
-    loader:
-      num_workers: 4
-      use_shared_memory: True
-
-  Eval:
-    dataset:
-      name: ImageNetDataset
-      image_root: ./dataset/ILSVRC2012/
-      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
-      transform_ops:
-        - DecodeImage:
-            to_rgb: True
-            channel_first: False
-        - ResizeImage:
-            resize_short: 256
-        - CropImage:
-            size: 224
-        - NormalizeImage:
-            scale: 1.0/255.0
-            mean: [0.5, 0.5, 0.5]
-            std: [0.5, 0.5, 0.5]
-            order: ''
-    sampler:
-      name: DistributedBatchSampler
-      batch_size: 64
-      drop_last: False
-      shuffle: False
-    loader:
-      num_workers: 4
-      use_shared_memory: True
-
-Infer:
-  infer_imgs: docs/images/whl/demo.jpg
-  batch_size: 10
-  transforms:
-    - DecodeImage:
-        to_rgb: True
-        channel_first: False
-    - ResizeImage:
-        resize_short: 256
-    - CropImage:
-        size: 224
-    - NormalizeImage:
-        scale: 1.0/255.0
-        mean: [0.5, 0.5, 0.5]
-        std: [0.5, 0.5, 0.5]
-        order: ''
-    - ToCHWImage:
-  PostProcess:
-    name: Topk
-    topk: 5
-    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
-
-Metric:
-  Train:
-    - TopkAcc:
-        topk: [1, 5]
-  Eval:
-    - TopkAcc:
-        topk: [1, 5]
@@ -1,130 +0,0 @@
-# global configs
-Global:
-  checkpoints: null
-  pretrained_model: null
-  output_dir: ./output/
-  device: gpu
-  save_interval: 1
-  eval_during_train: True
-  eval_interval: 1
-  epochs: 120
-  print_batch_step: 10
-  use_visualdl: False
-  # used for static mode and model export
-  image_shape: [3, 384, 384]
-  save_inference_dir: ./inference
-
-# model architecture
-Arch:
-  name: ViT_huge_patch32_384
-  class_num: 1000
-
-# loss function config for traing/eval process
-Loss:
-  Train:
-    - CELoss:
-        weight: 1.0
-  Eval:
-    - CELoss:
-        weight: 1.0
-
-
-Optimizer:
-  name: Momentum
-  momentum: 0.9
-  lr:
-    name: Piecewise
-    learning_rate: 0.1
-    decay_epochs: [30, 60, 90]
-    values: [0.1, 0.01, 0.001, 0.0001]
-  regularizer:
-    name: 'L2'
-    coeff: 0.0001
-
-
-# data loader for train and eval
-DataLoader:
-  Train:
-    dataset:
-      name: ImageNetDataset
-      image_root: ./dataset/ILSVRC2012/
-      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
-      transform_ops:
-        - DecodeImage:
-            to_rgb: True
-            channel_first: False
-        - RandCropImage:
-            size: 384
-        - RandFlipImage:
-            flip_code: 1
-        - NormalizeImage:
-            scale: 1.0/255.0
-            mean: [0.5, 0.5, 0.5]
-            std: [0.5, 0.5, 0.5]
-            order: ''
-
-    sampler:
-      name: DistributedBatchSampler
-      batch_size: 64
-      drop_last: False
-      shuffle: True
-    loader:
-      num_workers: 4
-      use_shared_memory: True
-
-  Eval:
-    dataset:
-      name: ImageNetDataset
-      image_root: ./dataset/ILSVRC2012/
-      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
-      transform_ops:
-        - DecodeImage:
-            to_rgb: True
-            channel_first: False
-        - ResizeImage:
-            resize_short: 384
-        - CropImage:
-            size: 384
-        - NormalizeImage:
-            scale: 1.0/255.0
-            mean: [0.5, 0.5, 0.5]
-            std: [0.5, 0.5, 0.5]
-            order: ''
-    sampler:
-      name: DistributedBatchSampler
-      batch_size: 64
-      drop_last: False
-      shuffle: False
-    loader:
-      num_workers: 4
-      use_shared_memory: True
-
-Infer:
-  infer_imgs: docs/images/whl/demo.jpg
-  batch_size: 10
-  transforms:
-    - DecodeImage:
-        to_rgb: True
-        channel_first: False
-    - ResizeImage:
-        resize_short: 384
-    - CropImage:
-        size: 384
-    - NormalizeImage:
-        scale: 1.0/255.0
-        mean: [0.5, 0.5, 0.5]
-        std: [0.5, 0.5, 0.5]
-        order: ''
-    - ToCHWImage:
-  PostProcess:
-    name: Topk
-    topk: 5
-    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
-
-Metric:
-  Train:
-    - TopkAcc:
-        topk: [1, 5]
-  Eval:
-    - TopkAcc:
-        topk: [1, 5]
@@ -1,52 +0,0 @@
-===========================train_params===========================
-model_name:ViT_huge_patch16_224
-python:python3.7
-gpu_list:0|0,1
--o Global.device:gpu
--o Global.auto_cast:null
--o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120
--o Global.output_dir:./output/
--o DataLoader.Train.sampler.batch_size:8
--o Global.pretrained_model:null
-train_model_name:latest
-train_infer_img_dir:./dataset/ILSVRC2012/val
-null:null
-##
-trainer:norm_train
-norm_train:tools/train.py -c ppcls/configs/ImageNet/VisionTransformer/ViT_huge_patch16_224.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False
-pact_train:null
-fpgm_train:null
-distill_train:null
-null:null
-null:null
-##
-===========================eval_params===========================
-eval:tools/eval.py -c ppcls/configs/ImageNet/VisionTransformer/ViT_huge_patch16_224.yaml
-null:null
-##
-===========================infer_params==========================
--o Global.save_inference_dir:./inference
--o Global.pretrained_model:
-norm_export:tools/export_model.py -c ppcls/configs/ImageNet/VisionTransformer/ViT_huge_patch16_224.yaml
-quant_export:null
-fpgm_export:null
-distill_export:null
-kl_quant:null
-export2:null
-pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_huge_patch16_224_pretrained.pdparams
-infer_model:../inference/
-infer_export:True
-infer_quant:Fasle
-inference:python/predict_cls.py -c configs/inference_cls.yaml
--o Global.use_gpu:True|False
--o Global.enable_mkldnn:True|False
--o Global.cpu_num_threads:1|6
--o Global.batch_size:1|16
--o Global.use_tensorrt:True|False
--o Global.use_fp16:True|False
--o Global.inference_model_dir:../inference
--o Global.infer_imgs:../dataset/ILSVRC2012/val
--o Global.save_log_path:null
--o Global.benchmark:True
-null:null
-null:null
@@ -1,52 +0,0 @@
-===========================train_params===========================
-model_name:ViT_huge_patch32_384
-python:python3.7
-gpu_list:0|0,1
--o Global.device:gpu
--o Global.auto_cast:null
--o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120
--o Global.output_dir:./output/
--o DataLoader.Train.sampler.batch_size:8
--o Global.pretrained_model:null
-train_model_name:latest
-train_infer_img_dir:./dataset/ILSVRC2012/val
-null:null
-##
-trainer:norm_train
-norm_train:tools/train.py -c ppcls/configs/ImageNet/VisionTransformer/ViT_huge_patch32_384.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False
-pact_train:null
-fpgm_train:null
-distill_train:null
-null:null
-null:null
-##
-===========================eval_params===========================
-eval:tools/eval.py -c ppcls/configs/ImageNet/VisionTransformer/ViT_huge_patch32_384.yaml
-null:null
-##
-===========================infer_params==========================
--o Global.save_inference_dir:./inference
--o Global.pretrained_model:
-norm_export:tools/export_model.py -c ppcls/configs/ImageNet/VisionTransformer/ViT_huge_patch32_384.yaml
-quant_export:null
-fpgm_export:null
-distill_export:null
-kl_quant:null
-export2:null
-pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_huge_patch32_384_pretrained.pdparams
-infer_model:../inference/
-infer_export:True
-infer_quant:Fasle
-inference:python/predict_cls.py -c configs/inference_cls.yaml -o PreProcess.transform_ops.0.ResizeImage.resize_short=384 -o PreProcess.transform_ops.1.CropImage.size=384
--o Global.use_gpu:True|False
--o Global.enable_mkldnn:True|False
--o Global.cpu_num_threads:1|6
--o Global.batch_size:1|16
--o Global.use_tensorrt:True|False
--o Global.use_fp16:True|False
--o Global.inference_model_dir:../inference
--o Global.infer_imgs:../dataset/ILSVRC2012/val
--o Global.save_log_path:null
--o Global.benchmark:True
-null:null
-null:null
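
The hunks above drop the two ViT_huge factory functions, their MODEL_URLS entries, their training configs, and their TIPC test files, while leaving the smaller ViT variants untouched. A minimal sketch follows (illustrative only, not part of this commit): the surviving factories are used the same way the removed ViT_huge ones were, and the 384 input size and 1000-class output are taken from the deleted config files; the dummy-batch call is an assumption for demonstration.

import paddle
# Factory kept by this commit; the removed ViT_huge_* factories followed the same pattern.
from ppcls.arch.backbone.model_zoo.vision_transformer import ViT_large_patch32_384

# pretrained=False keeps random weights; pretrained=True would fetch the weights from MODEL_URLS.
model = ViT_large_patch32_384(pretrained=False)
x = paddle.randn([1, 3, 384, 384])  # dummy NCHW batch at this variant's 384x384 resolution (illustrative)
logits = model(x)                   # should come out as [1, 1000]: ImageNet-1k class scores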