add MobileViTv3
parent df31d808fc
commit dc4fdba0ab
@@ -78,6 +78,7 @@ from .model_zoo.cae import cae_base_patch16_224, cae_large_patch16_224
 from .model_zoo.cvt import CvT_13_224, CvT_13_384, CvT_21_224, CvT_21_384, CvT_W24_384
 from .model_zoo.micronet import MicroNet_M0, MicroNet_M1, MicroNet_M2, MicroNet_M3
 from .model_zoo.mobilenext import MobileNeXt_x0_35, MobileNeXt_x0_5, MobileNeXt_x0_75, MobileNeXt_x1_0, MobileNeXt_x1_4
+from .model_zoo.mobilevit_v3 import MobileViTv3_XXS, MobileViTv3_XS, MobileViTv3_S, MobileViTv3_x0_5, MobileViTv3_x0_75, MobileViTv3_x1_0

 from .variant_models.resnet_variant import ResNet50_last_stage_stride1
 from .variant_models.resnet_variant import ResNet50_adaptive_max_pool2d
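For orientation, a minimal usage sketch (not part of this diff) of the constructors exported above. It assumes the MobileViTv3 constructors accept class_num and dropout keyword arguments, mirroring the Arch sections of the configs below.

# Hypothetical usage sketch: instantiate one of the newly exported variants and
# run a forward pass on an input matching Global.image_shape ([3, 256, 256]).
# The class_num/dropout kwargs are assumptions taken from the Arch config section.
import paddle
from ppcls.arch.model_zoo.mobilevit_v3 import MobileViTv3_S

model = MobileViTv3_S(class_num=1000, dropout=0.1)
model.eval()
x = paddle.randn([1, 3, 256, 256])
with paddle.no_grad():
    logits = model(x)
print(logits.shape)  # expected: [1, 1000]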
File diff suppressed because it is too large
@@ -0,0 +1,152 @@
# global configs
Global:
  checkpoints: null
  pretrained_model: null
  output_dir: ./output/
  device: gpu
  save_interval: 1
  eval_during_train: True
  eval_interval: 1
  epochs: 300
  print_batch_step: 10
  use_visualdl: False
  # used for static mode and model export
  image_shape: [3, 256, 256]
  save_inference_dir: ./inference
  use_dali: False

# mixed precision training
AMP:
  scale_loss: 65536
  use_dynamic_loss_scaling: True
  # O1: mixed fp16
  level: O1

# model ema
EMA:
  decay: 0.9995

# model architecture
Arch:
  name: MobileViTv3_S
  class_num: 1000
  dropout: 0.1

# loss function config for training/eval process
Loss:
  Train:
    - CELoss:
        weight: 1.0
        epsilon: 0.1
  Eval:
    - CELoss:
        weight: 1.0

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.999
  epsilon: 1e-8
  weight_decay: 0.01
  lr:
    # for 8 cards
    name: Cosine
    learning_rate: 0.002
    eta_min: 0.0002
    warmup_epoch: 1 # 3000 iterations
    warmup_start_lr: 0.0002
    # by_epoch: True

# data loader for train and eval
DataLoader:
  Train:
    dataset:
      name: MultiScaleDataset
      image_root: ./dataset/ILSVRC2012/
      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
      transform_ops:
        - DecodeImage:
            to_rgb: True
            channel_first: False
        - RandCropImage:
            size: 256
            interpolation: bilinear
            use_log_aspect: True
        - RandFlipImage:
            flip_code: 1
        - NormalizeImage:
            scale: 1.0/255.0
            mean: [0.0, 0.0, 0.0]
            std: [1.0, 1.0, 1.0]
            order: ''
    # width and height can also be specified separately:
    # scales: [(256, 256), (160, 160), (192, 192), (224, 224), (288, 288), (320, 320)]
    sampler:
      name: MultiScaleSampler
      scales: [256, 160, 192, 224, 288, 320]
      # first_bs: batch size for the first image resolution in the scales list
      # divided_factor: ensures the width and height dimensions can be divided by the downsampling multiple
      first_bs: 48
      divided_factor: 32
      is_training: True
    loader:
      num_workers: 4
      use_shared_memory: True
  Eval:
    dataset:
      name: ImageNetDataset
      image_root: ./dataset/ILSVRC2012/
      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
      transform_ops:
        - DecodeImage:
            to_rgb: False
            channel_first: False
        - ResizeImage:
            resize_short: 288
            interpolation: bilinear
        - CropImage:
            size: 256
        - NormalizeImage:
            scale: 1.0/255.0
            mean: [0.0, 0.0, 0.0]
            std: [1.0, 1.0, 1.0]
            order: ''
    sampler:
      name: DistributedBatchSampler
      batch_size: 48
      drop_last: False
      shuffle: False
    loader:
      num_workers: 4
      use_shared_memory: True

Infer:
  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
  batch_size: 10
  transforms:
    - DecodeImage:
        to_rgb: True
        channel_first: False
    - ResizeImage:
        resize_short: 288
        interpolation: bilinear
    - CropImage:
        size: 256
    - NormalizeImage:
        scale: 1.0/255.0
        mean: [0.0, 0.0, 0.0]
        std: [1.0, 1.0, 1.0]
        order: ''
    - ToCHWImage:
  PostProcess:
    name: Topk
    topk: 5
    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt

Metric:
  Train:
    - TopkAcc:
        topk: [1, 5]
  Eval:
    - TopkAcc:
        topk: [1, 5]
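To illustrate how a config like the one above is typically consumed, here is a hedged sketch that parses the YAML with PyYAML and turns the Arch section into constructor keyword arguments; the file name "MobileViTv3_S.yaml" is a placeholder and the Arch-to-kwargs mapping is an assumption, not part of this commit.

# Illustrative sketch only: read a config like the one above and build the model
# named in its Arch section, passing the remaining Arch keys as kwargs.
import yaml
from ppcls.arch.model_zoo import mobilevit_v3

with open("MobileViTv3_S.yaml") as f:  # placeholder path
    cfg = yaml.safe_load(f)

arch_cfg = dict(cfg["Arch"])          # e.g. {"name": "MobileViTv3_S", "class_num": 1000, "dropout": 0.1}
name = arch_cfg.pop("name")
model = getattr(mobilevit_v3, name)(**arch_cfg)
print(type(model).__name__, cfg["Global"]["image_shape"])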
@@ -0,0 +1,152 @@
# global configs
Global:
  checkpoints: null
  pretrained_model: null
  output_dir: ./output/
  device: gpu
  save_interval: 1
  eval_during_train: True
  eval_interval: 1
  epochs: 300
  print_batch_step: 10
  use_visualdl: False
  # used for static mode and model export
  image_shape: [3, 256, 256]
  save_inference_dir: ./inference
  use_dali: False

# mixed precision training
AMP:
  scale_loss: 65536
  use_dynamic_loss_scaling: True
  # O1: mixed fp16
  level: O1

# model ema
EMA:
  decay: 0.9995

# model architecture
Arch:
  name: MobileViTv3_XS
  class_num: 1000
  dropout: 0.1

# loss function config for training/eval process
Loss:
  Train:
    - CELoss:
        weight: 1.0
        epsilon: 0.1
  Eval:
    - CELoss:
        weight: 1.0

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.999
  epsilon: 1e-8
  weight_decay: 0.01
  lr:
    # for 8 cards
    name: Cosine
    learning_rate: 0.002
    eta_min: 0.0002
    warmup_epoch: 1 # 3000 iterations
    warmup_start_lr: 0.0002
    # by_epoch: True

# data loader for train and eval
DataLoader:
  Train:
    dataset:
      name: MultiScaleDataset
      image_root: ./dataset/ILSVRC2012/
      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
      transform_ops:
        - DecodeImage:
            to_rgb: True
            channel_first: False
        - RandCropImage:
            size: 256
            interpolation: bilinear
            use_log_aspect: True
        - RandFlipImage:
            flip_code: 1
        - NormalizeImage:
            scale: 1.0/255.0
            mean: [0.0, 0.0, 0.0]
            std: [1.0, 1.0, 1.0]
            order: ''
    # width and height can also be specified separately:
    # scales: [(256, 256), (160, 160), (192, 192), (224, 224), (288, 288), (320, 320)]
    sampler:
      name: MultiScaleSampler
      scales: [256, 160, 192, 224, 288, 320]
      # first_bs: batch size for the first image resolution in the scales list
      # divided_factor: ensures the width and height dimensions can be divided by the downsampling multiple
      first_bs: 48
      divided_factor: 32
      is_training: True
    loader:
      num_workers: 4
      use_shared_memory: True
  Eval:
    dataset:
      name: ImageNetDataset
      image_root: ./dataset/ILSVRC2012/
      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
      transform_ops:
        - DecodeImage:
            to_rgb: False
            channel_first: False
        - ResizeImage:
            resize_short: 288
            interpolation: bilinear
        - CropImage:
            size: 256
        - NormalizeImage:
            scale: 1.0/255.0
            mean: [0.0, 0.0, 0.0]
            std: [1.0, 1.0, 1.0]
            order: ''
    sampler:
      name: DistributedBatchSampler
      batch_size: 48
      drop_last: False
      shuffle: False
    loader:
      num_workers: 4
      use_shared_memory: True

Infer:
  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
  batch_size: 10
  transforms:
    - DecodeImage:
        to_rgb: True
        channel_first: False
    - ResizeImage:
        resize_short: 288
        interpolation: bilinear
    - CropImage:
        size: 256
    - NormalizeImage:
        scale: 1.0/255.0
        mean: [0.0, 0.0, 0.0]
        std: [1.0, 1.0, 1.0]
        order: ''
    - ToCHWImage:
  PostProcess:
    name: Topk
    topk: 5
    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt

Metric:
  Train:
    - TopkAcc:
        topk: [1, 5]
  Eval:
    - TopkAcc:
        topk: [1, 5]
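Note that the NormalizeImage settings in these configs (scale 1.0/255.0 with zero mean and unit std) only rescale pixels to [0, 1], with no ImageNet channel statistics applied. The sketch below reproduces that arithmetic in NumPy as a sanity check; it is not the PaddleClas implementation.

# Sanity-check sketch of the NormalizeImage arithmetic used above:
# out = (img * scale - mean) / std, with scale = 1/255, mean = 0, std = 1,
# i.e. pixels are simply rescaled to [0, 1].
import numpy as np

img = np.random.randint(0, 256, size=(256, 256, 3)).astype("float32")  # HWC, uint8 value range
scale, mean, std = 1.0 / 255.0, np.zeros(3, "float32"), np.ones(3, "float32")
out = (img * scale - mean) / std
assert 0.0 <= out.min() and out.max() <= 1.0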
@@ -0,0 +1,152 @@
# global configs
Global:
  checkpoints: null
  pretrained_model: null
  output_dir: ./output/
  device: gpu
  save_interval: 1
  eval_during_train: True
  eval_interval: 1
  epochs: 300
  print_batch_step: 10
  use_visualdl: False
  # used for static mode and model export
  image_shape: [3, 256, 256]
  save_inference_dir: ./inference
  use_dali: False

# mixed precision training
AMP:
  scale_loss: 65536
  use_dynamic_loss_scaling: True
  # O1: mixed fp16
  level: O1

# model ema
EMA:
  decay: 0.9995

# model architecture
Arch:
  name: MobileViTv3_XXS
  class_num: 1000
  dropout: 0.05

# loss function config for training/eval process
Loss:
  Train:
    - CELoss:
        weight: 1.0
        epsilon: 0.1
  Eval:
    - CELoss:
        weight: 1.0

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.999
  epsilon: 1e-8
  weight_decay: 0.01
  lr:
    # for 8 cards
    name: Cosine
    learning_rate: 0.002
    eta_min: 0.0002
    warmup_epoch: 1 # 3000 iterations
    warmup_start_lr: 0.0002
    # by_epoch: True

# data loader for train and eval
DataLoader:
  Train:
    dataset:
      name: MultiScaleDataset
      image_root: ./dataset/ILSVRC2012/
      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
      transform_ops:
        - DecodeImage:
            to_rgb: True
            channel_first: False
        - RandCropImage:
            size: 256
            interpolation: bilinear
            use_log_aspect: True
        - RandFlipImage:
            flip_code: 1
        - NormalizeImage:
            scale: 1.0/255.0
            mean: [0.0, 0.0, 0.0]
            std: [1.0, 1.0, 1.0]
            order: ''
    # width and height can also be specified separately:
    # scales: [(256, 256), (160, 160), (192, 192), (224, 224), (288, 288), (320, 320)]
    sampler:
      name: MultiScaleSampler
      scales: [256, 160, 192, 224, 288, 320]
      # first_bs: batch size for the first image resolution in the scales list
      # divided_factor: ensures the width and height dimensions can be divided by the downsampling multiple
      first_bs: 48
      divided_factor: 32
      is_training: True
    loader:
      num_workers: 4
      use_shared_memory: True
  Eval:
    dataset:
      name: ImageNetDataset
      image_root: ./dataset/ILSVRC2012/
      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
      transform_ops:
        - DecodeImage:
            to_rgb: False
            channel_first: False
        - ResizeImage:
            resize_short: 288
            interpolation: bilinear
        - CropImage:
            size: 256
        - NormalizeImage:
            scale: 1.0/255.0
            mean: [0.0, 0.0, 0.0]
            std: [1.0, 1.0, 1.0]
            order: ''
    sampler:
      name: DistributedBatchSampler
      batch_size: 48
      drop_last: False
      shuffle: False
    loader:
      num_workers: 4
      use_shared_memory: True

Infer:
  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
  batch_size: 10
  transforms:
    - DecodeImage:
        to_rgb: True
        channel_first: False
    - ResizeImage:
        resize_short: 288
        interpolation: bilinear
    - CropImage:
        size: 256
    - NormalizeImage:
        scale: 1.0/255.0
        mean: [0.0, 0.0, 0.0]
        std: [1.0, 1.0, 1.0]
        order: ''
    - ToCHWImage:
  PostProcess:
    name: Topk
    topk: 5
    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt

Metric:
  Train:
    - TopkAcc:
        topk: [1, 5]
  Eval:
    - TopkAcc:
        topk: [1, 5]
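All four configs share the same Optimizer.lr section: linear warmup from warmup_start_lr to learning_rate over one epoch, then cosine decay to eta_min over the remaining epochs. The sketch below is an epoch-level approximation of that curve for illustration only; PaddleClas applies the schedule per iteration.

# Epoch-level approximation of the lr schedule defined above. Not the
# PaddleClas implementation, just the shape of warmup + cosine decay.
import math

def lr_at_epoch(epoch, epochs=300, base_lr=0.002, eta_min=0.0002,
                warmup_epoch=1, warmup_start_lr=0.0002):
    if epoch < warmup_epoch:
        # linear warmup
        return warmup_start_lr + (base_lr - warmup_start_lr) * epoch / warmup_epoch
    # cosine decay from base_lr down to eta_min
    progress = (epoch - warmup_epoch) / max(epochs - warmup_epoch, 1)
    return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * progress))

for e in (0, 1, 150, 299):
    print(e, round(lr_at_epoch(e), 6))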
@@ -0,0 +1,152 @@
# global configs
Global:
  checkpoints: null
  pretrained_model: null
  output_dir: ./output/
  device: gpu
  save_interval: 1
  eval_during_train: True
  eval_interval: 1
  epochs: 300
  print_batch_step: 10
  use_visualdl: False
  # used for static mode and model export
  image_shape: [3, 256, 256]
  save_inference_dir: ./inference
  use_dali: False

# mixed precision training
AMP:
  scale_loss: 65536
  use_dynamic_loss_scaling: True
  # O1: mixed fp16
  level: O1

# model ema
EMA:
  decay: 0.9995

# model architecture
Arch:
  name: MobileViTv3_x0_5
  class_num: 1000
  classifier_dropout: 0.

# loss function config for training/eval process
Loss:
  Train:
    - CELoss:
        weight: 1.0
        epsilon: 0.1
  Eval:
    - CELoss:
        weight: 1.0

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.999
  epsilon: 1e-8
  weight_decay: 0.01
  lr:
    # for 8 cards
    name: Cosine
    learning_rate: 0.002
    eta_min: 0.0002
    warmup_epoch: 1 # 3000 iterations
    warmup_start_lr: 0.0002
    # by_epoch: True

# data loader for train and eval
DataLoader:
  Train:
    dataset:
      name: MultiScaleDataset
      image_root: ./dataset/ILSVRC2012/
      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
      transform_ops:
        - DecodeImage:
            to_rgb: True
            channel_first: False
        - RandCropImage:
            size: 256
            interpolation: bilinear
            use_log_aspect: True
        - RandFlipImage:
            flip_code: 1
        - NormalizeImage:
            scale: 1.0/255.0
            mean: [0.0, 0.0, 0.0]
            std: [1.0, 1.0, 1.0]
            order: ''
    # width and height can also be specified separately:
    # scales: [(256, 256), (160, 160), (192, 192), (224, 224), (288, 288), (320, 320)]
    sampler:
      name: MultiScaleSampler
      scales: [256, 160, 192, 224, 288, 320]
      # first_bs: batch size for the first image resolution in the scales list
      # divided_factor: ensures the width and height dimensions can be divided by the downsampling multiple
      first_bs: 48
      divided_factor: 32
      is_training: True
    loader:
      num_workers: 4
      use_shared_memory: True
  Eval:
    dataset:
      name: ImageNetDataset
      image_root: ./dataset/ILSVRC2012/
      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
      transform_ops:
        - DecodeImage:
            to_rgb: False
            channel_first: False
        - ResizeImage:
            resize_short: 288
            interpolation: bilinear
        - CropImage:
            size: 256
        - NormalizeImage:
            scale: 1.0/255.0
            mean: [0.0, 0.0, 0.0]
            std: [1.0, 1.0, 1.0]
            order: ''
    sampler:
      name: DistributedBatchSampler
      batch_size: 48
      drop_last: False
      shuffle: False
    loader:
      num_workers: 4
      use_shared_memory: True

Infer:
  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
  batch_size: 10
  transforms:
    - DecodeImage:
        to_rgb: True
        channel_first: False
    - ResizeImage:
        resize_short: 288
        interpolation: bilinear
    - CropImage:
        size: 256
    - NormalizeImage:
        scale: 1.0/255.0
        mean: [0.0, 0.0, 0.0]
        std: [1.0, 1.0, 1.0]
        order: ''
    - ToCHWImage:
  PostProcess:
    name: Topk
    topk: 5
    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt

Metric:
  Train:
    - TopkAcc:
        topk: [1, 5]
  Eval:
    - TopkAcc:
        topk: [1, 5]
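As a final note on the MultiScaleSampler settings shared by these configs, divided_factor exists so that every sampled resolution stays divisible by the network's total downsampling factor (32 here). The sketch below only mirrors the intent stated in the config comment; it is an assumption about the rounding rule, not the PaddleClas sampler implementation.

# Hypothetical helper illustrating divided_factor: round a candidate scale down
# so height/width remain divisible by the downsampling multiple (32).
def make_divisible(size, divided_factor=32):
    return max(divided_factor, (size // divided_factor) * divided_factor)

scales = [256, 160, 192, 224, 288, 320]
print([make_divisible(s) for s in scales])  # already multiples of 32, unchanged
print(make_divisible(300))                  # an odd size would be rounded to 288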