_base_ = '../_base_/default_runtime.py'

# data settings
data_preprocessor = dict(
    type='MultiModalDataPreprocessor',
    # Standard OpenAI CLIP normalization constants, scaled from [0, 1] to [0, 255].
    mean=[0.48145466 * 255, 0.4578275 * 255, 0.40821073 * 255],
    std=[0.26862954 * 255, 0.26130258 * 255, 0.27577711 * 255],
    to_rgb=True,
)
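# Equivalently, each pixel value p in [0, 255] is normalized per channel as
# (p - mean_c) / std_c, e.g. roughly (p - 122.77) / 68.50 for the R channel.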

test_pipeline = [
    dict(type='LoadImageFromFile'),
    # Resize directly to the 224x224 input size of the ViT-B/16 vision backbone.
    dict(type='Resize', scale=(224, 224), interpolation='bicubic'),
    dict(
        type='PackInputs',
        algorithm_keys=['text'],
        meta_keys=['image_id', 'scale_factor'],
    ),
]
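# A minimal sketch of applying this pipeline to a single image outside the
# runner (assumes mmpretrain is installed; 'demo.jpg' is a placeholder path):
#
#   from mmengine.dataset import Compose
#   from mmpretrain.registry import TRANSFORMS
#   pipeline = Compose([TRANSFORMS.build(t) for t in test_pipeline])
#   sample = pipeline(dict(img_path='demo.jpg'))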

train_dataloader = None
test_dataloader = dict(
    batch_size=32,
    num_workers=8,
    dataset=dict(
        type='ImageNet',
        # Assumes the standard MMPretrain ImageNet layout; adjust to your setup.
        data_root='data/imagenet',
        split='val',
        pipeline=test_pipeline),
    sampler=dict(type='DefaultSampler', shuffle=False),
)
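# Zero-shot performance is reported as standard top-1 / top-5 accuracy, so the
# stock Accuracy metric below needs no CLIP-specific configuration.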
test_evaluator = dict(type='Accuracy', topk=(1, 5))

# schedule settings
train_cfg = None
val_cfg = None
test_cfg = dict()
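# Only the test loop is configured; train_cfg and val_cfg stay None because
# zero-shot classification is inference-only.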

# model settings
model = dict(
    type='CLIPZeroShot',
    vision_backbone=dict(
        type='VisionTransformer',
        arch='base',
        img_size=224,
        patch_size=16,
        drop_rate=0.,
        # CLIP's ViT variant uses QuickGELU activations and a pre-norm layout.
        layer_cfgs=dict(act_cfg=dict(type='QuickGELU')),
        pre_norm=True,
    ),
    # Project 768-dim ViT features into the 512-dim joint image-text space.
    projection=dict(type='CLIPProjection', in_channels=768, out_channels=512),
    text_backbone=dict(
        type='CLIPTransformer',
        width=512,
        layers=12,
        heads=8,
        attn_mask=True,  # causal mask, as in the original CLIP text encoder
    ),
    tokenizer=dict(
        type='AutoTokenizer',
        name_or_path='openai/clip-vit-base-patch16',
        use_fast=False),
    vocab_size=49408,
    transformer_width=512,
    proj_dim=512,
    # Class prototypes are text embeddings of the ImageNet class names.
    text_prototype='imagenet',
    text_prompt='openai_imagenet_sub',  # openai_imagenet, openai_imagenet_sub
    context_length=77,
)
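# A minimal usage sketch for running this config with MMPretrain's test tool
# (hypothetical paths; substitute this config's filename and a CLIP checkpoint
# converted to the MMPretrain format):
#
#   python tools/test.py configs/clip/<this_config>.py <clip_checkpoint>.pth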